// Source: core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

1use crate::arch::asm;
2use crate::core_arch::{simd::*, x86::*};
3use crate::intrinsics::{fmaf16, simd::*};
4use crate::ptr;
5
6/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
7///
8/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
9#[inline]
10#[target_feature(enable = "avx512fp16")]
11#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
13pub const fn _mm_set_ph(
14    e7: f16,
15    e6: f16,
16    e5: f16,
17    e4: f16,
18    e3: f16,
19    e2: f16,
20    e1: f16,
21    e0: f16,
22) -> __m128h {
23    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
24}
25
26/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
27///
28/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
29#[inline]
30#[target_feature(enable = "avx512fp16")]
31#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
32#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
33pub const fn _mm256_set_ph(
34    e15: f16,
35    e14: f16,
36    e13: f16,
37    e12: f16,
38    e11: f16,
39    e10: f16,
40    e9: f16,
41    e8: f16,
42    e7: f16,
43    e6: f16,
44    e5: f16,
45    e4: f16,
46    e3: f16,
47    e2: f16,
48    e1: f16,
49    e0: f16,
50) -> __m256h {
51    __m256h([
52        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
53    ])
54}
55
56/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
57///
58/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
59#[inline]
60#[target_feature(enable = "avx512fp16")]
61#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
62#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
63pub const fn _mm512_set_ph(
64    e31: f16,
65    e30: f16,
66    e29: f16,
67    e28: f16,
68    e27: f16,
69    e26: f16,
70    e25: f16,
71    e24: f16,
72    e23: f16,
73    e22: f16,
74    e21: f16,
75    e20: f16,
76    e19: f16,
77    e18: f16,
78    e17: f16,
79    e16: f16,
80    e15: f16,
81    e14: f16,
82    e13: f16,
83    e12: f16,
84    e11: f16,
85    e10: f16,
86    e9: f16,
87    e8: f16,
88    e7: f16,
89    e6: f16,
90    e5: f16,
91    e4: f16,
92    e3: f16,
93    e2: f16,
94    e1: f16,
95    e0: f16,
96) -> __m512h {
97    __m512h([
98        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
99        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
100    ])
101}
102
103/// Copy half-precision (16-bit) floating-point elements from a to the lower element of dst and zero
104/// the upper 7 elements.
105///
106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
107#[inline]
108#[target_feature(enable = "avx512fp16")]
109#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
110#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
111pub const fn _mm_set_sh(a: f16) -> __m128h {
112    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
113}
114
115/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
116///
117/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
118#[inline]
119#[target_feature(enable = "avx512fp16")]
120#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
121#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
122pub const fn _mm_set1_ph(a: f16) -> __m128h {
123    unsafe { transmute(f16x8::splat(a)) }
124}
125
126/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
127///
128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
129#[inline]
130#[target_feature(enable = "avx512fp16")]
131#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
132#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
133pub const fn _mm256_set1_ph(a: f16) -> __m256h {
134    unsafe { transmute(f16x16::splat(a)) }
135}
136
137/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
138///
139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
140#[inline]
141#[target_feature(enable = "avx512fp16")]
142#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
143#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
144pub const fn _mm512_set1_ph(a: f16) -> __m512h {
145    unsafe { transmute(f16x32::splat(a)) }
146}
147
148/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
149///
150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
151#[inline]
152#[target_feature(enable = "avx512fp16")]
153#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
154#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
155pub const fn _mm_setr_ph(
156    e0: f16,
157    e1: f16,
158    e2: f16,
159    e3: f16,
160    e4: f16,
161    e5: f16,
162    e6: f16,
163    e7: f16,
164) -> __m128h {
165    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
166}
167
168/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
169///
170/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
171#[inline]
172#[target_feature(enable = "avx512fp16")]
173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
174#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
175pub const fn _mm256_setr_ph(
176    e0: f16,
177    e1: f16,
178    e2: f16,
179    e3: f16,
180    e4: f16,
181    e5: f16,
182    e6: f16,
183    e7: f16,
184    e8: f16,
185    e9: f16,
186    e10: f16,
187    e11: f16,
188    e12: f16,
189    e13: f16,
190    e14: f16,
191    e15: f16,
192) -> __m256h {
193    __m256h([
194        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
195    ])
196}
197
198/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
199///
200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
201#[inline]
202#[target_feature(enable = "avx512fp16")]
203#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
204#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
205pub const fn _mm512_setr_ph(
206    e0: f16,
207    e1: f16,
208    e2: f16,
209    e3: f16,
210    e4: f16,
211    e5: f16,
212    e6: f16,
213    e7: f16,
214    e8: f16,
215    e9: f16,
216    e10: f16,
217    e11: f16,
218    e12: f16,
219    e13: f16,
220    e14: f16,
221    e15: f16,
222    e16: f16,
223    e17: f16,
224    e18: f16,
225    e19: f16,
226    e20: f16,
227    e21: f16,
228    e22: f16,
229    e23: f16,
230    e24: f16,
231    e25: f16,
232    e26: f16,
233    e27: f16,
234    e28: f16,
235    e29: f16,
236    e30: f16,
237    e31: f16,
238) -> __m512h {
239    __m512h([
240        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
241        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
242    ])
243}
244
245/// Return vector of type __m128h with all elements set to zero.
246///
247/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
248#[inline]
249#[target_feature(enable = "avx512fp16,avx512vl")]
250#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
251#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
252pub const fn _mm_setzero_ph() -> __m128h {
253    unsafe { transmute(f16x8::ZERO) }
254}
255
/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_ph() -> __m256h {
    // All-zero constant reinterpreted as the architectural vector type.
    f16x16::ZERO.as_m256h()
}
266
/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_setzero_ph() -> __m512h {
    // All-zero constant reinterpreted as the architectural vector type.
    f16x32::ZERO.as_m512h()
}
277
/// Return vector of type `__m128h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_undefined_ph() -> __m128h {
    // "Undefined" is implemented as all-zeros: a valid value, cheap to produce.
    f16x8::ZERO.as_m128h()
}
291
/// Return vector of type `__m256h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_ph() -> __m256h {
    // "Undefined" is implemented as all-zeros: a valid value, cheap to produce.
    f16x16::ZERO.as_m256h()
}
305
/// Return vector of type `__m512h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_undefined_ph() -> __m512h {
    // "Undefined" is implemented as all-zeros: a valid value, cheap to produce.
    f16x32::ZERO.as_m512h()
}
319
/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castpd_ph(a: __m128d) -> __m128h {
    // SAFETY: both types are 128-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
331
/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    // SAFETY: both types are 256-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
343
/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    // SAFETY: both types are 512-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
355
/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castph_pd(a: __m128h) -> __m128d {
    // SAFETY: both types are 128-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
367
/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph_pd(a: __m256h) -> __m256d {
    // SAFETY: both types are 256-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
379
/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph_pd(a: __m512h) -> __m512d {
    // SAFETY: both types are 512-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
391
/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castps_ph(a: __m128) -> __m128h {
    // SAFETY: both types are 128-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
403
/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps_ph(a: __m256) -> __m256h {
    // SAFETY: both types are 256-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
415
/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castps_ph(a: __m512) -> __m512h {
    // SAFETY: both types are 512-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
427
/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castph_ps(a: __m128h) -> __m128 {
    // SAFETY: both types are 128-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
439
/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph_ps(a: __m256h) -> __m256 {
    // SAFETY: both types are 256-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
451
/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph_ps(a: __m512h) -> __m512 {
    // SAFETY: both types are 512-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
463
/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    // SAFETY: both types are 128-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
475
/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    // SAFETY: both types are 256-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
487
/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    // SAFETY: both types are 512-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
499
/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castph_si128(a: __m128h) -> __m128i {
    // SAFETY: both types are 128-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
511
/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph_si256(a: __m256h) -> __m256i {
    // SAFETY: both types are 256-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
523
/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph_si512(a: __m512h) -> __m512i {
    // SAFETY: both types are 512-bit vectors; bit-for-bit reinterpretation is valid.
    unsafe { transmute(a) }
}
535
/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    // Keep only the lower 8 of the 16 elements; the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
547
/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    // Keep only the lower 8 of the 32 elements; the upper portion is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
559
/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    // Keep only the lower 16 of the 32 elements; the upper half is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
}
571
/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        // Indices 0-7 take `a`'s elements; index 8 selects element 0 of the
        // second operand (`_mm_undefined_ph()`, all-zeros in practice) to fill
        // the upper 8 lanes.
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}
590
/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        // Indices 0-7 take `a`'s elements; index 8 selects element 0 of the
        // second operand (`_mm_undefined_ph()`, all-zeros in practice) to fill
        // the upper 24 lanes.
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}
612
/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        // Indices 0-15 take `a`'s elements; index 16 selects element 0 of the
        // second operand (`_mm256_undefined_ph()`, all-zeros in practice) to
        // fill the upper 16 lanes.
        simd_shuffle!(
            a,
            _mm256_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}
634
/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        // Indices 0-7 take `a`'s elements; index 8 selects element 0 of the
        // zero vector, guaranteeing the upper 8 lanes are zero.
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}
653
/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        // Indices 0-15 take `a`'s elements; index 16 selects element 0 of the
        // zero vector, guaranteeing the upper 16 lanes are zero.
        simd_shuffle!(
            a,
            _mm256_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}
675
/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        // Indices 0-7 take `a`'s elements; index 8 selects element 0 of the
        // zero vector, guaranteeing the upper 24 lanes are zero.
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}
697
// Expands to an inline-asm `vcmpph` comparison between `$a` and `$b`, returning
// the result as `$mask_type`. The comparison predicate is taken from the const
// generic `IMM5` in scope at the expansion site. The 5-argument arm additionally
// routes `$mask` into the instruction's mask-register slot (callers document
// this as zeromask behavior).
macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            // pure + nomem: the result depends only on the register inputs.
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            // pure + nomem: the result depends only on the register inputs.
            options(pure, nomem, nostack)
        );
        dst
    }};
}
725
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        // The comparison predicate occupies only the low 5 bits of imm8.
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, xmm_reg, a, b)
    }
}
740
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        // The comparison predicate occupies only the low 5 bits of imm8.
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
    }
}
756
757/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
758/// operand specified by imm8, and store the results in mask vector k.
759///
760/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
761#[inline]
762#[target_feature(enable = "avx512fp16,avx512vl")]
763#[rustc_legacy_const_generics(2)]
764#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects one of the 32 vcmpph predicates.
        static_assert_uimm_bits!(IMM5, 5);
        // Unmasked arm of `cmp_asm!`; `ymm_reg` selects 256-bit operands (16 lanes).
        cmp_asm!(__mmask16, ymm_reg, a, b)
    }
}
771
772/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
773/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
774/// zeroed out when the corresponding mask bit is not set).
775///
776/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
777#[inline]
778#[target_feature(enable = "avx512fp16,avx512vl")]
779#[rustc_legacy_const_generics(3)]
780#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects one of the 32 vcmpph predicates.
        static_assert_uimm_bits!(IMM5, 5);
        // Masked arm of `cmp_asm!`: result bits are zeroed where `k1` is not set.
        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
    }
}
791
792/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
793/// operand specified by imm8, and store the results in mask vector k.
794///
795/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
796#[inline]
797#[target_feature(enable = "avx512fp16")]
798#[rustc_legacy_const_generics(2)]
799#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects one of the 32 vcmpph predicates.
        static_assert_uimm_bits!(IMM5, 5);
        // Unmasked arm of `cmp_asm!`; `zmm_reg` selects 512-bit operands (32 lanes).
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}
806
807/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
808/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
809/// zeroed out when the corresponding mask bit is not set).
810///
811/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
812#[inline]
813#[target_feature(enable = "avx512fp16")]
814#[rustc_legacy_const_generics(3)]
815#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects one of the 32 vcmpph predicates.
        static_assert_uimm_bits!(IMM5, 5);
        // Masked arm of `cmp_asm!`: result bits are zeroed where `k1` is not set.
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}
826
827/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
828/// operand specified by imm8, and store the results in mask vector k.
829///
830/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
831///
832/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
833#[inline]
834#[target_feature(enable = "avx512fp16")]
835#[rustc_legacy_const_generics(2, 3)]
836#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects one of the 32 vcmpph predicates.
        static_assert_uimm_bits!(IMM5, 5);
        // SAE is validated to be a supported exception-suppression constant.
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            // Suppress-all-exceptions form: the `{sae}` modifier must be encoded
            // explicitly in the template (`{{sae}}` is the escaped literal).
            let dst: __mmask32;
            asm!(
                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            // Default exception behavior: reuse the shared unmasked helper.
            cmp_asm!(__mmask32, zmm_reg, a, b)
        }
    }
}
860
861/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
862/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
863/// zeroed out when the corresponding mask bit is not set).
864///
865/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
866///
867/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
868#[inline]
869#[target_feature(enable = "avx512fp16")]
870#[rustc_legacy_const_generics(3, 4)]
871#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects one of the 32 vcmpph predicates.
        static_assert_uimm_bits!(IMM5, 5);
        // SAE is validated to be a supported exception-suppression constant.
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            // Suppress-all-exceptions form with zeromask `k1`: result bits are
            // zeroed where `k1` is not set.
            let dst: __mmask32;
            asm!(
                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                k1 = in(kreg) k1,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            // Default exception behavior: reuse the shared masked helper.
            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
        }
    }
}
897
898/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
899/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
900/// passing _MM_FROUND_NO_EXC in the sae parameter.
901///
902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
903#[inline]
904#[target_feature(enable = "avx512fp16")]
905#[rustc_legacy_const_generics(2, 3)]
906#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    // An all-ones mask makes the masked variant behave as unmasked; only bit 0
    // of the result is meaningful for a scalar compare.
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}
912
913/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
914/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
915/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
916///
917/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
918#[inline]
919#[target_feature(enable = "avx512fp16")]
920#[rustc_legacy_const_generics(3, 4)]
921#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects one of the 32 scalar compare predicates.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        // Lower to the `vcmpsh` intrinsic; `k1` zero-masks the single result bit.
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}
933
934/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
935/// operand specified by imm8, and store the result in mask vector k.
936///
937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
938#[inline]
939#[target_feature(enable = "avx512fp16")]
940#[rustc_legacy_const_generics(2)]
941#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    // Non-rounding form: delegate with the current (MXCSR) exception behavior.
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
946
947/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
948/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
949///
950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
951#[inline]
952#[target_feature(enable = "avx512fp16")]
953#[rustc_legacy_const_generics(3)]
954#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    // Non-rounding form: delegate with the current (MXCSR) exception behavior.
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}
959
960/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
961/// operand specified by imm8, and return the boolean result (0 or 1).
962/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
963///
964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
965#[inline]
966#[target_feature(enable = "avx512fp16")]
967#[rustc_legacy_const_generics(2, 3)]
968#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        // IMM5 must fit in 5 bits: it selects the comparison predicate.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        // Lower to the `vcomish` intrinsic, which yields the boolean result (0 or 1).
        vcomish(a, b, IMM5, SAE)
    }
}
976
977/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
978/// operand specified by imm8, and return the boolean result (0 or 1).
979///
980/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
981#[inline]
982#[target_feature(enable = "avx512fp16")]
983#[rustc_legacy_const_generics(2)]
984#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    // Non-rounding form: delegate with the current (MXCSR) exception behavior.
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
989
990/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
991/// the boolean result (0 or 1).
992///
993/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
994#[inline]
995#[target_feature(enable = "avx512fp16")]
996#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, signaling equality predicate (the `comi` family signals on QNaN;
    // contrast with the quiet `_mm_ucomieq_sh`).
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}
1000
1001/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
1002/// and return the boolean result (0 or 1).
1003///
1004/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
1005#[inline]
1006#[target_feature(enable = "avx512fp16")]
1007#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, signaling greater-than-or-equal predicate.
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}
1011
1012/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
1013/// the boolean result (0 or 1).
1014///
1015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
1016#[inline]
1017#[target_feature(enable = "avx512fp16")]
1018#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, signaling greater-than predicate.
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}
1022
1023/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
1024/// return the boolean result (0 or 1).
1025///
1026/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
1027#[inline]
1028#[target_feature(enable = "avx512fp16")]
1029#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, signaling less-than-or-equal predicate.
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}
1033
1034/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
1035/// the boolean result (0 or 1).
1036///
1037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
1038#[inline]
1039#[target_feature(enable = "avx512fp16")]
1040#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, signaling less-than predicate.
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}
1044
1045/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
1046/// the boolean result (0 or 1).
1047///
1048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
1049#[inline]
1050#[target_feature(enable = "avx512fp16")]
1051#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    // Unordered, signaling not-equal predicate (NaN operands compare not-equal).
    _mm_comi_sh::<_CMP_NEQ_US>(a, b)
}
1055
1056/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
1057/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
1058///
1059/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
1060#[inline]
1061#[target_feature(enable = "avx512fp16")]
1062#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, quiet equality predicate — per the docs above, the `ucomi` family
    // does not signal an exception for QNaN operands.
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}
1066
1067/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
1068/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
1069///
1070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
1071#[inline]
1072#[target_feature(enable = "avx512fp16")]
1073#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, quiet greater-than-or-equal predicate (no signal on QNaN).
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}
1077
1078/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
1079/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
1080///
1081/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
1082#[inline]
1083#[target_feature(enable = "avx512fp16")]
1084#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, quiet greater-than predicate (no signal on QNaN).
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}
1088
1089/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
1090/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
1091///
1092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
1093#[inline]
1094#[target_feature(enable = "avx512fp16")]
1095#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, quiet less-than-or-equal predicate (no signal on QNaN).
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}
1099
1100/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
1101/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
1102///
1103/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
1104#[inline]
1105#[target_feature(enable = "avx512fp16")]
1106#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    // Ordered, quiet less-than predicate (no signal on QNaN).
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}
1110
1111/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
1112/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
1113///
1114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
1115#[inline]
1116#[target_feature(enable = "avx512fp16")]
1117#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    // Unordered, quiet not-equal predicate (NaN operands compare not-equal; no signal).
    _mm_comi_sh::<_CMP_NEQ_UQ>(a, b)
}
1121
1122/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
1123/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
1124///
1125/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
1126#[inline]
1127#[target_feature(enable = "avx512fp16,avx512vl")]
1128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1129#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
    // Plain (aligned) read — caller must uphold validity and the 16-byte
    // alignment requirement stated in the docs above.
    *mem_addr.cast()
}
1133
1134/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
1135/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
1136///
1137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
1138#[inline]
1139#[target_feature(enable = "avx512fp16,avx512vl")]
1140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1141#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
    // Plain (aligned) read — caller must uphold validity and the 32-byte
    // alignment requirement stated in the docs above.
    *mem_addr.cast()
}
1145
1146/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
1147/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
1148///
1149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
1150#[inline]
1151#[target_feature(enable = "avx512fp16")]
1152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1153#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
    // Plain (aligned) read — caller must uphold validity and the 64-byte
    // alignment requirement stated in the docs above.
    *mem_addr.cast()
}
1157
1158/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
1159/// and zero the upper elements
1160///
1161/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
1162#[inline]
1163#[target_feature(enable = "avx512fp16")]
1164#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1165#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    // Read a single f16 and place it in lane 0; `_mm_set_sh` zeroes the upper 7 lanes.
    _mm_set_sh(*mem_addr)
}
1169
1170/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
1171/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
1172///
1173/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
1174#[inline]
1175#[target_feature(enable = "avx512fp16")]
1176#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    // Seed `dst` with `src`: the merge-masked `vmovsh` leaves lane 0 untouched when
    // mask bit 0 is clear, so the register must already hold the `src` value.
    let mut dst = src;
    asm!(
        // NOTE(review): `vpl!` is defined elsewhere in this module; it presumably
        // appends the `[{p}]` memory operand to the template — confirm at its definition.
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}
1188
1189/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
1190/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
1191///
1192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
1193#[inline]
1194#[target_feature(enable = "avx512fp16")]
1195#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst: __m128h;
    asm!(
        // `{{z}}` selects zero-masking, so lane 0 is zeroed when mask bit 0 is clear.
        // NOTE(review): `vpl!` is defined elsewhere in this module; it presumably
        // appends the `[{p}]` memory operand to the template — confirm at its definition.
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}
1207
1208/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
1209/// a new vector. The address does not need to be aligned to any particular boundary.
1210///
1211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
1212#[inline]
1213#[target_feature(enable = "avx512fp16,avx512vl")]
1214#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1215#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
    // `read_unaligned` places no alignment requirement on `mem_addr`.
    ptr::read_unaligned(mem_addr.cast())
}
1219
1220/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
1221/// a new vector. The address does not need to be aligned to any particular boundary.
1222///
1223/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
1224#[inline]
1225#[target_feature(enable = "avx512fp16,avx512vl")]
1226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1227#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
    // `read_unaligned` places no alignment requirement on `mem_addr`.
    ptr::read_unaligned(mem_addr.cast())
}
1231
1232/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
1233/// a new vector. The address does not need to be aligned to any particular boundary.
1234///
1235/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
1236#[inline]
1237#[target_feature(enable = "avx512fp16")]
1238#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1239#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
    // `read_unaligned` places no alignment requirement on `mem_addr`.
    ptr::read_unaligned(mem_addr.cast())
}
1243
1244/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1245/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
1246/// 7 packed elements from a to the upper elements of dst.
1247///
1248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
1249#[inline]
1250#[target_feature(enable = "avx512fp16")]
1251#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1252#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        // Lane 0 comes from `b` when mask bit 0 is set, otherwise from `src`;
        // lanes 1..=7 always come from `a`.
        let mut mov: f16 = simd_extract!(src, 0);
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}
1262
1263/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1264/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
1265/// elements from a to the upper elements of dst.
1266///
1267/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
1268#[inline]
1269#[target_feature(enable = "avx512fp16")]
1270#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1271#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        // Lane 0 comes from `b` when mask bit 0 is set, otherwise it is zeroed;
        // lanes 1..=7 always come from `a`.
        let mut mov: f16 = 0.;
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}
1281
1282/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
1283/// and copy the upper 7 packed elements from a to the upper elements of dst.
1284///
1285/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
1286#[inline]
1287#[target_feature(enable = "avx512fp16")]
1288#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1289#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        // Replace lane 0 of `a` with lane 0 of `b`; lanes 1..=7 of `a` are preserved.
        let mov: f16 = simd_extract!(b, 0);
        simd_insert!(a, 0, mov)
    }
}
1296
1297/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1298/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
1299///
1300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
1301#[inline]
1302#[target_feature(enable = "avx512fp16,avx512vl")]
1303#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1304#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
    // Plain (aligned) store — caller must uphold validity and the 16-byte
    // alignment requirement stated in the docs above.
    *mem_addr.cast() = a;
}
1308
1309/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1310/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
1311///
1312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
1313#[inline]
1314#[target_feature(enable = "avx512fp16,avx512vl")]
1315#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1316#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
    // Plain (aligned) store — caller must uphold validity and the 32-byte
    // alignment requirement stated in the docs above.
    *mem_addr.cast() = a;
}
1320
1321/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1322/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
1323///
1324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
1325#[inline]
1326#[target_feature(enable = "avx512fp16")]
1327#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1328#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
    // Plain (aligned) store — caller must uphold validity and the 64-byte
    // alignment requirement stated in the docs above.
    *mem_addr.cast() = a;
}
1332
1333/// Store the lower half-precision (16-bit) floating-point element from a into memory.
1334///
1335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
1336#[inline]
1337#[target_feature(enable = "avx512fp16")]
1338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1339#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
    // Write only lane 0 of `a` as a single f16.
    *mem_addr = simd_extract!(a, 0);
}
1343
1344/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
1345///
1346/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
1347#[inline]
1348#[target_feature(enable = "avx512fp16")]
1349#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1350pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
1351    asm!(
1352        vps!("vmovdqu16", "{{{k}}}, {src}"),
1353        p = in(reg) mem_addr,
1354        k = in(kreg) k,
1355        src = in(xmm_reg) a,
1356        options(nostack, preserves_flags)
1357    );
1358}
1359
1360/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1361/// The address does not need to be aligned to any particular boundary.
1362///
1363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
1364#[inline]
1365#[target_feature(enable = "avx512fp16,avx512vl")]
1366#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1367#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
    // `write_unaligned` places no alignment requirement on `mem_addr`.
    ptr::write_unaligned(mem_addr.cast(), a);
}
1371
1372/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1373/// The address does not need to be aligned to any particular boundary.
1374///
1375/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
1376#[inline]
1377#[target_feature(enable = "avx512fp16,avx512vl")]
1378#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1379#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
    // `write_unaligned` places no alignment requirement on `mem_addr`.
    ptr::write_unaligned(mem_addr.cast(), a);
}
1383
1384/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1385/// The address does not need to be aligned to any particular boundary.
1386///
1387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
1388#[inline]
1389#[target_feature(enable = "avx512fp16")]
1390#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1391#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
    // `write_unaligned` places no alignment requirement on `mem_addr`.
    ptr::write_unaligned(mem_addr.cast(), a);
}
1395
1396/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1397///
1398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
1399#[inline]
1400#[target_feature(enable = "avx512fp16,avx512vl")]
1401#[cfg_attr(test, assert_instr(vaddph))]
1402#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1403#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
    // Lane-wise addition via the portable `simd_add` intrinsic.
    unsafe { simd_add(a, b) }
}
1407
1408/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1409/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1410///
1411/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
1412#[inline]
1413#[target_feature(enable = "avx512fp16,avx512vl")]
1414#[cfg_attr(test, assert_instr(vaddph))]
1415#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1416#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        // Compute the full sum, then merge: lanes whose mask bit is clear take `src`.
        let r = _mm_add_ph(a, b);
        simd_select_bitmask(k, r, src)
    }
}
1423
1424/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1425/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1426///
1427/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
1428#[inline]
1429#[target_feature(enable = "avx512fp16,avx512vl")]
1430#[cfg_attr(test, assert_instr(vaddph))]
1431#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1432#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1433pub const fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1434    unsafe {
1435        let r = _mm_add_ph(a, b);
1436        simd_select_bitmask(k, r, _mm_setzero_ph())
1437    }
1438}
1439
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: lane-wise addition of two 16 x f16 vectors; no memory access involved.
    unsafe { simd_add(a, b) }
}
1451
1452/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1453/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1454///
1455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1456#[inline]
1457#[target_feature(enable = "avx512fp16,avx512vl")]
1458#[cfg_attr(test, assert_instr(vaddph))]
1459#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1460#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1461pub const fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1462    unsafe {
1463        let r = _mm256_add_ph(a, b);
1464        simd_select_bitmask(k, r, src)
1465    }
1466}
1467
1468/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1469/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1470///
1471/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1472#[inline]
1473#[target_feature(enable = "avx512fp16,avx512vl")]
1474#[cfg_attr(test, assert_instr(vaddph))]
1475#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1476#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1477pub const fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1478    unsafe {
1479        let r = _mm256_add_ph(a, b);
1480        simd_select_bitmask(k, r, _mm256_setzero_ph())
1481    }
1482}
1483
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: lane-wise addition of two 32 x f16 vectors; no memory access involved.
    unsafe { simd_add(a, b) }
}
1495
1496/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1497/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1498///
1499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1500#[inline]
1501#[target_feature(enable = "avx512fp16")]
1502#[cfg_attr(test, assert_instr(vaddph))]
1503#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1504#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1505pub const fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1506    unsafe {
1507        let r = _mm512_add_ph(a, b);
1508        simd_select_bitmask(k, r, src)
1509    }
1510}
1511
1512/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1513/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1514///
1515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1516#[inline]
1517#[target_feature(enable = "avx512fp16")]
1518#[cfg_attr(test, assert_instr(vaddph))]
1519#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1520#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1521pub const fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1522    unsafe {
1523        let r = _mm512_add_ph(a, b);
1524        simd_select_bitmask(k, r, _mm512_setzero_ph())
1525    }
1526}
1527
1528/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1529/// Rounding is done according to the rounding parameter, which can be one of:
1530///
1531/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1532/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1533/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1534/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1535/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1536///
1537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
1538#[inline]
1539#[target_feature(enable = "avx512fp16")]
1540#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1541#[rustc_legacy_const_generics(2)]
1542#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1543pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1544    unsafe {
1545        static_assert_rounding!(ROUNDING);
1546        vaddph(a, b, ROUNDING)
1547    }
1548}
1549
1550/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1551/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1552/// Rounding is done according to the rounding parameter, which can be one of:
1553///
1554/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1555/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1556/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1557/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1558/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1559///
1560/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1561#[inline]
1562#[target_feature(enable = "avx512fp16")]
1563#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1564#[rustc_legacy_const_generics(4)]
1565#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1566pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1567    src: __m512h,
1568    k: __mmask32,
1569    a: __m512h,
1570    b: __m512h,
1571) -> __m512h {
1572    unsafe {
1573        static_assert_rounding!(ROUNDING);
1574        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1575        simd_select_bitmask(k, r, src)
1576    }
1577}
1578
1579/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1580/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1581/// Rounding is done according to the rounding parameter, which can be one of:
1582///
1583/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1584/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1585/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1586/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1587///
1588/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
1589#[inline]
1590#[target_feature(enable = "avx512fp16")]
1591#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1592#[rustc_legacy_const_generics(3)]
1593#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1594pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
1595    k: __mmask32,
1596    a: __m512h,
1597    b: __m512h,
1598) -> __m512h {
1599    unsafe {
1600        static_assert_rounding!(ROUNDING);
1601        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1602        simd_select_bitmask(k, r, _mm512_setzero_ph())
1603    }
1604}
1605
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask 0xff has bit 0 set, so the low lane is always computed and the
    // all-zero `src` vector is never observed.
    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
1626
1627/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1628/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1629/// writemask k (the element is copied from src when mask bit 0 is not set).
1630/// Rounding is done according to the rounding parameter, which can be one of:
1631///
1632/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1633/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1634/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1635/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1636/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1637///
1638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
1639#[inline]
1640#[target_feature(enable = "avx512fp16")]
1641#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1642#[rustc_legacy_const_generics(4)]
1643#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1644pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
1645    src: __m128h,
1646    k: __mmask8,
1647    a: __m128h,
1648    b: __m128h,
1649) -> __m128h {
1650    unsafe {
1651        static_assert_rounding!(ROUNDING);
1652        vaddsh(a, b, src, k, ROUNDING)
1653    }
1654}
1655
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zeroing behavior falls out of passing an all-zero `src` to the masked form.
    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
1677
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
    // Replace lane 0 of `a` with a[0] + b[0]; lanes 1..8 of `a` pass through.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
}
1690
1691/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1692/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1693/// writemask k (the element is copied from src when mask bit 0 is not set).
1694///
1695/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1696#[inline]
1697#[target_feature(enable = "avx512fp16")]
1698#[cfg_attr(test, assert_instr(vaddsh))]
1699#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1700#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1701pub const fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1702    unsafe {
1703        let extractsrc: f16 = simd_extract!(src, 0);
1704        let mut add: f16 = extractsrc;
1705        if (k & 0b00000001) != 0 {
1706            let extracta: f16 = simd_extract!(a, 0);
1707            let extractb: f16 = simd_extract!(b, 0);
1708            add = extracta + extractb;
1709        }
1710        simd_insert!(a, 0, add)
1711    }
1712}
1713
1714/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1715/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1716/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1717///
1718/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1719#[inline]
1720#[target_feature(enable = "avx512fp16")]
1721#[cfg_attr(test, assert_instr(vaddsh))]
1722#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1723#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1724pub const fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1725    unsafe {
1726        let mut add: f16 = 0.;
1727        if (k & 0b00000001) != 0 {
1728            let extracta: f16 = simd_extract!(a, 0);
1729            let extractb: f16 = simd_extract!(b, 0);
1730            add = extracta + extractb;
1731        }
1732        simd_insert!(a, 0, add)
1733    }
1734}
1735
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: lane-wise subtraction of two 8 x f16 vectors; no memory access involved.
    unsafe { simd_sub(a, b) }
}
1747
1748/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1749/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1750///
1751/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1752#[inline]
1753#[target_feature(enable = "avx512fp16,avx512vl")]
1754#[cfg_attr(test, assert_instr(vsubph))]
1755#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1756#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1757pub const fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1758    unsafe {
1759        let r = _mm_sub_ph(a, b);
1760        simd_select_bitmask(k, r, src)
1761    }
1762}
1763
1764/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1765/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1766///
1767/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1768#[inline]
1769#[target_feature(enable = "avx512fp16,avx512vl")]
1770#[cfg_attr(test, assert_instr(vsubph))]
1771#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1772#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1773pub const fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1774    unsafe {
1775        let r = _mm_sub_ph(a, b);
1776        simd_select_bitmask(k, r, _mm_setzero_ph())
1777    }
1778}
1779
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: lane-wise subtraction of two 16 x f16 vectors; no memory access involved.
    unsafe { simd_sub(a, b) }
}
1791
1792/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1793/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1794///
1795/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1796#[inline]
1797#[target_feature(enable = "avx512fp16,avx512vl")]
1798#[cfg_attr(test, assert_instr(vsubph))]
1799#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1800#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1801pub const fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1802    unsafe {
1803        let r = _mm256_sub_ph(a, b);
1804        simd_select_bitmask(k, r, src)
1805    }
1806}
1807
1808/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1809/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1810///
1811/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1812#[inline]
1813#[target_feature(enable = "avx512fp16,avx512vl")]
1814#[cfg_attr(test, assert_instr(vsubph))]
1815#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1816#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1817pub const fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1818    unsafe {
1819        let r = _mm256_sub_ph(a, b);
1820        simd_select_bitmask(k, r, _mm256_setzero_ph())
1821    }
1822}
1823
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
    // SAFETY: lane-wise subtraction of two 32 x f16 vectors; no memory access involved.
    unsafe { simd_sub(a, b) }
}
1835
1836/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1837/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1838///
1839/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1840#[inline]
1841#[target_feature(enable = "avx512fp16")]
1842#[cfg_attr(test, assert_instr(vsubph))]
1843#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1844#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1845pub const fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1846    unsafe {
1847        let r = _mm512_sub_ph(a, b);
1848        simd_select_bitmask(k, r, src)
1849    }
1850}
1851
1852/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1853/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1854///
1855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1856#[inline]
1857#[target_feature(enable = "avx512fp16")]
1858#[cfg_attr(test, assert_instr(vsubph))]
1859#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1860#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1861pub const fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1862    unsafe {
1863        let r = _mm512_sub_ph(a, b);
1864        simd_select_bitmask(k, r, _mm512_setzero_ph())
1865    }
1866}
1867
1868/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1869/// Rounding is done according to the rounding parameter, which can be one of:
1870///
1871/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1872/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1873/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1874/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1875/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1876///
1877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
1878#[inline]
1879#[target_feature(enable = "avx512fp16")]
1880#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1881#[rustc_legacy_const_generics(2)]
1882#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1883pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1884    unsafe {
1885        static_assert_rounding!(ROUNDING);
1886        vsubph(a, b, ROUNDING)
1887    }
1888}
1889
1890/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1891/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1892/// Rounding is done according to the rounding parameter, which can be one of:
1893///
1894/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1895/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1896/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1897/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1898/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1899///
1900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1901#[inline]
1902#[target_feature(enable = "avx512fp16")]
1903#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1904#[rustc_legacy_const_generics(4)]
1905#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1906pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1907    src: __m512h,
1908    k: __mmask32,
1909    a: __m512h,
1910    b: __m512h,
1911) -> __m512h {
1912    unsafe {
1913        static_assert_rounding!(ROUNDING);
1914        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1915        simd_select_bitmask(k, r, src)
1916    }
1917}
1918
1919/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1920/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1921/// Rounding is done according to the rounding parameter, which can be one of:
1922///
1923/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1924/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1925/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1926/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1928///
1929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1930#[inline]
1931#[target_feature(enable = "avx512fp16")]
1932#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1933#[rustc_legacy_const_generics(3)]
1934#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1935pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1936    k: __mmask32,
1937    a: __m512h,
1938    b: __m512h,
1939) -> __m512h {
1940    unsafe {
1941        static_assert_rounding!(ROUNDING);
1942        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1943        simd_select_bitmask(k, r, _mm512_setzero_ph())
1944    }
1945}
1946
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask 0xff has bit 0 set, so the low lane is always computed and the
    // all-zero `src` vector is never observed.
    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
1967
1968/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1969/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1970/// writemask k (the element is copied from src when mask bit 0 is not set).
1971/// Rounding is done according to the rounding parameter, which can be one of:
1972///
1973/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1974/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1975/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1976/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1977/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1978///
1979/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
1980#[inline]
1981#[target_feature(enable = "avx512fp16")]
1982#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1983#[rustc_legacy_const_generics(4)]
1984#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
1985pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
1986    src: __m128h,
1987    k: __mmask8,
1988    a: __m128h,
1989    b: __m128h,
1990) -> __m128h {
1991    unsafe {
1992        static_assert_rounding!(ROUNDING);
1993        vsubsh(a, b, src, k, ROUNDING)
1994    }
1995}
1996
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zeroing behavior falls out of passing an all-zero `src` to the masked form.
    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
2018
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
    // Replace lane 0 of `a` with a[0] - b[0]; lanes 1..8 of `a` pass through.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
}
2031
2032/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
2033/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2034/// writemask k (the element is copied from src when mask bit 0 is not set).
2035///
2036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
2037#[inline]
2038#[target_feature(enable = "avx512fp16")]
2039#[cfg_attr(test, assert_instr(vsubsh))]
2040#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2041#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2042pub const fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2043    unsafe {
2044        let extractsrc: f16 = simd_extract!(src, 0);
2045        let mut add: f16 = extractsrc;
2046        if (k & 0b00000001) != 0 {
2047            let extracta: f16 = simd_extract!(a, 0);
2048            let extractb: f16 = simd_extract!(b, 0);
2049            add = extracta - extractb;
2050        }
2051        simd_insert!(a, 0, add)
2052    }
2053}
2054
2055/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
2056/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2057/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2058///
2059/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
2060#[inline]
2061#[target_feature(enable = "avx512fp16")]
2062#[cfg_attr(test, assert_instr(vsubsh))]
2063#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2064#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2065pub const fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2066    unsafe {
2067        let mut add: f16 = 0.;
2068        if (k & 0b00000001) != 0 {
2069            let extracta: f16 = simd_extract!(a, 0);
2070            let extractb: f16 = simd_extract!(b, 0);
2071            add = extracta - extractb;
2072        }
2073        simd_insert!(a, 0, add)
2074    }
2075}
2076
2077/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2078///
2079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
2080#[inline]
2081#[target_feature(enable = "avx512fp16,avx512vl")]
2082#[cfg_attr(test, assert_instr(vmulph))]
2083#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2084#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2085pub const fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
2086    unsafe { simd_mul(a, b) }
2087}
2088
2089/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2090/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2091///
2092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
2093#[inline]
2094#[target_feature(enable = "avx512fp16,avx512vl")]
2095#[cfg_attr(test, assert_instr(vmulph))]
2096#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2097#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2098pub const fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2099    unsafe {
2100        let r = _mm_mul_ph(a, b);
2101        simd_select_bitmask(k, r, src)
2102    }
2103}
2104
2105/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2106/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2107///
2108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
2109#[inline]
2110#[target_feature(enable = "avx512fp16,avx512vl")]
2111#[cfg_attr(test, assert_instr(vmulph))]
2112#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2113#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2114pub const fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2115    unsafe {
2116        let r = _mm_mul_ph(a, b);
2117        simd_select_bitmask(k, r, _mm_setzero_ph())
2118    }
2119}
2120
2121/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2122///
2123/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
2124#[inline]
2125#[target_feature(enable = "avx512fp16,avx512vl")]
2126#[cfg_attr(test, assert_instr(vmulph))]
2127#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2128#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2129pub const fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
2130    unsafe { simd_mul(a, b) }
2131}
2132
2133/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2134/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2135///
2136/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2137#[inline]
2138#[target_feature(enable = "avx512fp16,avx512vl")]
2139#[cfg_attr(test, assert_instr(vmulph))]
2140#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2141#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2142pub const fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2143    unsafe {
2144        let r = _mm256_mul_ph(a, b);
2145        simd_select_bitmask(k, r, src)
2146    }
2147}
2148
2149/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2150/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2151///
2152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2153#[inline]
2154#[target_feature(enable = "avx512fp16,avx512vl")]
2155#[cfg_attr(test, assert_instr(vmulph))]
2156#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2157#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2158pub const fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2159    unsafe {
2160        let r = _mm256_mul_ph(a, b);
2161        simd_select_bitmask(k, r, _mm256_setzero_ph())
2162    }
2163}
2164
2165/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2166///
2167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
2168#[inline]
2169#[target_feature(enable = "avx512fp16")]
2170#[cfg_attr(test, assert_instr(vmulph))]
2171#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2172#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2173pub const fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
2174    unsafe { simd_mul(a, b) }
2175}
2176
2177/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2178/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2179///
2180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2181#[inline]
2182#[target_feature(enable = "avx512fp16")]
2183#[cfg_attr(test, assert_instr(vmulph))]
2184#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2185#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2186pub const fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2187    unsafe {
2188        let r = _mm512_mul_ph(a, b);
2189        simd_select_bitmask(k, r, src)
2190    }
2191}
2192
2193/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2194/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2195///
2196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2197#[inline]
2198#[target_feature(enable = "avx512fp16")]
2199#[cfg_attr(test, assert_instr(vmulph))]
2200#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2201#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2202pub const fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2203    unsafe {
2204        let r = _mm512_mul_ph(a, b);
2205        simd_select_bitmask(k, r, _mm512_setzero_ph())
2206    }
2207}
2208
2209/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2210/// Rounding is done according to the rounding parameter, which can be one of:
2211///
2212/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2213/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2214/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2215/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2216/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2217///
2218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2219#[inline]
2220#[target_feature(enable = "avx512fp16")]
2221#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2222#[rustc_legacy_const_generics(2)]
2223#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2224pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2225    unsafe {
2226        static_assert_rounding!(ROUNDING);
2227        vmulph(a, b, ROUNDING)
2228    }
2229}
2230
2231/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2232/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2233/// Rounding is done according to the rounding parameter, which can be one of:
2234///
2235/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2236/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2237/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2238/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2239/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2240///
2241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2242#[inline]
2243#[target_feature(enable = "avx512fp16")]
2244#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2245#[rustc_legacy_const_generics(4)]
2246#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2247pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2248    src: __m512h,
2249    k: __mmask32,
2250    a: __m512h,
2251    b: __m512h,
2252) -> __m512h {
2253    unsafe {
2254        static_assert_rounding!(ROUNDING);
2255        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2256        simd_select_bitmask(k, r, src)
2257    }
2258}
2259
2260/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2261/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2262/// Rounding is done according to the rounding parameter, which can be one of:
2263///
2264/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2265/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2266/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2267/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2268/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2269///
2270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2271#[inline]
2272#[target_feature(enable = "avx512fp16")]
2273#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2274#[rustc_legacy_const_generics(3)]
2275#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2276pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2277    k: __mmask32,
2278    a: __m512h,
2279    b: __m512h,
2280) -> __m512h {
2281    unsafe {
2282        static_assert_rounding!(ROUNDING);
2283        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2284        simd_select_bitmask(k, r, _mm512_setzero_ph())
2285    }
2286}
2287
2288/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2289/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2290/// Rounding is done according to the rounding parameter, which can be one of:
2291///
2292/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2293/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2294/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2295/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2296/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2297///
2298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
2299#[inline]
2300#[target_feature(enable = "avx512fp16")]
2301#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2302#[rustc_legacy_const_generics(2)]
2303#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2304pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2305    static_assert_rounding!(ROUNDING);
2306    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2307}
2308
2309/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2310/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2311/// writemask k (the element is copied from src when mask bit 0 is not set).
2312/// Rounding is done according to the rounding parameter, which can be one of:
2313///
2314/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2315/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2316/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2317/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2318/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2319///
2320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2321#[inline]
2322#[target_feature(enable = "avx512fp16")]
2323#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2324#[rustc_legacy_const_generics(4)]
2325#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2326pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2327    src: __m128h,
2328    k: __mmask8,
2329    a: __m128h,
2330    b: __m128h,
2331) -> __m128h {
2332    unsafe {
2333        static_assert_rounding!(ROUNDING);
2334        vmulsh(a, b, src, k, ROUNDING)
2335    }
2336}
2337
2338/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2339/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2340/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2341/// Rounding is done according to the rounding parameter, which can be one of:
2342///
2343/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2344/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2345/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2346/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2347/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2348///
2349/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
2350#[inline]
2351#[target_feature(enable = "avx512fp16")]
2352#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2353#[rustc_legacy_const_generics(3)]
2354#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2355pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2356    static_assert_rounding!(ROUNDING);
2357    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2358}
2359
2360/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2361/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2362///
2363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
2364#[inline]
2365#[target_feature(enable = "avx512fp16")]
2366#[cfg_attr(test, assert_instr(vmulsh))]
2367#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2368#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2369pub const fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2370    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
2371}
2372
2373/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2374/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2375/// writemask k (the element is copied from src when mask bit 0 is not set).
2376///
2377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2378#[inline]
2379#[target_feature(enable = "avx512fp16")]
2380#[cfg_attr(test, assert_instr(vmulsh))]
2381#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2382#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2383pub const fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2384    unsafe {
2385        let extractsrc: f16 = simd_extract!(src, 0);
2386        let mut add: f16 = extractsrc;
2387        if (k & 0b00000001) != 0 {
2388            let extracta: f16 = simd_extract!(a, 0);
2389            let extractb: f16 = simd_extract!(b, 0);
2390            add = extracta * extractb;
2391        }
2392        simd_insert!(a, 0, add)
2393    }
2394}
2395
2396/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2397/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2398/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2399///
2400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2401#[inline]
2402#[target_feature(enable = "avx512fp16")]
2403#[cfg_attr(test, assert_instr(vmulsh))]
2404#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2405#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2406pub const fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2407    unsafe {
2408        let mut add: f16 = 0.;
2409        if (k & 0b00000001) != 0 {
2410            let extracta: f16 = simd_extract!(a, 0);
2411            let extractb: f16 = simd_extract!(b, 0);
2412            add = extracta * extractb;
2413        }
2414        simd_insert!(a, 0, add)
2415    }
2416}
2417
2418/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2419///
2420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
2421#[inline]
2422#[target_feature(enable = "avx512fp16,avx512vl")]
2423#[cfg_attr(test, assert_instr(vdivph))]
2424#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2425#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2426pub const fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
2427    unsafe { simd_div(a, b) }
2428}
2429
2430/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2431/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2432///
2433/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2434#[inline]
2435#[target_feature(enable = "avx512fp16,avx512vl")]
2436#[cfg_attr(test, assert_instr(vdivph))]
2437#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2438#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2439pub const fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2440    unsafe {
2441        let r = _mm_div_ph(a, b);
2442        simd_select_bitmask(k, r, src)
2443    }
2444}
2445
2446/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2447/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2448///
2449/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2450#[inline]
2451#[target_feature(enable = "avx512fp16,avx512vl")]
2452#[cfg_attr(test, assert_instr(vdivph))]
2453#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2454#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2455pub const fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2456    unsafe {
2457        let r = _mm_div_ph(a, b);
2458        simd_select_bitmask(k, r, _mm_setzero_ph())
2459    }
2460}
2461
2462/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2463///
2464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
2465#[inline]
2466#[target_feature(enable = "avx512fp16,avx512vl")]
2467#[cfg_attr(test, assert_instr(vdivph))]
2468#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2469#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2470pub const fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
2471    unsafe { simd_div(a, b) }
2472}
2473
2474/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2475/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2476///
2477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2478#[inline]
2479#[target_feature(enable = "avx512fp16,avx512vl")]
2480#[cfg_attr(test, assert_instr(vdivph))]
2481#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2482#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2483pub const fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2484    unsafe {
2485        let r = _mm256_div_ph(a, b);
2486        simd_select_bitmask(k, r, src)
2487    }
2488}
2489
2490/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2491/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2492///
2493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2494#[inline]
2495#[target_feature(enable = "avx512fp16,avx512vl")]
2496#[cfg_attr(test, assert_instr(vdivph))]
2497#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2498#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2499pub const fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2500    unsafe {
2501        let r = _mm256_div_ph(a, b);
2502        simd_select_bitmask(k, r, _mm256_setzero_ph())
2503    }
2504}
2505
2506/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2507///
2508/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
2509#[inline]
2510#[target_feature(enable = "avx512fp16")]
2511#[cfg_attr(test, assert_instr(vdivph))]
2512#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2513#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2514pub const fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
2515    unsafe { simd_div(a, b) }
2516}
2517
2518/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2519/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2520///
2521/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2522#[inline]
2523#[target_feature(enable = "avx512fp16")]
2524#[cfg_attr(test, assert_instr(vdivph))]
2525#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2526#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2527pub const fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2528    unsafe {
2529        let r = _mm512_div_ph(a, b);
2530        simd_select_bitmask(k, r, src)
2531    }
2532}
2533
2534/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2535/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2536///
2537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2538#[inline]
2539#[target_feature(enable = "avx512fp16")]
2540#[cfg_attr(test, assert_instr(vdivph))]
2541#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2542#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2543pub const fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2544    unsafe {
2545        let r = _mm512_div_ph(a, b);
2546        simd_select_bitmask(k, r, _mm512_setzero_ph())
2547    }
2548}
2549
2550/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2551/// Rounding is done according to the rounding parameter, which can be one of:
2552///
2553/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2554/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2555/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2556/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2557/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2558///
2559/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
2560#[inline]
2561#[target_feature(enable = "avx512fp16")]
2562#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2563#[rustc_legacy_const_generics(2)]
2564#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2565pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2566    unsafe {
2567        static_assert_rounding!(ROUNDING);
2568        vdivph(a, b, ROUNDING)
2569    }
2570}
2571
2572/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2573/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2574/// Rounding is done according to the rounding parameter, which can be one of:
2575///
2576/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2577/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2578/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2579/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2580/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2581///
2582/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2583#[inline]
2584#[target_feature(enable = "avx512fp16")]
2585#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2586#[rustc_legacy_const_generics(4)]
2587#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2588pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2589    src: __m512h,
2590    k: __mmask32,
2591    a: __m512h,
2592    b: __m512h,
2593) -> __m512h {
2594    unsafe {
2595        static_assert_rounding!(ROUNDING);
2596        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2597        simd_select_bitmask(k, r, src)
2598    }
2599}
2600
2601/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2602/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2603/// Rounding is done according to the rounding parameter, which can be one of:
2604///
2605/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2606/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2607/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2608/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2609/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2610///
2611/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
2612#[inline]
2613#[target_feature(enable = "avx512fp16")]
2614#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2615#[rustc_legacy_const_generics(3)]
2616#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2617pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
2618    k: __mmask32,
2619    a: __m512h,
2620    b: __m512h,
2621) -> __m512h {
2622    unsafe {
2623        static_assert_rounding!(ROUNDING);
2624        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2625        simd_select_bitmask(k, r, _mm512_setzero_ph())
2626    }
2627}
2628
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Delegate to the masked form with an all-ones mask; the zero `src` is never
    // selected because mask bit 0 is always set here.
    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
2649
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // The LLVM intrinsic handles masking and src merging itself.
        vdivsh(a, b, src, k, ROUNDING)
    }
}
2678
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking == write-masking with an all-zero `src`.
    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
2700
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
    // Scalar divide of lane 0 only; lanes 1..=7 come from `a` via simd_insert!.
    // Expressed as plain `f16` arithmetic so the function can be `const`.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
}
2713
2714/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2715/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2716/// writemask k (the element is copied from src when mask bit 0 is not set).
2717///
2718/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2719#[inline]
2720#[target_feature(enable = "avx512fp16")]
2721#[cfg_attr(test, assert_instr(vdivsh))]
2722#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2723#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2724pub const fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2725    unsafe {
2726        let extractsrc: f16 = simd_extract!(src, 0);
2727        let mut add: f16 = extractsrc;
2728        if (k & 0b00000001) != 0 {
2729            let extracta: f16 = simd_extract!(a, 0);
2730            let extractb: f16 = simd_extract!(b, 0);
2731            add = extracta / extractb;
2732        }
2733        simd_insert!(a, 0, add)
2734    }
2735}
2736
2737/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2738/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2739/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2740///
2741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2742#[inline]
2743#[target_feature(enable = "avx512fp16")]
2744#[cfg_attr(test, assert_instr(vdivsh))]
2745#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
2746#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2747pub const fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2748    unsafe {
2749        let mut add: f16 = 0.;
2750        if (k & 0b00000001) != 0 {
2751            let extracta: f16 = simd_extract!(a, 0);
2752            let extractb: f16 = simd_extract!(b, 0);
2753            add = extracta / extractb;
2754        }
2755        simd_insert!(a, 0, add)
2756    }
2757}
2758
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
    // All mask bits set, so the undefined `src` lanes are never selected.
    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
}
2771
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // The LLVM intrinsic operates on a different vector representation, hence the
    // transmutes on the way in and out; masking/merging is done by the intrinsic.
    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
}
2784
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking == write-masking with an all-zero `src`.
    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
}
2797
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
    // All mask bits set, so the undefined `src` lanes are never selected.
    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
}
2810
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // The LLVM intrinsic operates on a different vector representation, hence the
    // transmutes; masking/merging with `src` is done by the intrinsic itself.
    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
}
2823
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // Zero-masking == write-masking with an all-zero `src`.
    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
}
2836
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
    // All 16 mask bits set (16 complex pairs in 512 bits), so the undefined
    // `src` lanes are never selected.
    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
}
2849
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
2862
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Zero-masking == write-masking with an all-zero `src`.
    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
}
2875
/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // All mask bits set, so the undefined `src` lanes are never selected.
    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
}
2898
/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // The LLVM intrinsic operates on a different vector representation, hence
        // the transmutes; masking/merging with `src` is done by the intrinsic.
        transmute(vfmulcph_512(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
2934
/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking == write-masking with an all-zero `src`.
    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}
2961
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
    // All mask bits set, so the zero `src` is never selected.
    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
2975
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
2989
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking == write-masking with an all-zero `src`.
    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b)
}
3003
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // All mask bits set, so the zero `src` is never selected.
    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
3027
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // The LLVM intrinsic operates on a different vector representation, hence
        // the transmutes; masking/merging with `src` is done by the intrinsic.
        transmute(vfmulcsh(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
3064
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking == write-masking with an all-zero `src`.
    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
3092
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm_mul_pch(a, b)
}
3105
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm_mask_mul_pch(src, k, a, b)
}
3118
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm_maskz_mul_pch(k, a, b)
}
3131
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm256_mul_pch(a, b)
}
3144
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm256_mask_mul_pch(src, k, a, b)
}
3157
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm256_maskz_mul_pch(k, a, b)
}
3170
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm512_mul_pch(a, b)
}
3182
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fmul_pch` is an alias of `mul_pch` in Intel's API; same operation.
    _mm512_mask_mul_pch(src, k, a, b)
}
3195
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fmul_pch` is a pure alias of `mul_pch`; forward to the zero-masked multiply.
    _mm512_maskz_mul_pch(k, a, b)
}
3208
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    // Reject invalid rounding constants at compile time, then forward to the
    // `mul` spelling — `fmul_pch` is a pure alias.
    static_assert_rounding!(ROUNDING);
    _mm512_mul_round_pch::<ROUNDING>(a, b)
}
3229
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding constants at compile time, then forward to the
    // `mul` spelling — `fmul_pch` is a pure alias.
    static_assert_rounding!(ROUNDING);
    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
}
3256
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding constants at compile time, then forward to the
    // `mul` spelling — `fmul_pch` is a pure alias.
    static_assert_rounding!(ROUNDING);
    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
}
3282
/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // `fmul_sch` is a pure alias of `mul_sch`.
    _mm_mul_sch(a, b)
}
3295
/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul_sch` is a pure alias of `mul_sch`; forward to the merge-masked multiply.
    _mm_mask_mul_sch(src, k, a, b)
}
3308
/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul_sch` is a pure alias of `mul_sch`; forward to the zero-masked multiply.
    _mm_maskz_mul_sch(k, a, b)
}
3321
/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Reject invalid rounding constants at compile time, then forward to the
    // `mul` spelling — `fmul_sch` is a pure alias.
    static_assert_rounding!(ROUNDING);
    _mm_mul_round_sch::<ROUNDING>(a, b)
}
3343
/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding constants at compile time, then forward to the
    // `mul` spelling — `fmul_sch` is a pure alias.
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
}
3371
/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding constants at compile time, then forward to the
    // `mul` spelling — `fmul_sch` is a pure alias.
    static_assert_rounding!(ROUNDING);
    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
}
3398
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
    // All mask bits set: every element is computed, so the undefined `src` is
    // never selected into the result.
    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
}
3412
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // The transmutes only reinterpret bits between `__m128h` and the vector
    // representation expected by the `vfcmulcph_128` binding (declared
    // elsewhere in this module); no values are converted.
    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
}
3426
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is merge-masking with an all-zero merge source.
    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
}
3440
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
    // All mask bits set: every element is computed, so the undefined `src` is
    // never selected into the result.
    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
}
3454
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // The transmutes only reinterpret bits between `__m256h` and the vector
    // representation expected by the `vfcmulcph_256` binding (declared
    // elsewhere in this module); no values are converted.
    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
}
3468
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // Zero-masking is merge-masking with an all-zero merge source.
    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
}
3482
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
    // All mask bits set: every element is computed, so the undefined `src` is
    // never selected into the result.
    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
}
3496
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Forward to the `_round` variant using the current MXCSR rounding mode.
    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
3510
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Zero-masking is merge-masking with an all-zero merge source.
    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
}
3524
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // All mask bits set: every element is computed, so the undefined `src` is
    // never selected into the result.
    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
}
3548
3549/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3550/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3551/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3552/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3553///
3554/// Rounding is done according to the rounding parameter, which can be one of:
3555///
3556/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3557/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3558/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3559/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3560/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3561///
3562/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
3563#[inline]
3564#[target_feature(enable = "avx512fp16")]
3565#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3566#[rustc_legacy_const_generics(4)]
3567#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3568pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
3569    src: __m512h,
3570    k: __mmask16,
3571    a: __m512h,
3572    b: __m512h,
3573) -> __m512h {
3574    unsafe {
3575        static_assert_rounding!(ROUNDING);
3576        transmute(vfcmulcph_512(
3577            transmute(a),
3578            transmute(b),
3579            transmute(src),
3580            k,
3581            ROUNDING,
3582        ))
3583    }
3584}
3585
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is merge-masking with an all-zero merge source.
    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}
3613
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // All mask bits set with an all-zero merge source.
    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
3626
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Forward to the `_round` variant using the current MXCSR rounding mode.
    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
3640
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is merge-masking with an all-zero merge source.
    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b)
}
3654
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // All mask bits set with an all-zero merge source.
    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
3677
3678/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3679/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3680/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3681/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3682///
3683/// Rounding is done according to the rounding parameter, which can be one of:
3684///
3685/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3686/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3687/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3688/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3689/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3690///
3691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
3692#[inline]
3693#[target_feature(enable = "avx512fp16")]
3694#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3695#[rustc_legacy_const_generics(4)]
3696#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
3697pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
3698    src: __m128h,
3699    k: __mmask8,
3700    a: __m128h,
3701    b: __m128h,
3702) -> __m128h {
3703    unsafe {
3704        static_assert_rounding!(ROUNDING);
3705        transmute(vfcmulcsh(
3706            transmute(a),
3707            transmute(b),
3708            transmute(src),
3709            k,
3710            ROUNDING,
3711        ))
3712    }
3713}
3714
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is merge-masking with an all-zero merge source.
    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
3742
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_cmul_pch(a, b)
}
3756
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_mask_cmul_pch(src, k, a, b)
}
3770
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_maskz_cmul_pch(k, a, b)
}
3784
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm256_cmul_pch(a, b)
}
3798
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm256_mask_cmul_pch(src, k, a, b)
}
3812
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm256_maskz_cmul_pch(k, a, b)
}
3826
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm512_cmul_pch(a, b)
}
3840
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm512_mask_cmul_pch(src, k, a, b)
}
3854
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm512_maskz_cmul_pch(k, a, b)
}
3868
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm512_cmul_round_pch::<ROUNDING>(a, b)
}
3891
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
}
3920
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
}
3948
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_cmul_sch(a, b)
}
3962
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_mask_cmul_sch(src, k, a, b)
}
3976
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_maskz_cmul_sch(k, a, b)
}
3990
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_cmul_round_sch::<ROUNDING>(a, b)
}
4013
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
}
4042
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` and `cmul` name the same conjugate-multiply operation; delegate.
    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
}
4070
/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_abs_ph(v2: __m128h) -> __m128h {
    // ANDing each 16-bit lane with `i16::MAX` (0x7FFF) clears the f16 sign bit,
    // which is |x| for any half-precision value (including NaN payloads).
    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
}
4082
/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_ph(v2: __m256h) -> __m256h {
    // ANDing each 16-bit lane with `i16::MAX` (0x7FFF) clears the f16 sign bit,
    // which is |x| for any half-precision value.
    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
}
4094
/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_abs_ph(v2: __m512h) -> __m512h {
    // ANDing each 16-bit lane with `i16::MAX` (0x7FFF) clears the f16 sign bit,
    // which is |x| for any half-precision value.
    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
}
4106
/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_conj_pch(a: __m128h) -> __m128h {
    // XOR with `i32::MIN` (0x8000_0000) per 32-bit lane flips bit 31 only —
    // the sign bit of the upper f16 of each pair, i.e. the imaginary part.
    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
}
4120
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    unsafe {
        // View the result as f32 lanes so the bitmask selects at 32-bit
        // granularity: one mask bit per complex (real, imag) f16 pair.
        let r: __m128 = transmute(_mm_conj_pch(a));
        transmute(simd_select_bitmask(k, r, transmute(src)))
    }
}
4137
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
    // Zero-masking is the write-masked form with an all-zero `src` vector.
    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
}
4151
/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_conj_pch(a: __m256h) -> __m256h {
    // XOR with `i32::MIN` (0x8000_0000) per 32-bit lane flips bit 31 only —
    // the sign bit of the upper f16 of each pair, i.e. the imaginary part.
    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
}
4164
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
    unsafe {
        // View the result as f32 lanes so the bitmask selects at 32-bit
        // granularity: one mask bit per complex (real, imag) f16 pair.
        let r: __m256 = transmute(_mm256_conj_pch(a));
        transmute(simd_select_bitmask(k, r, transmute(src)))
    }
}
4181
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
    // Zero-masking is the write-masked form with an all-zero `src` vector.
    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
}
4195
/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_conj_pch(a: __m512h) -> __m512h {
    // XOR with `i32::MIN` (0x8000_0000) per 32-bit lane flips bit 31 only —
    // the sign bit of the upper f16 of each pair, i.e. the imaginary part.
    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
}
4208
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
    unsafe {
        // View the result as f32 lanes so the bitmask selects at 32-bit
        // granularity: one mask bit per complex (real, imag) f16 pair.
        let r: __m512 = transmute(_mm512_conj_pch(a));
        transmute(simd_select_bitmask(k, r, transmute(src)))
    }
}
4225
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
    // Zero-masking is the write-masked form with an all-zero `src` vector.
    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
}
4239
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // All-ones mask: compute every complex element (unmasked form).
    _mm_mask3_fmadd_pch(a, b, c, 0xff)
}
4252
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors.
        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
        // Select in 32-bit lanes (one lane per complex number); masked-off
        // lanes keep the corresponding value from `a`.
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4269
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors;
        // the masking itself is performed inside the LLVM intrinsic.
        transmute(vfmaddcph_mask3_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4290
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors;
        // the zero-masking is performed inside the LLVM intrinsic.
        transmute(vfmaddcph_maskz_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4311
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // All-ones mask: compute all 8 complex elements (unmasked form).
    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
}
4324
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // SAFETY: `__m256h` and `__m256` are both plain 256-bit SIMD vectors.
        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
        // Select in 32-bit lanes (one lane per complex number); masked-off
        // lanes keep the corresponding value from `a`.
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4341
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
    unsafe {
        // SAFETY: `__m256h` and `__m256` are both plain 256-bit SIMD vectors;
        // the masking itself is performed inside the LLVM intrinsic.
        transmute(vfmaddcph_mask3_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4362
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // SAFETY: `__m256h` and `__m256` are both plain 256-bit SIMD vectors;
        // the zero-masking is performed inside the LLVM intrinsic.
        transmute(vfmaddcph_maskz_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4383
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4396
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4410
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4424
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4438
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // All-ones mask: compute all 16 complex elements (unmasked form).
    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}
4461
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask16,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `__m512h` and `__m512` are both plain 512-bit SIMD vectors.
        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does
        // Select in 32-bit lanes (one lane per complex number); masked-off
        // lanes keep the corresponding value from `a`.
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4493
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask16,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `__m512h` and `__m512` are both plain 512-bit SIMD vectors;
        // the masking itself is performed inside the LLVM intrinsic.
        transmute(vfmaddcph_mask3_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4530
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `__m512h` and `__m512` are both plain 512-bit SIMD vectors;
        // the zero-masking is performed inside the LLVM intrinsic.
        transmute(vfmaddcph_maskz_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4567
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4581
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4596
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from c to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4611
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding form using the current MXCSR rounding mode.
    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4626
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors.
        // `0xff` is the all-ones mask, i.e. the unmasked form.
        transmute(vfmaddcsh_mask(
            transmute(a),
            transmute(b),
            transmute(c),
            0xff,
            ROUNDING,
        ))
    }
}
4657
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors.
        let a = transmute(a);
        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what CLang does
        // Low 32 bits (the complex result) come from `r` when k[0] is set,
        // otherwise from `a`; the upper 96 bits are always taken from `a`.
        transmute(_mm_mask_move_ss(a, k, a, r))
    }
}
4691
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from c to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors.
        let c = transmute(c);
        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
        // Low 32 bits (the complex result) come from `r`; the upper 96 bits
        // are taken from `c`, matching the mask3 passthrough semantics.
        transmute(_mm_move_ss(c, r))
    }
}
4725
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors;
        // the zero-masking is performed inside the LLVM intrinsic.
        transmute(vfmaddcsh_maskz(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4763
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // All-ones mask: compute every complex element (unmasked form).
    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
}
4777
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors.
        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
        // Select in 32-bit lanes (one lane per complex number); masked-off
        // lanes keep the corresponding value from `a`.
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4795
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // SAFETY: `__m128h` and `__m128` are both plain 128-bit SIMD vectors;
        // the masking itself is performed inside the LLVM intrinsic.
        transmute(vfcmaddcph_mask3_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4817
4818/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4819/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4820/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4821/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4822/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4823///
4824/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
4825#[inline]
4826#[target_feature(enable = "avx512fp16,avx512vl")]
4827#[cfg_attr(test, assert_instr(vfcmaddcph))]
4828#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4829pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4830    unsafe {
4831        transmute(vfcmaddcph_maskz_128(
4832            transmute(a),
4833            transmute(b),
4834            transmute(c),
4835            k,
4836        ))
4837    }
4838}
4839
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Unmasked form: reuse the mask3 variant with an all-ones writemask so
    // every complex lane is written.
    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
}
4853
4854/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4855/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4856/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4857/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4858/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4859///
4860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
4861#[inline]
4862#[target_feature(enable = "avx512fp16,avx512vl")]
4863#[cfg_attr(test, assert_instr(vfcmaddcph))]
4864#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4865pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4866    unsafe {
4867        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
4868        transmute(simd_select_bitmask(k, r, transmute(a)))
4869    }
4870}
4871
4872/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4873/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4874/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4875/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4876/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4877///
4878/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
4879#[inline]
4880#[target_feature(enable = "avx512fp16,avx512vl")]
4881#[cfg_attr(test, assert_instr(vfcmaddcph))]
4882#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4883pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4884    unsafe {
4885        transmute(vfcmaddcph_mask3_256(
4886            transmute(a),
4887            transmute(b),
4888            transmute(c),
4889            k,
4890        ))
4891    }
4892}
4893
4894/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4895/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4896/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4897/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4898/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4899///
4900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
4901#[inline]
4902#[target_feature(enable = "avx512fp16,avx512vl")]
4903#[cfg_attr(test, assert_instr(vfcmaddcph))]
4904#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
4905pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4906    unsafe {
4907        transmute(vfcmaddcph_maskz_256(
4908            transmute(a),
4909            transmute(b),
4910            transmute(c),
4911            k,
4912        ))
4913    }
4914}
4915
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4929
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4944
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4959
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4974
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Unmasked form: all-ones writemask writes every one of the 16 complex lanes.
    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}
4998
4999/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
5000/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
5001/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
5002/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5003/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5004///
5005/// Rounding is done according to the rounding parameter, which can be one of:
5006///
5007/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5008/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5009/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5010/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5011/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5012///
5013/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
5014#[inline]
5015#[target_feature(enable = "avx512fp16")]
5016#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
5017#[rustc_legacy_const_generics(4)]
5018#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5019pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
5020    a: __m512h,
5021    k: __mmask16,
5022    b: __m512h,
5023    c: __m512h,
5024) -> __m512h {
5025    unsafe {
5026        static_assert_rounding!(ROUNDING);
5027        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does
5028        transmute(simd_select_bitmask(k, r, transmute(a)))
5029    }
5030}
5031
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask16,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Merge-masked LLVM intrinsic: unselected complex lanes keep `c`.
        transmute(vfcmaddcph_mask3_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
5069
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Zero-masked LLVM intrinsic: unselected complex lanes become zero.
        transmute(vfcmaddcph_maskz_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
5107
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
5122
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
5138
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
5154
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding variant using the current MXCSR rounding mode.
    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
5170
5171/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5172/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
5173/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
5174/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
5175/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5176///
5177/// Rounding is done according to the rounding parameter, which can be one of:
5178///
5179/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5180/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5181/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5182/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5183/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5184///
5185/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
5186#[inline]
5187#[target_feature(enable = "avx512fp16")]
5188#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5189#[rustc_legacy_const_generics(3)]
5190#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5191pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5192    unsafe {
5193        static_assert_rounding!(ROUNDING);
5194        transmute(vfcmaddcsh_mask(
5195            transmute(a),
5196            transmute(b),
5197            transmute(c),
5198            0xff,
5199            ROUNDING,
5200        ))
5201    }
5202}
5203
5204/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5205/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5206/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
5207/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5208/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5209/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5210///
5211/// Rounding is done according to the rounding parameter, which can be one of:
5212///
5213/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5214/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5215/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5216/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5217/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5218///
5219/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
5220#[inline]
5221#[target_feature(enable = "avx512fp16")]
5222#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5223#[rustc_legacy_const_generics(4)]
5224#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5225pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
5226    a: __m128h,
5227    k: __mmask8,
5228    b: __m128h,
5229    c: __m128h,
5230) -> __m128h {
5231    unsafe {
5232        static_assert_rounding!(ROUNDING);
5233        let a = transmute(a);
5234        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
5235        transmute(_mm_mask_move_ss(a, k, a, r))
5236    }
5237}
5238
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        let c = transmute(c);
        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
        // `_mm_move_ss(c, r)` takes the low f32 lane (one complex f16 pair)
        // from `r` and bits 32..128 from `c`.
        // NOTE(review): the doc header says the upper 6 elements are copied
        // from `a`, while this blend sources them from `c` — presumably this
        // matches Intel's pseudocode / Clang's lowering for the mask3 form;
        // verify against the Intrinsics Guide.
        transmute(_mm_move_ss(c, r))
    }
}
5273
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Zero-masked LLVM intrinsic: the result lane is zeroed when bit 0 of
        // `k` is not set.
        transmute(vfcmaddcsh_maskz(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
5312
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // `simd_fma` is an element-wise fused multiply-add (single rounding step).
    unsafe { simd_fma(a, b, c) }
}
5325
5326/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5327/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5328/// from a when the corresponding mask bit is not set).
5329///
5330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
5331#[inline]
5332#[target_feature(enable = "avx512fp16,avx512vl")]
5333#[cfg_attr(test, assert_instr(vfmadd))]
5334#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5335#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5336pub const fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5337    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
5338}
5339
5340/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5341/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5342/// from c when the corresponding mask bit is not set).
5343///
5344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5345#[inline]
5346#[target_feature(enable = "avx512fp16,avx512vl")]
5347#[cfg_attr(test, assert_instr(vfmadd))]
5348#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5349#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5350pub const fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5351    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5352}
5353
5354/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5355/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5356/// out when the corresponding mask bit is not set).
5357///
5358/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
5359#[inline]
5360#[target_feature(enable = "avx512fp16,avx512vl")]
5361#[cfg_attr(test, assert_instr(vfmadd))]
5362#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5363#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5364pub const fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5365    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5366}
5367
5368/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5369/// result to packed elements in c, and store the results in dst.
5370///
5371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
5372#[inline]
5373#[target_feature(enable = "avx512fp16,avx512vl")]
5374#[cfg_attr(test, assert_instr(vfmadd))]
5375#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5376#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5377pub const fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5378    unsafe { simd_fma(a, b, c) }
5379}
5380
5381/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5382/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5383/// from a when the corresponding mask bit is not set).
5384///
5385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
5386#[inline]
5387#[target_feature(enable = "avx512fp16,avx512vl")]
5388#[cfg_attr(test, assert_instr(vfmadd))]
5389#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5390#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5391pub const fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5392    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
5393}
5394
5395/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5396/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5397/// from c when the corresponding mask bit is not set).
5398///
5399/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
5400#[inline]
5401#[target_feature(enable = "avx512fp16,avx512vl")]
5402#[cfg_attr(test, assert_instr(vfmadd))]
5403#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5404#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5405pub const fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5406    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
5407}
5408
5409/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5410/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5411/// out when the corresponding mask bit is not set).
5412///
5413/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
5414#[inline]
5415#[target_feature(enable = "avx512fp16,avx512vl")]
5416#[cfg_attr(test, assert_instr(vfmadd))]
5417#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5418#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5419pub const fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5420    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
5421}
5422
5423/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5424/// result to packed elements in c, and store the results in dst.
5425///
5426/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
5427#[inline]
5428#[target_feature(enable = "avx512fp16")]
5429#[cfg_attr(test, assert_instr(vfmadd))]
5430#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5431#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5432pub const fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5433    unsafe { simd_fma(a, b, c) }
5434}
5435
5436/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5437/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5438/// from a when the corresponding mask bit is not set).
5439///
5440/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
5441#[inline]
5442#[target_feature(enable = "avx512fp16")]
5443#[cfg_attr(test, assert_instr(vfmadd))]
5444#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5445#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5446pub const fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5447    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
5448}
5449
5450/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5451/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5452/// from c when the corresponding mask bit is not set).
5453///
5454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
5455#[inline]
5456#[target_feature(enable = "avx512fp16")]
5457#[cfg_attr(test, assert_instr(vfmadd))]
5458#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5459#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5460pub const fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5461    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
5462}
5463
5464/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5465/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5466/// out when the corresponding mask bit is not set).
5467///
5468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
5469#[inline]
5470#[target_feature(enable = "avx512fp16")]
5471#[cfg_attr(test, assert_instr(vfmadd))]
5472#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5473#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5474pub const fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5475    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
5476}
5477
5478/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5479/// result to packed elements in c, and store the results in dst.
5480///
5481/// Rounding is done according to the rounding parameter, which can be one of:
5482///
5483/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5484/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5485/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5486/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5487/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5488///
5489/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
5490#[inline]
5491#[target_feature(enable = "avx512fp16")]
5492#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5493#[rustc_legacy_const_generics(3)]
5494#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5495pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5496    unsafe {
5497        static_assert_rounding!(ROUNDING);
5498        vfmaddph_512(a, b, c, ROUNDING)
5499    }
5500}
5501
5502/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5503/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5504/// from a when the corresponding mask bit is not set).
5505///
5506/// Rounding is done according to the rounding parameter, which can be one of:
5507///
5508/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5509/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5510/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5511/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5512/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5513///
5514/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
5515#[inline]
5516#[target_feature(enable = "avx512fp16")]
5517#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5518#[rustc_legacy_const_generics(4)]
5519#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5520pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
5521    a: __m512h,
5522    k: __mmask32,
5523    b: __m512h,
5524    c: __m512h,
5525) -> __m512h {
5526    unsafe {
5527        static_assert_rounding!(ROUNDING);
5528        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
5529    }
5530}
5531
5532/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5533/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5534/// from c when the corresponding mask bit is not set).
5535///
5536/// Rounding is done according to the rounding parameter, which can be one of:
5537///
5538/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5539/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5540/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5541/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5542/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5543///
5544/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
5545#[inline]
5546#[target_feature(enable = "avx512fp16")]
5547#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5548#[rustc_legacy_const_generics(4)]
5549#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5550pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
5551    a: __m512h,
5552    b: __m512h,
5553    c: __m512h,
5554    k: __mmask32,
5555) -> __m512h {
5556    unsafe {
5557        static_assert_rounding!(ROUNDING);
5558        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
5559    }
5560}
5561
5562/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5563/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5564/// out when the corresponding mask bit is not set).
5565///
5566/// Rounding is done according to the rounding parameter, which can be one of:
5567///
5568/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5569/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5570/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5571/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5572/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5573///
5574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
5575#[inline]
5576#[target_feature(enable = "avx512fp16")]
5577#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5578#[rustc_legacy_const_generics(4)]
5579#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5580pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
5581    k: __mmask32,
5582    a: __m512h,
5583    b: __m512h,
5584    c: __m512h,
5585) -> __m512h {
5586    unsafe {
5587        static_assert_rounding!(ROUNDING);
5588        simd_select_bitmask(
5589            k,
5590            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
5591            _mm512_setzero_ph(),
5592        )
5593    }
5594}
5595
5596/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5597/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5598/// 7 packed elements from a to the upper elements of dst.
5599///
5600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
5601#[inline]
5602#[target_feature(enable = "avx512fp16")]
5603#[cfg_attr(test, assert_instr(vfmadd))]
5604#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5605#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5606pub const fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5607    unsafe {
5608        let extracta: f16 = simd_extract!(a, 0);
5609        let extractb: f16 = simd_extract!(b, 0);
5610        let extractc: f16 = simd_extract!(c, 0);
5611        let r = fmaf16(extracta, extractb, extractc);
5612        simd_insert!(a, 0, r)
5613    }
5614}
5615
5616/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5617/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5618/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5619/// upper elements of dst.
5620///
5621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
5622#[inline]
5623#[target_feature(enable = "avx512fp16")]
5624#[cfg_attr(test, assert_instr(vfmadd))]
5625#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5626#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5627pub const fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5628    unsafe {
5629        let mut fmadd: f16 = simd_extract!(a, 0);
5630        if k & 1 != 0 {
5631            let extractb: f16 = simd_extract!(b, 0);
5632            let extractc: f16 = simd_extract!(c, 0);
5633            fmadd = fmaf16(fmadd, extractb, extractc);
5634        }
5635        simd_insert!(a, 0, fmadd)
5636    }
5637}
5638
5639/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5640/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5641/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5642/// upper elements of dst.
5643///
5644/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
5645#[inline]
5646#[target_feature(enable = "avx512fp16")]
5647#[cfg_attr(test, assert_instr(vfmadd))]
5648#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5649#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5650pub const fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5651    unsafe {
5652        let mut fmadd: f16 = simd_extract!(c, 0);
5653        if k & 1 != 0 {
5654            let extracta: f16 = simd_extract!(a, 0);
5655            let extractb: f16 = simd_extract!(b, 0);
5656            fmadd = fmaf16(extracta, extractb, fmadd);
5657        }
5658        simd_insert!(c, 0, fmadd)
5659    }
5660}
5661
5662/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5663/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5664/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5665/// upper elements of dst.
5666///
5667/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
5668#[inline]
5669#[target_feature(enable = "avx512fp16")]
5670#[cfg_attr(test, assert_instr(vfmadd))]
5671#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5672#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5673pub const fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5674    unsafe {
5675        let mut fmadd: f16 = 0.0;
5676        if k & 1 != 0 {
5677            let extracta: f16 = simd_extract!(a, 0);
5678            let extractb: f16 = simd_extract!(b, 0);
5679            let extractc: f16 = simd_extract!(c, 0);
5680            fmadd = fmaf16(extracta, extractb, extractc);
5681        }
5682        simd_insert!(a, 0, fmadd)
5683    }
5684}
5685
5686/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5687/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5688/// 7 packed elements from a to the upper elements of dst.
5689///
5690/// Rounding is done according to the rounding parameter, which can be one of:
5691///
5692/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5693/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5694/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5695/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5696/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5697///
5698/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
5699#[inline]
5700#[target_feature(enable = "avx512fp16")]
5701#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5702#[rustc_legacy_const_generics(3)]
5703#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5704pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5705    unsafe {
5706        static_assert_rounding!(ROUNDING);
5707        let extracta: f16 = simd_extract!(a, 0);
5708        let extractb: f16 = simd_extract!(b, 0);
5709        let extractc: f16 = simd_extract!(c, 0);
5710        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5711        simd_insert!(a, 0, r)
5712    }
5713}
5714
5715/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5716/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5717/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5718/// upper elements of dst.
5719///
5720/// Rounding is done according to the rounding parameter, which can be one of:
5721///
5722/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5723/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5724/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5725/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5726/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5727///
5728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
5729#[inline]
5730#[target_feature(enable = "avx512fp16")]
5731#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5732#[rustc_legacy_const_generics(4)]
5733#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5734pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
5735    a: __m128h,
5736    k: __mmask8,
5737    b: __m128h,
5738    c: __m128h,
5739) -> __m128h {
5740    unsafe {
5741        static_assert_rounding!(ROUNDING);
5742        let mut fmadd: f16 = simd_extract!(a, 0);
5743        if k & 1 != 0 {
5744            let extractb: f16 = simd_extract!(b, 0);
5745            let extractc: f16 = simd_extract!(c, 0);
5746            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
5747        }
5748        simd_insert!(a, 0, fmadd)
5749    }
5750}
5751
5752/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5753/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5754/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5755/// upper elements of dst.
5756///
5757/// Rounding is done according to the rounding parameter, which can be one of:
5758///
5759/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5760/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5761/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5762/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5763/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5764///
5765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
5766#[inline]
5767#[target_feature(enable = "avx512fp16")]
5768#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5769#[rustc_legacy_const_generics(4)]
5770#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5771pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
5772    a: __m128h,
5773    b: __m128h,
5774    c: __m128h,
5775    k: __mmask8,
5776) -> __m128h {
5777    unsafe {
5778        static_assert_rounding!(ROUNDING);
5779        let mut fmadd: f16 = simd_extract!(c, 0);
5780        if k & 1 != 0 {
5781            let extracta: f16 = simd_extract!(a, 0);
5782            let extractb: f16 = simd_extract!(b, 0);
5783            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
5784        }
5785        simd_insert!(c, 0, fmadd)
5786    }
5787}
5788
5789/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5790/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5791/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5792/// upper elements of dst.
5793///
5794/// Rounding is done according to the rounding parameter, which can be one of:
5795///
5796/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5797/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5798/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5799/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5800/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5801///
5802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
5803#[inline]
5804#[target_feature(enable = "avx512fp16")]
5805#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5806#[rustc_legacy_const_generics(4)]
5807#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5808pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
5809    k: __mmask8,
5810    a: __m128h,
5811    b: __m128h,
5812    c: __m128h,
5813) -> __m128h {
5814    unsafe {
5815        static_assert_rounding!(ROUNDING);
5816        let mut fmadd: f16 = 0.0;
5817        if k & 1 != 0 {
5818            let extracta: f16 = simd_extract!(a, 0);
5819            let extractb: f16 = simd_extract!(b, 0);
5820            let extractc: f16 = simd_extract!(c, 0);
5821            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5822        }
5823        simd_insert!(a, 0, fmadd)
5824    }
5825}
5826
5827/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5828/// in c from the intermediate result, and store the results in dst.
5829/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5830///
5831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
5832#[inline]
5833#[target_feature(enable = "avx512fp16,avx512vl")]
5834#[cfg_attr(test, assert_instr(vfmsub))]
5835#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5836#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5837pub const fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5838    unsafe { simd_fma(a, b, simd_neg(c)) }
5839}
5840
5841/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5842/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5843/// from a when the corresponding mask bit is not set).
5844///
5845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
5846#[inline]
5847#[target_feature(enable = "avx512fp16,avx512vl")]
5848#[cfg_attr(test, assert_instr(vfmsub))]
5849#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5850#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5851pub const fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5852    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
5853}
5854
5855/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5856/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5857/// from c when the corresponding mask bit is not set).
5858///
5859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
5860#[inline]
5861#[target_feature(enable = "avx512fp16,avx512vl")]
5862#[cfg_attr(test, assert_instr(vfmsub))]
5863#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5864#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5865pub const fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5866    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
5867}
5868
5869/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5870/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5871/// out when the corresponding mask bit is not set).
5872///
5873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
5874#[inline]
5875#[target_feature(enable = "avx512fp16,avx512vl")]
5876#[cfg_attr(test, assert_instr(vfmsub))]
5877#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5878#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5879pub const fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5880    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
5881}
5882
5883/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5884/// in c from the intermediate result, and store the results in dst.
5885///
5886/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
5887#[inline]
5888#[target_feature(enable = "avx512fp16,avx512vl")]
5889#[cfg_attr(test, assert_instr(vfmsub))]
5890#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5891#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5892pub const fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5893    unsafe { simd_fma(a, b, simd_neg(c)) }
5894}
5895
5896/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5897/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5898/// from a when the corresponding mask bit is not set).
5899///
5900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
5901#[inline]
5902#[target_feature(enable = "avx512fp16,avx512vl")]
5903#[cfg_attr(test, assert_instr(vfmsub))]
5904#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5905#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5906pub const fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5907    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
5908}
5909
5910/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5911/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5912/// from c when the corresponding mask bit is not set).
5913///
5914/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
5915#[inline]
5916#[target_feature(enable = "avx512fp16,avx512vl")]
5917#[cfg_attr(test, assert_instr(vfmsub))]
5918#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
5919#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5920pub const fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5921    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
5922}
5923
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Lanes with the mask bit set take `a * b - c`; the rest are zeroed.
    // SAFETY: lanewise SIMD select on 16-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
}
5937
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Fused multiply-subtract `a * b - c`, expressed as fma(a, b, -c) so the
    // whole operation rounds once.
    // SAFETY: lanewise SIMD ops on 32-lane f16 vectors; the `avx512fp16`
    // target feature is enabled by the attribute above.
    unsafe { simd_fma(a, b, simd_neg(c)) }
}
5950
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Lanes with the mask bit set take `a * b - c`; the rest fall back to `a`.
    // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is
    // enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
}
5964
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Lanes with the mask bit set take `a * b - c`; the rest fall back to `c`.
    // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is
    // enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
}
5978
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Lanes with the mask bit set take `a * b - c`; the rest are zeroed.
    // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is
    // enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
}
5992
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Reject illegal rounding immediates at compile time before they reach
        // the underlying LLVM intrinsic.
        static_assert_rounding!(ROUNDING);
        // fmsub is fma with the addend negated: a * b - c under ROUNDING.
        // SAFETY: ROUNDING is a validated immediate and `avx512fp16` is enabled above.
        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
    }
}
6016
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Validate the rounding immediate at compile time.
        static_assert_rounding!(ROUNDING);
        // Masked lanes take the rounded fmsub result; the rest fall back to `a`.
        // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is enabled above.
        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
    }
}
6046
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        // Validate the rounding immediate at compile time.
        static_assert_rounding!(ROUNDING);
        // Masked lanes take the rounded fmsub result; the rest fall back to `c`.
        // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is enabled above.
        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
    }
}
6076
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Validate the rounding immediate at compile time.
        static_assert_rounding!(ROUNDING);
        // Masked lanes take the rounded fmsub result; the rest are zeroed.
        // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is enabled above.
        simd_select_bitmask(
            k,
            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
6110
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; `avx512fp16`
    // is enabled by the attribute above.
    unsafe {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // Fused `a * b - c` on the scalar lane: fma with the addend negated.
        let r = fmaf16(extracta, extractb, -extractc);
        // Upper lanes of `a` pass through unchanged.
        simd_insert!(a, 0, r)
    }
}
6130
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; `avx512fp16`
    // is enabled by the attribute above.
    unsafe {
        // Default to `a`'s low lane; only compute fmsub when mask bit 0 is set.
        let mut fmsub: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = fmaf16(fmsub, extractb, -extractc);
        }
        // Upper lanes of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6153
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; `avx512fp16`
    // is enabled by the attribute above.
    unsafe {
        // Default to `c`'s low lane; only compute fmsub when mask bit 0 is set.
        let mut fmsub: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fmsub = fmaf16(extracta, extractb, -fmsub);
        }
        // Upper lanes of `c` pass through unchanged (mask3 variant).
        simd_insert!(c, 0, fmsub)
    }
}
6176
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; `avx512fp16`
    // is enabled by the attribute above.
    unsafe {
        // Zeromask: the low lane is 0.0 unless mask bit 0 is set.
        let mut fmsub: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = fmaf16(extracta, extractb, -extractc);
        }
        // Upper lanes of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6200
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; ROUNDING is
    // validated below; `avx512fp16` is enabled by the attribute above.
    unsafe {
        // Reject illegal rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // Scalar fmsub = fma with the addend negated, under ROUNDING.
        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
        // Upper lanes of `a` pass through unchanged.
        simd_insert!(a, 0, r)
    }
}
6229
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; ROUNDING is
    // validated below; `avx512fp16` is enabled by the attribute above.
    unsafe {
        // Reject illegal rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Default to `a`'s low lane; only compute fmsub when mask bit 0 is set.
        let mut fmsub: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
        }
        // Upper lanes of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6266
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; ROUNDING is
    // validated below; `avx512fp16` is enabled by the attribute above.
    unsafe {
        // Reject illegal rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Default to `c`'s low lane; only compute fmsub when mask bit 0 is set.
        let mut fmsub: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
        }
        // Upper lanes of `c` pass through unchanged (mask3 variant).
        simd_insert!(c, 0, fmsub)
    }
}
6303
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements
/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    // SAFETY: lane index 0 is in bounds for an 8-lane `__m128h`; ROUNDING is
    // validated below; `avx512fp16` is enabled by the attribute above.
    unsafe {
        // Reject illegal rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Zeromask: the low lane is 0.0 unless mask bit 0 is set.
        let mut fmsub: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
        }
        // Upper lanes of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6333
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Fused negated multiply-add: c - a * b, expressed as fma(-a, b, c) so the
    // whole operation rounds once.
    // SAFETY: lanewise SIMD ops on 8-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_fma(simd_neg(a), b, c) }
}
6346
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Lanes with the mask bit set take `c - a * b`; the rest fall back to `a`.
    // SAFETY: lanewise SIMD select on 8-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
}
6360
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Lanes with the mask bit set take `c - a * b`; the rest fall back to `c`.
    // SAFETY: lanewise SIMD select on 8-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
}
6374
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Lanes with the mask bit set take `c - a * b`; the rest are zeroed.
    // SAFETY: lanewise SIMD select on 8-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
}
6388
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Fused negated multiply-add: c - a * b, expressed as fma(-a, b, c) so the
    // whole operation rounds once.
    // SAFETY: lanewise SIMD ops on 16-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_fma(simd_neg(a), b, c) }
}
6401
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Lanes with the mask bit set take `c - a * b`; the rest fall back to `a`.
    // SAFETY: lanewise SIMD select on 16-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
}
6415
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // Lanes with the mask bit set take `c - a * b`; the rest fall back to `c`.
    // SAFETY: lanewise SIMD select on 16-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
}
6429
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Lanes with the mask bit set take `c - a * b`; the rest are zeroed.
    // SAFETY: lanewise SIMD select on 16-lane f16 vectors; the required
    // `avx512fp16,avx512vl` target features are enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
}
6443
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Fused negated multiply-add: c - a * b, expressed as fma(-a, b, c) so the
    // whole operation rounds once.
    // SAFETY: lanewise SIMD ops on 32-lane f16 vectors; `avx512fp16` is
    // enabled by the attribute above.
    unsafe { simd_fma(simd_neg(a), b, c) }
}
6456
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Lanes with the mask bit set take `c - a * b`; the rest fall back to `a`.
    // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is
    // enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
}
6470
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Lanes with the mask bit set take `c - a * b`; the rest fall back to `c`.
    // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is
    // enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
}
6484
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Lanes with the mask bit set take `c - a * b`; the rest are zeroed.
    // SAFETY: lanewise SIMD select on 32-lane f16 vectors; `avx512fp16` is
    // enabled by the attribute above.
    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
}
6498
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Reject illegal rounding immediates at compile time before they reach
        // the underlying LLVM intrinsic.
        static_assert_rounding!(ROUNDING);
        // fnmadd is fma with the first factor negated: c - a * b under ROUNDING.
        // SAFETY: ROUNDING is a validated immediate and `avx512fp16` is enabled above.
        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
    }
}
6522
6523/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6524/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6525/// from a when the corresponding mask bit is not set).
6526///
6527/// Rounding is done according to the rounding parameter, which can be one of:
6528///
6529/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6530/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6531/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6532/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6533/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6534///
6535/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Merge-masking: lanes whose bit in `k` is clear keep the value from `a`.
        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
    }
}
6552
6553/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6554/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6555/// from c when the corresponding mask bit is not set).
6556///
6557/// Rounding is done according to the rounding parameter, which can be one of:
6558///
6559/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6560/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6561/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6562/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6563/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6564///
6565/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Merge-masking (mask3 form): lanes whose bit in `k` is clear keep the value from `c`.
        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
    }
}
6582
6583/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6584/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6585/// out when the corresponding mask bit is not set).
6586///
6587/// Rounding is done according to the rounding parameter, which can be one of:
6588///
6589/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6590/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6591/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6592/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6593/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6594///
6595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Zero-masking: lanes whose bit in `k` is clear become 0.0.
        simd_select_bitmask(
            k,
            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
6616
6617/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6618/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6619/// elements from a to the upper elements of dst.
6620///
6621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // Scalar fnmadd on lane 0: -(a0 * b0) + c0 as a single fused operation.
        let r = fmaf16(-extracta, extractb, extractc);
        // The result replaces lane 0 of `a`; lanes 1..=7 of `a` pass through unchanged.
        simd_insert!(a, 0, r)
    }
}
6636
6637/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6638/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6639/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6640/// elements of dst.
6641///
6642/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Default to a0 so that a clear mask bit 0 merges the original value from `a`.
        let mut fnmadd: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a0 * b0) + c0; `fnmadd` still holds a0 here, so it serves as the multiplicand.
            fnmadd = fmaf16(-fnmadd, extractb, extractc);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6659
6660/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6661/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6662/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6663/// elements of dst.
6664///
6665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // Default to c0 so that a clear mask bit 0 merges the original value from `c`.
        let mut fnmadd: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            // -(a0 * b0) + c0; `fnmadd` still holds c0 here, so it serves as the addend.
            fnmadd = fmaf16(-extracta, extractb, fnmadd);
        }
        // mask3 form: the upper lanes come from `c`, not `a`.
        simd_insert!(c, 0, fnmadd)
    }
}
6682
6683/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6684/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6685/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6686/// elements of dst.
6687///
6688/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Zero-masking: lane 0 stays 0.0 unless mask bit 0 is set.
        let mut fnmadd: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a0 * b0) + c0 as a single fused operation.
            fnmadd = fmaf16(-extracta, extractb, extractc);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6706
6707/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6708/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6709/// elements from a to the upper elements of dst.
6710///
6711/// Rounding is done according to the rounding parameter, which can be one of:
6712///
6713/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6714/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6715/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6716/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6717/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6718///
6719/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Reject rounding constants other than the documented combinations at compile time.
        static_assert_rounding!(ROUNDING);
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // -(a0 * b0) + c0, rounded per ROUNDING by the scalar FMA intrinsic.
        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
        simd_insert!(a, 0, r)
    }
}
6735
6736/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6737/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6738/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6739/// elements of dst.
6740///
6741/// Rounding is done according to the rounding parameter, which can be one of:
6742///
6743/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6744/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6745/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6746/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6747/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6748///
6749/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Default to a0 so that a clear mask bit 0 merges the original value from `a`.
        let mut fnmadd: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a0 * b0) + c0 with explicit rounding; `fnmadd` still holds a0 here.
            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6772
6773/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6774/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6775/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6776/// elements of dst.
6777///
6778/// Rounding is done according to the rounding parameter, which can be one of:
6779///
6780/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6781/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6782/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6783/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6784/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6785///
6786/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Default to c0 so that a clear mask bit 0 merges the original value from `c`.
        let mut fnmadd: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            // -(a0 * b0) + c0 with explicit rounding; `fnmadd` still holds c0 (the addend).
            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
        }
        // mask3 form: the upper lanes come from `c`, not `a`.
        simd_insert!(c, 0, fnmadd)
    }
}
6809
6810/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6811/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6812/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6813/// elements of dst.
6814///
6815/// Rounding is done according to the rounding parameter, which can be one of:
6816///
6817/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6818/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6819/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6820/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6821/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6822///
6823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Zero-masking: lane 0 stays 0.0 unless mask bit 0 is set.
        let mut fnmadd: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a0 * b0) + c0 with explicit rounding.
            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmadd)
    }
}
6847
6848/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6849/// in c from the negated intermediate result, and store the results in dst.
6850///
6851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // fnmsub = -(a * b) - c, expressed as fma(-a, b, -c).
    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
}
6860
6861/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6862/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6863/// copied from a when the corresponding mask bit is not set).
6864///
6865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Merge-masking: lanes whose bit in `k` is clear keep the value from `a`.
    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
}
6874
6875/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6876/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6877/// copied from c when the corresponding mask bit is not set).
6878///
6879/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Merge-masking (mask3 form): lanes whose bit in `k` is clear keep the value from `c`.
    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
}
6888
6889/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6890/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6891/// zeroed out when the corresponding mask bit is not set).
6892///
6893/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Zero-masking: lanes whose bit in `k` is clear become 0.0.
    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
}
6902
6903/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6904/// in c from the negated intermediate result, and store the results in dst.
6905///
6906/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // fnmsub = -(a * b) - c, expressed as fma(-a, b, -c).
    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
}
6915
6916/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6917/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6918/// copied from a when the corresponding mask bit is not set).
6919///
6920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Merge-masking: lanes whose bit in `k` is clear keep the value from `a`.
    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
}
6929
6930/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6931/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6932/// copied from c when the corresponding mask bit is not set).
6933///
6934/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // Merge-masking (mask3 form): lanes whose bit in `k` is clear keep the value from `c`.
    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
}
6943
6944/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6945/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6946/// zeroed out when the corresponding mask bit is not set).
6947///
6948/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Zero-masking: lanes whose bit in `k` is clear become 0.0.
    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
}
6957
6958/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6959/// in c from the negated intermediate result, and store the results in dst.
6960///
6961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // fnmsub = -(a * b) - c, expressed as fma(-a, b, -c).
    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
}
6970
6971/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6972/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6973/// copied from a when the corresponding mask bit is not set).
6974///
6975/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Merge-masking: lanes whose bit in `k` is clear keep the value from `a`.
    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
}
6984
6985/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6986/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6987/// copied from c when the corresponding mask bit is not set).
6988///
6989/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Merge-masking (mask3 form): lanes whose bit in `k` is clear keep the value from `c`.
    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
}
6998
6999/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7000/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
7001/// zeroed out when the corresponding mask bit is not set).
7002///
7003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Zero-masking: lanes whose bit in `k` is clear become 0.0.
    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
}
7012
7013/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7014/// in c from the negated intermediate result, and store the results in dst.
7015///
7016/// Rounding is done according to the rounding parameter, which can be one of:
7017///
7018/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7019/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7020/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7021/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7022/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7023///
7024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Reject rounding constants other than the documented combinations at compile time.
        static_assert_rounding!(ROUNDING);
        // fnmsub = -(a * b) - c, expressed by negating both `a` and `c` around the fused multiply-add.
        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
    }
}
7036
7037/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7038/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
7039/// copied from a when the corresponding mask bit is not set).
7040///
7041/// Rounding is done according to the rounding parameter, which can be one of:
7042///
7043/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7044/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7045/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7046/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7047/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7048///
7049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Merge-masking: lanes whose bit in `k` is clear keep the value from `a`.
        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
    }
}
7066
7067/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7068/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
7069/// copied from c when the corresponding mask bit is not set).
7070///
7071/// Rounding is done according to the rounding parameter, which can be one of:
7072///
7073/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7074/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7075/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7076/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7077/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7078///
7079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Merge-masking (mask3 form): lanes whose bit in `k` is clear keep the value from `c`.
        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
    }
}
7096
7097/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7098/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
7099/// zeroed out when the corresponding mask bit is not set).
7100///
7101/// Rounding is done according to the rounding parameter, which can be one of:
7102///
7103/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7104/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7105/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7106/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7107/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7108///
7109/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
7110#[inline]
7111#[target_feature(enable = "avx512fp16")]
7112#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7113#[rustc_legacy_const_generics(4)]
7114#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7115pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
7116    k: __mmask32,
7117    a: __m512h,
7118    b: __m512h,
7119    c: __m512h,
7120) -> __m512h {
7121    unsafe {
7122        static_assert_rounding!(ROUNDING);
7123        simd_select_bitmask(
7124            k,
7125            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
7126            _mm512_setzero_ph(),
7127        )
7128    }
7129}
7130
7131/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7132/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
7133/// elements from a to the upper elements of dst.
7134///
7135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
7136#[inline]
7137#[target_feature(enable = "avx512fp16")]
7138#[cfg_attr(test, assert_instr(vfnmsub))]
7139#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7140#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7141pub const fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7142    unsafe {
7143        let extracta: f16 = simd_extract!(a, 0);
7144        let extractb: f16 = simd_extract!(b, 0);
7145        let extractc: f16 = simd_extract!(c, 0);
7146        let r = fmaf16(-extracta, extractb, -extractc);
7147        simd_insert!(a, 0, r)
7148    }
7149}
7150
7151/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7152/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7153/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7154/// elements of dst.
7155///
7156/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
7157#[inline]
7158#[target_feature(enable = "avx512fp16")]
7159#[cfg_attr(test, assert_instr(vfnmsub))]
7160#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7161#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7162pub const fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7163    unsafe {
7164        let mut fnmsub: f16 = simd_extract!(a, 0);
7165        if k & 1 != 0 {
7166            let extractb: f16 = simd_extract!(b, 0);
7167            let extractc: f16 = simd_extract!(c, 0);
7168            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
7169        }
7170        simd_insert!(a, 0, fnmsub)
7171    }
7172}
7173
7174/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7175/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7176/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
7177/// elements of dst.
7178///
7179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
7180#[inline]
7181#[target_feature(enable = "avx512fp16")]
7182#[cfg_attr(test, assert_instr(vfnmsub))]
7183#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7184#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7185pub const fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7186    unsafe {
7187        let mut fnmsub: f16 = simd_extract!(c, 0);
7188        if k & 1 != 0 {
7189            let extracta: f16 = simd_extract!(a, 0);
7190            let extractb: f16 = simd_extract!(b, 0);
7191            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
7192        }
7193        simd_insert!(c, 0, fnmsub)
7194    }
7195}
7196
7197/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7198/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
7199/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7200/// elements of dst.
7201///
7202/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
7203#[inline]
7204#[target_feature(enable = "avx512fp16")]
7205#[cfg_attr(test, assert_instr(vfnmsub))]
7206#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7207#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7208pub const fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7209    unsafe {
7210        let mut fnmsub: f16 = 0.0;
7211        if k & 1 != 0 {
7212            let extracta: f16 = simd_extract!(a, 0);
7213            let extractb: f16 = simd_extract!(b, 0);
7214            let extractc: f16 = simd_extract!(c, 0);
7215            fnmsub = fmaf16(-extracta, extractb, -extractc);
7216        }
7217        simd_insert!(a, 0, fnmsub)
7218    }
7219}
7220
7221/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7222/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
7223/// elements from a to the upper elements of dst.
7224///
7225/// Rounding is done according to the rounding parameter, which can be one of:
7226///
7227/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7228/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7229/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7230/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7231/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7232///
7233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
7234#[inline]
7235#[target_feature(enable = "avx512fp16")]
7236#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7237#[rustc_legacy_const_generics(3)]
7238#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7239pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7240    unsafe {
7241        static_assert_rounding!(ROUNDING);
7242        let extracta: f16 = simd_extract!(a, 0);
7243        let extractb: f16 = simd_extract!(b, 0);
7244        let extractc: f16 = simd_extract!(c, 0);
7245        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7246        simd_insert!(a, 0, r)
7247    }
7248}
7249
7250/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7251/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7252/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7253/// elements of dst.
7254///
7255/// Rounding is done according to the rounding parameter, which can be one of:
7256///
7257/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7258/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7259/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7260/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7261/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7262///
7263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
7264#[inline]
7265#[target_feature(enable = "avx512fp16")]
7266#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7267#[rustc_legacy_const_generics(4)]
7268#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7269pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
7270    a: __m128h,
7271    k: __mmask8,
7272    b: __m128h,
7273    c: __m128h,
7274) -> __m128h {
7275    unsafe {
7276        static_assert_rounding!(ROUNDING);
7277        let mut fnmsub: f16 = simd_extract!(a, 0);
7278        if k & 1 != 0 {
7279            let extractb: f16 = simd_extract!(b, 0);
7280            let extractc: f16 = simd_extract!(c, 0);
7281            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
7282        }
7283        simd_insert!(a, 0, fnmsub)
7284    }
7285}
7286
7287/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7288/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7289/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
7290/// elements of dst.
7291///
7292/// Rounding is done according to the rounding parameter, which can be one of:
7293///
7294/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7295/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7296/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7297/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7298/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7299///
7300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
7301#[inline]
7302#[target_feature(enable = "avx512fp16")]
7303#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7304#[rustc_legacy_const_generics(4)]
7305#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7306pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
7307    a: __m128h,
7308    b: __m128h,
7309    c: __m128h,
7310    k: __mmask8,
7311) -> __m128h {
7312    unsafe {
7313        static_assert_rounding!(ROUNDING);
7314        let mut fnmsub: f16 = simd_extract!(c, 0);
7315        if k & 1 != 0 {
7316            let extracta: f16 = simd_extract!(a, 0);
7317            let extractb: f16 = simd_extract!(b, 0);
7318            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
7319        }
7320        simd_insert!(c, 0, fnmsub)
7321    }
7322}
7323
7324/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7325/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
7326/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7327/// elements of dst.
7328///
7329/// Rounding is done according to the rounding parameter, which can be one of:
7330///
7331/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7332/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7333/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7334/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7335/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7336///
7337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
7338#[inline]
7339#[target_feature(enable = "avx512fp16")]
7340#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7341#[rustc_legacy_const_generics(4)]
7342#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7343pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
7344    k: __mmask8,
7345    a: __m128h,
7346    b: __m128h,
7347    c: __m128h,
7348) -> __m128h {
7349    unsafe {
7350        static_assert_rounding!(ROUNDING);
7351        let mut fnmsub: f16 = 0.0;
7352        if k & 1 != 0 {
7353            let extracta: f16 = simd_extract!(a, 0);
7354            let extractb: f16 = simd_extract!(b, 0);
7355            let extractc: f16 = simd_extract!(c, 0);
7356            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7357        }
7358        simd_insert!(a, 0, fnmsub)
7359    }
7360}
7361
7362/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7363/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7364///
7365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
7366#[inline]
7367#[target_feature(enable = "avx512fp16,avx512vl")]
7368#[cfg_attr(test, assert_instr(vfmaddsub))]
7369#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7370#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7371pub const fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7372    unsafe {
7373        let add = simd_fma(a, b, c);
7374        let sub = simd_fma(a, b, simd_neg(c));
7375        simd_shuffle!(sub, add, [0, 9, 2, 11, 4, 13, 6, 15])
7376    }
7377}
7378
7379/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7380/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7381/// (the element is copied from a when the corresponding mask bit is not set).
7382///
7383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
7384#[inline]
7385#[target_feature(enable = "avx512fp16,avx512vl")]
7386#[cfg_attr(test, assert_instr(vfmaddsub))]
7387#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7388#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7389pub const fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7390    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
7391}
7392
7393/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7394/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7395/// (the element is copied from c when the corresponding mask bit is not set).
7396///
7397/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
7398#[inline]
7399#[target_feature(enable = "avx512fp16,avx512vl")]
7400#[cfg_attr(test, assert_instr(vfmaddsub))]
7401#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7402#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7403pub const fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7404    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
7405}
7406
7407/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7408/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7409/// (the element is zeroed out when the corresponding mask bit is not set).
7410///
7411/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
7412#[inline]
7413#[target_feature(enable = "avx512fp16,avx512vl")]
7414#[cfg_attr(test, assert_instr(vfmaddsub))]
7415#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7416#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7417pub const fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7418    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
7419}
7420
7421/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7422/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7423///
7424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
7425#[inline]
7426#[target_feature(enable = "avx512fp16,avx512vl")]
7427#[cfg_attr(test, assert_instr(vfmaddsub))]
7428#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7429#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7430pub const fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7431    unsafe {
7432        let add = simd_fma(a, b, c);
7433        let sub = simd_fma(a, b, simd_neg(c));
7434        simd_shuffle!(
7435            sub,
7436            add,
7437            [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31]
7438        )
7439    }
7440}
7441
7442/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7443/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7444/// (the element is copied from a when the corresponding mask bit is not set).
7445///
7446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
7447#[inline]
7448#[target_feature(enable = "avx512fp16,avx512vl")]
7449#[cfg_attr(test, assert_instr(vfmaddsub))]
7450#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7451#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7452pub const fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7453    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
7454}
7455
7456/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7457/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7458/// (the element is copied from c when the corresponding mask bit is not set).
7459///
7460/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
7461#[inline]
7462#[target_feature(enable = "avx512fp16,avx512vl")]
7463#[cfg_attr(test, assert_instr(vfmaddsub))]
7464#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7465#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7466pub const fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7467    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
7468}
7469
7470/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7471/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7472/// (the element is zeroed out when the corresponding mask bit is not set).
7473///
7474/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
7475#[inline]
7476#[target_feature(enable = "avx512fp16,avx512vl")]
7477#[cfg_attr(test, assert_instr(vfmaddsub))]
7478#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7479#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7480pub const fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7481    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
7482}
7483
7484/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7485/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7486///
7487/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
7488#[inline]
7489#[target_feature(enable = "avx512fp16")]
7490#[cfg_attr(test, assert_instr(vfmaddsub))]
7491#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7492#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7493pub const fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7494    unsafe {
7495        let add = simd_fma(a, b, c);
7496        let sub = simd_fma(a, b, simd_neg(c));
7497        simd_shuffle!(
7498            sub,
7499            add,
7500            [
7501                0, 33, 2, 35, 4, 37, 6, 39, 8, 41, 10, 43, 12, 45, 14, 47, 16, 49, 18, 51, 20, 53,
7502                22, 55, 24, 57, 26, 59, 28, 61, 30, 63
7503            ]
7504        )
7505    }
7506}
7507
7508/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7509/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7510/// (the element is copied from a when the corresponding mask bit is not set).
7511///
7512/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
7513#[inline]
7514#[target_feature(enable = "avx512fp16")]
7515#[cfg_attr(test, assert_instr(vfmaddsub))]
7516#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7517#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7518pub const fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7519    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
7520}
7521
7522/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7523/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7524/// (the element is copied from c when the corresponding mask bit is not set).
7525///
7526/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
7527#[inline]
7528#[target_feature(enable = "avx512fp16")]
7529#[cfg_attr(test, assert_instr(vfmaddsub))]
7530#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7531#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7532pub const fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7533    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
7534}
7535
7536/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7537/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7538/// (the element is zeroed out when the corresponding mask bit is not set).
7539///
7540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
7541#[inline]
7542#[target_feature(enable = "avx512fp16")]
7543#[cfg_attr(test, assert_instr(vfmaddsub))]
7544#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7545#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7546pub const fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7547    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
7548}
7549
7550/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7551/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7552///
7553/// Rounding is done according to the rounding parameter, which can be one of:
7554///
7555/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7556/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7557/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7558/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7559/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7560///
7561/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject any ROUNDING value that is not a valid rounding-mode constant
        // at compile time, then forward to the backend fmaddsub intrinsic,
        // which takes the rounding mode as its final operand.
        static_assert_rounding!(ROUNDING);
        vfmaddsubph_512(a, b, c, ROUNDING)
    }
}
7577
7578/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7579/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7580/// (the element is copied from a when the corresponding mask bit is not set).
7581///
7582/// Rounding is done according to the rounding parameter, which can be one of:
7583///
7584/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7585/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7586/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7587/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7588/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7589///
7590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
7591#[inline]
7592#[target_feature(enable = "avx512fp16")]
7593#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7594#[rustc_legacy_const_generics(4)]
7595#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7596pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
7597    a: __m512h,
7598    k: __mmask32,
7599    b: __m512h,
7600    c: __m512h,
7601) -> __m512h {
7602    unsafe {
7603        static_assert_rounding!(ROUNDING);
7604        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
7605    }
7606}
7607
7608/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7609/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7610/// (the element is copied from c when the corresponding mask bit is not set).
7611///
7612/// Rounding is done according to the rounding parameter, which can be one of:
7613///
7614/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7615/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7616/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7617/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7618/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7619///
7620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
7621#[inline]
7622#[target_feature(enable = "avx512fp16")]
7623#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7624#[rustc_legacy_const_generics(4)]
7625#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7626pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
7627    a: __m512h,
7628    b: __m512h,
7629    c: __m512h,
7630    k: __mmask32,
7631) -> __m512h {
7632    unsafe {
7633        static_assert_rounding!(ROUNDING);
7634        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
7635    }
7636}
7637
7638/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7639/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7640/// (the element is zeroed out when the corresponding mask bit is not set).
7641///
7642/// Rounding is done according to the rounding parameter, which can be one of:
7643///
7644/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7645/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7646/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7647/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7648/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7649///
7650/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
7651#[inline]
7652#[target_feature(enable = "avx512fp16")]
7653#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7654#[rustc_legacy_const_generics(4)]
7655#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7656pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
7657    k: __mmask32,
7658    a: __m512h,
7659    b: __m512h,
7660    c: __m512h,
7661) -> __m512h {
7662    unsafe {
7663        static_assert_rounding!(ROUNDING);
7664        simd_select_bitmask(
7665            k,
7666            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
7667            _mm512_setzero_ph(),
7668        )
7669    }
7670}
7671
7672/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7673/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7674///
7675/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
7676#[inline]
7677#[target_feature(enable = "avx512fp16,avx512vl")]
7678#[cfg_attr(test, assert_instr(vfmsubadd))]
7679#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7680#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7681pub const fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7682    _mm_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
7683}
7684
7685/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7686/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7687/// (the element is copied from a when the corresponding mask bit is not set).
7688///
7689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
7690#[inline]
7691#[target_feature(enable = "avx512fp16,avx512vl")]
7692#[cfg_attr(test, assert_instr(vfmsubadd))]
7693#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7694#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7695pub const fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7696    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
7697}
7698
7699/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7700/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7701/// (the element is copied from c when the corresponding mask bit is not set).
7702///
7703/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7704#[inline]
7705#[target_feature(enable = "avx512fp16,avx512vl")]
7706#[cfg_attr(test, assert_instr(vfmsubadd))]
7707#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7708#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7709pub const fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7710    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
7711}
7712
7713/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7714/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7715/// (the element is zeroed out when the corresponding mask bit is not set).
7716///
7717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
7718#[inline]
7719#[target_feature(enable = "avx512fp16,avx512vl")]
7720#[cfg_attr(test, assert_instr(vfmsubadd))]
7721#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
7722#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7723pub const fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7724    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
7725}
7726
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // fmsubadd is fmaddsub with `c` negated lane-wise (sub/add instead of add/sub).
    _mm256_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Unmasked result where the mask bit is set, lane of `a` otherwise.
    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // Unmasked result where the mask bit is set, lane of `c` otherwise.
    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Unmasked result where the mask bit is set, zero otherwise.
    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
}
7781
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // fmsubadd is fmaddsub with `c` negated lane-wise (sub/add instead of add/sub).
    _mm512_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Unmasked result where the mask bit is set, lane of `a` otherwise.
    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Unmasked result where the mask bit is set, lane of `c` otherwise.
    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Unmasked result where the mask bit is set, zero otherwise.
    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
}
7836
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // fmsubadd == fmaddsub with `c` negated; forward to the fmaddsub
        // intrinsic together with the explicit rounding mode.
        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
    }
}
7864
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Unmasked rounded result where the mask bit is set, lane of `a` otherwise.
        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
    }
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Unmasked rounded result where the mask bit is set, lane of `c` otherwise.
        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
    }
}
7924
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Unmasked rounded result where the mask bit is set, zero otherwise.
        simd_select_bitmask(
            k,
            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
7958
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
    // All-ones mask: every lane is written, so the undefined source vector is
    // never observed in the result.
    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
}

/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // The merge with `src` under mask `k` is handled by the intrinsic itself.
    unsafe { vrcpph_128(a, src, k) }
}

/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
    // A zero source vector makes masked-off lanes come out as zero.
    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
}
7996
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
    // All-ones mask: every lane is written, so the undefined source vector is
    // never observed in the result.
    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
}

/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // The merge with `src` under mask `k` is handled by the intrinsic itself.
    unsafe { vrcpph_256(a, src, k) }
}

/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
    // A zero source vector makes masked-off lanes come out as zero.
    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
}
8034
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
    // All-ones mask: every lane is written, so the undefined source vector is
    // never observed in the result.
    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // The merge with `src` under mask `k` is handled by the intrinsic itself.
    unsafe { vrcpph_512(a, src, k) }
}

/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
    // A zero source vector makes masked-off lanes come out as zero.
    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
}
8072
/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
/// upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask: the lower lane is always written, so the zero source is
    // never observed in the result.
    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}

/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst using writemask k (the element is copied from src when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // The lower-lane merge with `src` under mask `k` is handled by the intrinsic.
    unsafe { vrcpsh(a, b, src, k) }
}

/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // A zero source makes a masked-off lower lane come out as zero.
    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8114
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
    // All-ones mask: every lane is written, so the undefined source vector is
    // never observed in the result.
    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // The merge with `src` under mask `k` is handled by the intrinsic itself.
    unsafe { vrsqrtph_128(a, src, k) }
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
    // A zero source vector makes masked-off lanes come out as zero.
    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
}
8155
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
    // All-ones mask: every lane is written, so the undefined source vector is
    // never observed in the result.
    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // The merge with `src` under mask `k` is handled by the intrinsic itself.
    unsafe { vrsqrtph_256(a, src, k) }
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
    // A zero source vector makes masked-off lanes come out as zero.
    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
}
8196
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
    // All-ones mask: every lane is written, so the undefined source vector is
    // never observed in the result.
    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // The merge with `src` under mask `k` is handled by the intrinsic itself.
    unsafe { vrsqrtph_512(a, src, k) }
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
    // A zero source vector makes masked-off lanes come out as zero.
    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
}
8237
/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
/// to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask: the lower lane is always written, so the zero source is
    // never observed in the result.
    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}

/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // The lower-lane merge with `src` under mask `k` is handled by the intrinsic.
    unsafe { vrsqrtsh(a, b, src, k) }
}

/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // A zero source makes a masked-off lower lane come out as zero.
    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8279
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
    // Lane-wise square root through the generic SIMD intrinsic; lowering to a
    // single `vsqrtph` is checked by the `assert_instr` attribute above.
    unsafe { simd_fsqrt(a) }
}
8291
8292/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8293/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8294///
8295/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
8296#[inline]
8297#[target_feature(enable = "avx512fp16,avx512vl")]
8298#[cfg_attr(test, assert_instr(vsqrtph))]
8299#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8300pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8301    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
8302}
8303
8304/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8305/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8306///
8307/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
8308#[inline]
8309#[target_feature(enable = "avx512fp16,avx512vl")]
8310#[cfg_attr(test, assert_instr(vsqrtph))]
8311#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8312pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
8313    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
8314}
8315
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
    // Lane-wise square root through the generic SIMD intrinsic; lowering to a
    // single `vsqrtph` is checked by the `assert_instr` attribute above.
    unsafe { simd_fsqrt(a) }
}
8327
8328/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8329/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8330///
8331/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
8332#[inline]
8333#[target_feature(enable = "avx512fp16,avx512vl")]
8334#[cfg_attr(test, assert_instr(vsqrtph))]
8335#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8336pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8337    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
8338}
8339
8340/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8341/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8342///
8343/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
8344#[inline]
8345#[target_feature(enable = "avx512fp16,avx512vl")]
8346#[cfg_attr(test, assert_instr(vsqrtph))]
8347#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8348pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
8349    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
8350}
8351
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
    // Lane-wise square root through the generic SIMD intrinsic; lowering to a
    // single `vsqrtph` is checked by the `assert_instr` attribute above.
    unsafe { simd_fsqrt(a) }
}
8363
8364/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8365/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8366///
8367/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
8368#[inline]
8369#[target_feature(enable = "avx512fp16")]
8370#[cfg_attr(test, assert_instr(vsqrtph))]
8371#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8372pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8373    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
8374}
8375
8376/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8377/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8378///
8379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
8380#[inline]
8381#[target_feature(enable = "avx512fp16")]
8382#[cfg_attr(test, assert_instr(vsqrtph))]
8383#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8384pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8385    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
8386}
8387
8388/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8389/// results in dst.
8390/// Rounding is done according to the rounding parameter, which can be one of:
8391///
8392/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8393/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8394/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8395/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8396/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8397///
8398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
8399#[inline]
8400#[target_feature(enable = "avx512fp16")]
8401#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8402#[rustc_legacy_const_generics(1)]
8403#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8404pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
8405    unsafe {
8406        static_assert_rounding!(ROUNDING);
8407        vsqrtph_512(a, ROUNDING)
8408    }
8409}
8410
8411/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8412/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8413/// Rounding is done according to the rounding parameter, which can be one of:
8414///
8415/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8416/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8417/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8418/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8419/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8420///
8421/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
8422#[inline]
8423#[target_feature(enable = "avx512fp16")]
8424#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8425#[rustc_legacy_const_generics(3)]
8426#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8427pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
8428    src: __m512h,
8429    k: __mmask32,
8430    a: __m512h,
8431) -> __m512h {
8432    unsafe {
8433        static_assert_rounding!(ROUNDING);
8434        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
8435    }
8436}
8437
8438/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8439/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8440/// Rounding is done according to the rounding parameter, which can be one of:
8441///
8442/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8443/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8444/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8445/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8446/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8447///
8448/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
8449#[inline]
8450#[target_feature(enable = "avx512fp16")]
8451#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8452#[rustc_legacy_const_generics(2)]
8453#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8454pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
8455    unsafe {
8456        static_assert_rounding!(ROUNDING);
8457        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
8458    }
8459}
8460
8461/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8462/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8463/// elements of dst.
8464///
8465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
8466#[inline]
8467#[target_feature(enable = "avx512fp16")]
8468#[cfg_attr(test, assert_instr(vsqrtsh))]
8469#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8470pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8471    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
8472}
8473
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask
/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Forward to the rounding-aware variant; `_MM_FROUND_CUR_DIRECTION`
    // rounds according to the current MXCSR rounding mode.
    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
8486
8487/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8488/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8489/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8490///
8491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
8492#[inline]
8493#[target_feature(enable = "avx512fp16")]
8494#[cfg_attr(test, assert_instr(vsqrtsh))]
8495#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8496pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8497    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
8498}
8499
8500/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8501/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8502/// elements of dst.
8503/// Rounding is done according to the rounding parameter, which can be one of:
8504///
8505/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8506/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8507/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8508/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8509/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8510///
8511/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
8512#[inline]
8513#[target_feature(enable = "avx512fp16")]
8514#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8515#[rustc_legacy_const_generics(2)]
8516#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8517pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
8518    static_assert_rounding!(ROUNDING);
8519    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
8520}
8521
8522/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8523/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8524/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8525/// Rounding is done according to the rounding parameter, which can be one of:
8526///
8527/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8528/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8529/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8530/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8531/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8532///
8533/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
8534#[inline]
8535#[target_feature(enable = "avx512fp16")]
8536#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8537#[rustc_legacy_const_generics(4)]
8538#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8539pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
8540    src: __m128h,
8541    k: __mmask8,
8542    a: __m128h,
8543    b: __m128h,
8544) -> __m128h {
8545    unsafe {
8546        static_assert_rounding!(ROUNDING);
8547        vsqrtsh(a, b, src, k, ROUNDING)
8548    }
8549}
8550
8551/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8552/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8553/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8554/// Rounding is done according to the rounding parameter, which can be one of:
8555///
8556/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8557/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8558/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8559/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8560/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8561///
8562/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
8563#[inline]
8564#[target_feature(enable = "avx512fp16")]
8565#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8566#[rustc_legacy_const_generics(3)]
8567#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8568pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
8569    k: __mmask8,
8570    a: __m128h,
8571    b: __m128h,
8572) -> __m128h {
8573    static_assert_rounding!(ROUNDING);
8574    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
8575}
8576
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the `vmaxph` compiler intrinsic (128-bit form); NaN and
    // signed-zero handling follows the instruction, not IEEE 754 (see above).
    unsafe { vmaxph_128(a, b) }
}
8589
8590/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8591/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8592/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8593/// NaN or signed-zero values.
8594///
8595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
8596#[inline]
8597#[target_feature(enable = "avx512fp16,avx512vl")]
8598#[cfg_attr(test, assert_instr(vmaxph))]
8599#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8600pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8601    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
8602}
8603
8604/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8605/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8606/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8607/// NaN or signed-zero values.
8608///
8609/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
8610#[inline]
8611#[target_feature(enable = "avx512fp16,avx512vl")]
8612#[cfg_attr(test, assert_instr(vmaxph))]
8613#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8614pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8615    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
8616}
8617
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
    // Delegate to the `vmaxph` compiler intrinsic (256-bit form); NaN and
    // signed-zero handling follows the instruction, not IEEE 754 (see above).
    unsafe { vmaxph_256(a, b) }
}
8630
8631/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8632/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8633/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8634/// NaN or signed-zero values.
8635///
8636/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
8637#[inline]
8638#[target_feature(enable = "avx512fp16,avx512vl")]
8639#[cfg_attr(test, assert_instr(vmaxph))]
8640#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8641pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8642    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
8643}
8644
8645/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8646/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8647/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8648/// NaN or signed-zero values.
8649///
8650/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
8651#[inline]
8652#[target_feature(enable = "avx512fp16,avx512vl")]
8653#[cfg_attr(test, assert_instr(vmaxph))]
8654#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8655pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8656    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
8657}
8658
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
    // Reuse the SAE-aware variant; `_MM_FROUND_CUR_DIRECTION` means
    // exceptions are not suppressed (no SAE).
    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
}
8671
8672/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8673/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8674/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8675/// NaN or signed-zero values.
8676///
8677/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
8678#[inline]
8679#[target_feature(enable = "avx512fp16")]
8680#[cfg_attr(test, assert_instr(vmaxph))]
8681#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8682pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8683    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
8684}
8685
8686/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8687/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8688/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8689/// NaN or signed-zero values.
8690///
8691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
8692#[inline]
8693#[target_feature(enable = "avx512fp16")]
8694#[cfg_attr(test, assert_instr(vmaxph))]
8695#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8696pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8697    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
8698}
8699
8700/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8701/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8702/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8703/// NaN or signed-zero values.
8704///
8705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
8706#[inline]
8707#[target_feature(enable = "avx512fp16")]
8708#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8709#[rustc_legacy_const_generics(2)]
8710#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8711pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8712    unsafe {
8713        static_assert_sae!(SAE);
8714        vmaxph_512(a, b, SAE)
8715    }
8716}
8717
8718/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8719/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8720/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8721/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8722///
8723/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
8724#[inline]
8725#[target_feature(enable = "avx512fp16")]
8726#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8727#[rustc_legacy_const_generics(4)]
8728#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8729pub fn _mm512_mask_max_round_ph<const SAE: i32>(
8730    src: __m512h,
8731    k: __mmask32,
8732    a: __m512h,
8733    b: __m512h,
8734) -> __m512h {
8735    unsafe {
8736        static_assert_sae!(SAE);
8737        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
8738    }
8739}
8740
8741/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8742/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8743/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8744/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8745///
8746/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
8747#[inline]
8748#[target_feature(enable = "avx512fp16")]
8749#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8750#[rustc_legacy_const_generics(3)]
8751#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8752pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8753    unsafe {
8754        static_assert_sae!(SAE);
8755        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8756    }
8757}
8758
8759/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8760/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8761/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
8762/// when inputs are NaN or signed-zero values.
8763///
8764/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
8765#[inline]
8766#[target_feature(enable = "avx512fp16,avx512vl")]
8767#[cfg_attr(test, assert_instr(vmaxsh))]
8768#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8769pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
8770    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
8771}
8772
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Forward to the SAE-aware variant; `_MM_FROUND_CUR_DIRECTION` means
    // exceptions are not suppressed (no SAE).
    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
8786
8787/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8788/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8789/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8790/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8791///
8792/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
8793#[inline]
8794#[target_feature(enable = "avx512fp16,avx512vl")]
8795#[cfg_attr(test, assert_instr(vmaxsh))]
8796#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8797pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8798    _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b)
8799}
8800
8801/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8802/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8803/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8804/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8805///
8806/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
8807#[inline]
8808#[target_feature(enable = "avx512fp16,avx512vl")]
8809#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8810#[rustc_legacy_const_generics(2)]
8811#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8812pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8813    static_assert_sae!(SAE);
8814    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8815}
8816
8817/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8818/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8819/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8820/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8821/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8822///
8823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
8824#[inline]
8825#[target_feature(enable = "avx512fp16,avx512vl")]
8826#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8827#[rustc_legacy_const_generics(4)]
8828#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8829pub fn _mm_mask_max_round_sh<const SAE: i32>(
8830    src: __m128h,
8831    k: __mmask8,
8832    a: __m128h,
8833    b: __m128h,
8834) -> __m128h {
8835    unsafe {
8836        static_assert_sae!(SAE);
8837        vmaxsh(a, b, src, k, SAE)
8838    }
8839}
8840
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm_mask_max_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
8857
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
    // Direct call into the 128-bit `vminph` LLVM intrinsic (see assert_instr above).
    unsafe { vminph_128(a, b) }
}
8870
8871/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8872/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8873/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8874/// NaN or signed-zero values.
8875///
8876/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
8877#[inline]
8878#[target_feature(enable = "avx512fp16,avx512vl")]
8879#[cfg_attr(test, assert_instr(vminph))]
8880#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8881pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8882    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
8883}
8884
8885/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8886/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8887/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8888/// NaN or signed-zero values.
8889///
8890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
8891#[inline]
8892#[target_feature(enable = "avx512fp16,avx512vl")]
8893#[cfg_attr(test, assert_instr(vminph))]
8894#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8895pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8896    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
8897}
8898
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
    // Direct call into the 256-bit `vminph` LLVM intrinsic (see assert_instr above).
    unsafe { vminph_256(a, b) }
}
8911
8912/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8913/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8914/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8915/// NaN or signed-zero values.
8916///
8917/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
8918#[inline]
8919#[target_feature(enable = "avx512fp16,avx512vl")]
8920#[cfg_attr(test, assert_instr(vminph))]
8921#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8922pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8923    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
8924}
8925
8926/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8927/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8928/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8929/// NaN or signed-zero values.
8930///
8931/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
8932#[inline]
8933#[target_feature(enable = "avx512fp16,avx512vl")]
8934#[cfg_attr(test, assert_instr(vminph))]
8935#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8936pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8937    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
8938}
8939
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
    // Delegate to the round variant with _MM_FROUND_CUR_DIRECTION (default environment).
    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
}
8952
8953/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8954/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8955/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8956/// NaN or signed-zero values.
8957///
8958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
8959#[inline]
8960#[target_feature(enable = "avx512fp16")]
8961#[cfg_attr(test, assert_instr(vminph))]
8962#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8963pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8964    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
8965}
8966
8967/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8968/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8969/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8970/// NaN or signed-zero values.
8971///
8972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
8973#[inline]
8974#[target_feature(enable = "avx512fp16")]
8975#[cfg_attr(test, assert_instr(vminph))]
8976#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8977pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8978    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
8979}
8980
8981/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8982/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
8983/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8984///
8985/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
8986#[inline]
8987#[target_feature(enable = "avx512fp16")]
8988#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8989#[rustc_legacy_const_generics(2)]
8990#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
8991pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8992    unsafe {
8993        static_assert_sae!(SAE);
8994        vminph_512(a, b, SAE)
8995    }
8996}
8997
8998/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8999/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9000/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
9001/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
9002///
9003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
9004#[inline]
9005#[target_feature(enable = "avx512fp16")]
9006#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
9007#[rustc_legacy_const_generics(4)]
9008#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9009pub fn _mm512_mask_min_round_ph<const SAE: i32>(
9010    src: __m512h,
9011    k: __mmask32,
9012    a: __m512h,
9013    b: __m512h,
9014) -> __m512h {
9015    unsafe {
9016        static_assert_sae!(SAE);
9017        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
9018    }
9019}
9020
9021/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
9022/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9023/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
9024/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
9025///
9026/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
9027#[inline]
9028#[target_feature(enable = "avx512fp16")]
9029#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
9030#[rustc_legacy_const_generics(3)]
9031#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9032pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
9033    unsafe {
9034        static_assert_sae!(SAE);
9035        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
9036    }
9037}
9038
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
/// inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask (0xff): the result lane is always written, so the merge
    // source may be left undefined.
    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
}
9052
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the round variant with _MM_FROUND_CUR_DIRECTION (default environment).
    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
9066
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
9080
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    // All-ones mask (0xff): the result lane is always written, so the merge
    // source may be left undefined.
    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
}
9096
9097/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
9098/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9099/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
9100/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
9101/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
9102///
9103/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
9104#[inline]
9105#[target_feature(enable = "avx512fp16,avx512vl")]
9106#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
9107#[rustc_legacy_const_generics(4)]
9108#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9109pub fn _mm_mask_min_round_sh<const SAE: i32>(
9110    src: __m128h,
9111    k: __mmask8,
9112    a: __m128h,
9113    b: __m128h,
9114) -> __m128h {
9115    unsafe {
9116        static_assert_sae!(SAE);
9117        vminsh(a, b, src, k, SAE)
9118    }
9119}
9120
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm_mask_min_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
9137
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
    // All-ones mask (8 lanes): every lane is written, so src may be undefined.
    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
}
9150
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // The intrinsic performs the masked merge with `src` itself.
    unsafe { vgetexpph_128(a, src, k) }
}
9164
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
}
9178
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
    // All-ones mask (16 lanes): every lane is written, so src may be undefined.
    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
}
9191
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // The intrinsic performs the masked merge with `src` itself.
    unsafe { vgetexpph_256(a, src, k) }
}
9205
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
}
9219
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
    // All-ones mask (32 lanes): every lane is written, so src may be undefined.
    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
}
9232
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // Delegate to the round variant with _MM_FROUND_CUR_DIRECTION (default environment).
    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
}
9246
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
}
9260
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
/// by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
    static_assert_sae!(SAE);
    // All-ones mask (32 lanes): every lane is written, so src may be undefined.
    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
9276
9277/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9278/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
9279/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
9280/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9281///
9282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
9283#[inline]
9284#[target_feature(enable = "avx512fp16")]
9285#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9286#[rustc_legacy_const_generics(3)]
9287#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9288pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
9289    src: __m512h,
9290    k: __mmask32,
9291    a: __m512h,
9292) -> __m512h {
9293    unsafe {
9294        static_assert_sae!(SAE);
9295        vgetexpph_512(a, src, k, SAE)
9296    }
9297}
9298
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_sae!(SAE);
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
}
9314
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
/// calculates `floor(log2(x))` for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask: the lower lane is always written; a zeroed vector is used
    // as the (never-read) merge source here rather than an undefined one.
    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
9328
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
/// for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the round variant with _MM_FROUND_CUR_DIRECTION (default environment).
    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
9343
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
/// lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking: an all-zero vector serves as the merge source.
    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
9358
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    // All-ones mask: the lower lane is always written; a zeroed vector is used
    // as the (never-read) merge source here rather than an undefined one.
    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
9375
9376/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9377/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9378/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9379/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9380/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9381///
9382/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
9383#[inline]
9384#[target_feature(enable = "avx512fp16")]
9385#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9386#[rustc_legacy_const_generics(4)]
9387#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9388pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
9389    src: __m128h,
9390    k: __mmask8,
9391    a: __m128h,
9392    b: __m128h,
9393) -> __m128h {
9394    unsafe {
9395        static_assert_sae!(SAE);
9396        vgetexpsh(a, b, src, k, SAE)
9397    }
9398}
9399
9400/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9401/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9402/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9403/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9404/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9405///
9406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9407#[inline]
9408#[target_feature(enable = "avx512fp16")]
9409#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9410#[rustc_legacy_const_generics(3)]
9411#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9412pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9413    static_assert_sae!(SAE);
9414    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9415}
9416
9417/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9418/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9419/// on the interval range defined by norm and the sign depends on sign and the source sign.
9420///
9421/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9422///
9423///     _MM_MANT_NORM_1_2     // interval [1, 2)
9424///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9425///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9426///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9427///
9428/// The sign is determined by sc which can take the following values:
9429///
9430///     _MM_MANT_SIGN_src     // sign = sign(src)
9431///     _MM_MANT_SIGN_zero    // sign = 0
9432///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9433///
9434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
9435#[inline]
9436#[target_feature(enable = "avx512fp16,avx512vl")]
9437#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9438#[rustc_legacy_const_generics(1, 2)]
9439#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9440pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9441    a: __m128h,
9442) -> __m128h {
9443    static_assert_uimm_bits!(NORM, 4);
9444    static_assert_uimm_bits!(SIGN, 2);
9445    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
9446}
9447
9448/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9449/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9450/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9451/// by norm and the sign depends on sign and the source sign.
9452///
9453/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9454///
9455///     _MM_MANT_NORM_1_2     // interval [1, 2)
9456///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9457///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9458///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9459///
9460/// The sign is determined by sc which can take the following values:
9461///
9462///     _MM_MANT_SIGN_src     // sign = sign(src)
9463///     _MM_MANT_SIGN_zero    // sign = 0
9464///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9465///
9466/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
9467#[inline]
9468#[target_feature(enable = "avx512fp16,avx512vl")]
9469#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9470#[rustc_legacy_const_generics(3, 4)]
9471#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9472pub fn _mm_mask_getmant_ph<
9473    const NORM: _MM_MANTISSA_NORM_ENUM,
9474    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9475>(
9476    src: __m128h,
9477    k: __mmask8,
9478    a: __m128h,
9479) -> __m128h {
9480    unsafe {
9481        static_assert_uimm_bits!(NORM, 4);
9482        static_assert_uimm_bits!(SIGN, 2);
9483        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
9484    }
9485}
9486
9487/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9488/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9489/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9490/// by norm and the sign depends on sign and the source sign.
9491///
9492/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9493///
9494///     _MM_MANT_NORM_1_2     // interval [1, 2)
9495///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9496///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9497///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9498///
9499/// The sign is determined by sc which can take the following values:
9500///
9501///     _MM_MANT_SIGN_src     // sign = sign(src)
9502///     _MM_MANT_SIGN_zero    // sign = 0
9503///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9504///
9505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
9506#[inline]
9507#[target_feature(enable = "avx512fp16,avx512vl")]
9508#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9509#[rustc_legacy_const_generics(2, 3)]
9510#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9511pub fn _mm_maskz_getmant_ph<
9512    const NORM: _MM_MANTISSA_NORM_ENUM,
9513    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9514>(
9515    k: __mmask8,
9516    a: __m128h,
9517) -> __m128h {
9518    static_assert_uimm_bits!(NORM, 4);
9519    static_assert_uimm_bits!(SIGN, 2);
9520    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
9521}
9522
9523/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9524/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9525/// on the interval range defined by norm and the sign depends on sign and the source sign.
9526///
9527/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9528///
9529///     _MM_MANT_NORM_1_2     // interval [1, 2)
9530///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9531///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9532///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9533///
9534/// The sign is determined by sc which can take the following values:
9535///
9536///     _MM_MANT_SIGN_src     // sign = sign(src)
9537///     _MM_MANT_SIGN_zero    // sign = 0
9538///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9539///
9540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
9541#[inline]
9542#[target_feature(enable = "avx512fp16,avx512vl")]
9543#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9544#[rustc_legacy_const_generics(1, 2)]
9545#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9546pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9547    a: __m256h,
9548) -> __m256h {
9549    static_assert_uimm_bits!(NORM, 4);
9550    static_assert_uimm_bits!(SIGN, 2);
9551    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
9552}
9553
9554/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9555/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9556/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9557/// by norm and the sign depends on sign and the source sign.
9558///
9559/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9560///
9561///     _MM_MANT_NORM_1_2     // interval [1, 2)
9562///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9563///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9564///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9565///
9566/// The sign is determined by sc which can take the following values:
9567///
9568///     _MM_MANT_SIGN_src     // sign = sign(src)
9569///     _MM_MANT_SIGN_zero    // sign = 0
9570///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9571///
9572/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
9573#[inline]
9574#[target_feature(enable = "avx512fp16,avx512vl")]
9575#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9576#[rustc_legacy_const_generics(3, 4)]
9577#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9578pub fn _mm256_mask_getmant_ph<
9579    const NORM: _MM_MANTISSA_NORM_ENUM,
9580    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9581>(
9582    src: __m256h,
9583    k: __mmask16,
9584    a: __m256h,
9585) -> __m256h {
9586    unsafe {
9587        static_assert_uimm_bits!(NORM, 4);
9588        static_assert_uimm_bits!(SIGN, 2);
9589        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
9590    }
9591}
9592
9593/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9594/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9595/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9596/// by norm and the sign depends on sign and the source sign.
9597///
9598/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9599///
9600///     _MM_MANT_NORM_1_2     // interval [1, 2)
9601///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9602///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9603///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9604///
9605/// The sign is determined by sc which can take the following values:
9606///
9607///     _MM_MANT_SIGN_src     // sign = sign(src)
9608///     _MM_MANT_SIGN_zero    // sign = 0
9609///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9610///
9611/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
9612#[inline]
9613#[target_feature(enable = "avx512fp16,avx512vl")]
9614#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9615#[rustc_legacy_const_generics(2, 3)]
9616#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9617pub fn _mm256_maskz_getmant_ph<
9618    const NORM: _MM_MANTISSA_NORM_ENUM,
9619    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9620>(
9621    k: __mmask16,
9622    a: __m256h,
9623) -> __m256h {
9624    static_assert_uimm_bits!(NORM, 4);
9625    static_assert_uimm_bits!(SIGN, 2);
9626    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
9627}
9628
9629/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9630/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9631/// on the interval range defined by norm and the sign depends on sign and the source sign.
9632///
9633/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9634///
9635///     _MM_MANT_NORM_1_2     // interval [1, 2)
9636///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9637///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9638///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9639///
9640/// The sign is determined by sc which can take the following values:
9641///
9642///     _MM_MANT_SIGN_src     // sign = sign(src)
9643///     _MM_MANT_SIGN_zero    // sign = 0
9644///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9645///
9646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
9647#[inline]
9648#[target_feature(enable = "avx512fp16")]
9649#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9650#[rustc_legacy_const_generics(1, 2)]
9651#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9652pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9653    a: __m512h,
9654) -> __m512h {
9655    static_assert_uimm_bits!(NORM, 4);
9656    static_assert_uimm_bits!(SIGN, 2);
9657    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
9658}
9659
9660/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9661/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9662/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9663/// by norm and the sign depends on sign and the source sign.
9664///
9665/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9666///
9667///     _MM_MANT_NORM_1_2     // interval [1, 2)
9668///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9669///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9670///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9671///
9672/// The sign is determined by sc which can take the following values:
9673///
9674///     _MM_MANT_SIGN_src     // sign = sign(src)
9675///     _MM_MANT_SIGN_zero    // sign = 0
9676///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9677///
9678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
9679#[inline]
9680#[target_feature(enable = "avx512fp16")]
9681#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9682#[rustc_legacy_const_generics(3, 4)]
9683#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9684pub fn _mm512_mask_getmant_ph<
9685    const NORM: _MM_MANTISSA_NORM_ENUM,
9686    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9687>(
9688    src: __m512h,
9689    k: __mmask32,
9690    a: __m512h,
9691) -> __m512h {
9692    static_assert_uimm_bits!(NORM, 4);
9693    static_assert_uimm_bits!(SIGN, 2);
9694    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9695}
9696
9697/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9698/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9699/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9700/// by norm and the sign depends on sign and the source sign.
9701///
9702/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9703///
9704///     _MM_MANT_NORM_1_2     // interval [1, 2)
9705///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9706///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9707///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9708///
9709/// The sign is determined by sc which can take the following values:
9710///
9711///     _MM_MANT_SIGN_src     // sign = sign(src)
9712///     _MM_MANT_SIGN_zero    // sign = 0
9713///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9714///
9715/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
9716#[inline]
9717#[target_feature(enable = "avx512fp16")]
9718#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9719#[rustc_legacy_const_generics(2, 3)]
9720#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9721pub fn _mm512_maskz_getmant_ph<
9722    const NORM: _MM_MANTISSA_NORM_ENUM,
9723    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9724>(
9725    k: __mmask32,
9726    a: __m512h,
9727) -> __m512h {
9728    static_assert_uimm_bits!(NORM, 4);
9729    static_assert_uimm_bits!(SIGN, 2);
9730    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
9731}
9732
9733/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9734/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9735/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9736/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9737///
9738/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9739///
9740///     _MM_MANT_NORM_1_2     // interval [1, 2)
9741///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9742///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9743///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9744///
9745/// The sign is determined by sc which can take the following values:
9746///
9747///     _MM_MANT_SIGN_src     // sign = sign(src)
9748///     _MM_MANT_SIGN_zero    // sign = 0
9749///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9750///
9751/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9752///
9753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
9754#[inline]
9755#[target_feature(enable = "avx512fp16")]
9756#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9757#[rustc_legacy_const_generics(1, 2, 3)]
9758#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9759pub fn _mm512_getmant_round_ph<
9760    const NORM: _MM_MANTISSA_NORM_ENUM,
9761    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9762    const SAE: i32,
9763>(
9764    a: __m512h,
9765) -> __m512h {
9766    static_assert_uimm_bits!(NORM, 4);
9767    static_assert_uimm_bits!(SIGN, 2);
9768    static_assert_sae!(SAE);
9769    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9770}
9771
9772/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9773/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9774/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9775/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9776/// in the sae parameter
9777///
9778/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9779///
9780///     _MM_MANT_NORM_1_2     // interval [1, 2)
9781///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9782///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9783///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9784///
9785/// The sign is determined by sc which can take the following values:
9786///
9787///     _MM_MANT_SIGN_src     // sign = sign(src)
9788///     _MM_MANT_SIGN_zero    // sign = 0
9789///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9790///
9791/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9792///
9793/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
9794#[inline]
9795#[target_feature(enable = "avx512fp16")]
9796#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9797#[rustc_legacy_const_generics(3, 4, 5)]
9798#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9799pub fn _mm512_mask_getmant_round_ph<
9800    const NORM: _MM_MANTISSA_NORM_ENUM,
9801    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9802    const SAE: i32,
9803>(
9804    src: __m512h,
9805    k: __mmask32,
9806    a: __m512h,
9807) -> __m512h {
9808    unsafe {
9809        static_assert_uimm_bits!(NORM, 4);
9810        static_assert_uimm_bits!(SIGN, 2);
9811        static_assert_sae!(SAE);
9812        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
9813    }
9814}
9815
9816/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9817/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9818/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9819/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9820/// in the sae parameter
9821///
9822/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9823///
9824///     _MM_MANT_NORM_1_2     // interval [1, 2)
9825///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9826///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9827///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9828///
9829/// The sign is determined by sc which can take the following values:
9830///
9831///     _MM_MANT_SIGN_src     // sign = sign(src)
9832///     _MM_MANT_SIGN_zero    // sign = 0
9833///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9834///
9835/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9836///
9837/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
9838#[inline]
9839#[target_feature(enable = "avx512fp16")]
9840#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9841#[rustc_legacy_const_generics(2, 3, 4)]
9842#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9843pub fn _mm512_maskz_getmant_round_ph<
9844    const NORM: _MM_MANTISSA_NORM_ENUM,
9845    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9846    const SAE: i32,
9847>(
9848    k: __mmask32,
9849    a: __m512h,
9850) -> __m512h {
9851    static_assert_uimm_bits!(NORM, 4);
9852    static_assert_uimm_bits!(SIGN, 2);
9853    static_assert_sae!(SAE);
9854    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
9855}
9856
9857/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9858/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9859/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9860/// on the interval range defined by norm and the sign depends on sign and the source sign.
9861///
9862/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9863///
9864///     _MM_MANT_NORM_1_2     // interval [1, 2)
9865///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9866///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9867///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9868///
9869/// The sign is determined by sc which can take the following values:
9870///
9871///     _MM_MANT_SIGN_src     // sign = sign(src)
9872///     _MM_MANT_SIGN_zero    // sign = 0
9873///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9874///
9875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
9876#[inline]
9877#[target_feature(enable = "avx512fp16")]
9878#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9879#[rustc_legacy_const_generics(2, 3)]
9880#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9881pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9882    a: __m128h,
9883    b: __m128h,
9884) -> __m128h {
9885    static_assert_uimm_bits!(NORM, 4);
9886    static_assert_uimm_bits!(SIGN, 2);
9887    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9888}
9889
9890/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9891/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9892/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9893/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9894/// the source sign.
9895///
9896/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9897///
9898///     _MM_MANT_NORM_1_2     // interval [1, 2)
9899///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9900///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9901///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9902///
9903/// The sign is determined by sc which can take the following values:
9904///
9905///     _MM_MANT_SIGN_src     // sign = sign(src)
9906///     _MM_MANT_SIGN_zero    // sign = 0
9907///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9908///
9909/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9910#[inline]
9911#[target_feature(enable = "avx512fp16")]
9912#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9913#[rustc_legacy_const_generics(4, 5)]
9914#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9915pub fn _mm_mask_getmant_sh<
9916    const NORM: _MM_MANTISSA_NORM_ENUM,
9917    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9918>(
9919    src: __m128h,
9920    k: __mmask8,
9921    a: __m128h,
9922    b: __m128h,
9923) -> __m128h {
9924    static_assert_uimm_bits!(NORM, 4);
9925    static_assert_uimm_bits!(SIGN, 2);
9926    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9927}
9928
9929/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9930/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9931/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9932/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9933/// the source sign.
9934///
9935/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9936///
9937///     _MM_MANT_NORM_1_2     // interval [1, 2)
9938///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9939///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9940///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9941///
9942/// The sign is determined by sc which can take the following values:
9943///
9944///     _MM_MANT_SIGN_src     // sign = sign(src)
9945///     _MM_MANT_SIGN_zero    // sign = 0
9946///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9947///
9948/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9949#[inline]
9950#[target_feature(enable = "avx512fp16")]
9951#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9952#[rustc_legacy_const_generics(3, 4)]
9953#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9954pub fn _mm_maskz_getmant_sh<
9955    const NORM: _MM_MANTISSA_NORM_ENUM,
9956    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9957>(
9958    k: __mmask8,
9959    a: __m128h,
9960    b: __m128h,
9961) -> __m128h {
9962    static_assert_uimm_bits!(NORM, 4);
9963    static_assert_uimm_bits!(SIGN, 2);
9964    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), k, a, b)
9965}
9966
9967/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9968/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9969/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9970/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9971/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9972///
9973/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9974///
9975///     _MM_MANT_NORM_1_2     // interval [1, 2)
9976///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9977///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9978///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9979///
9980/// The sign is determined by sc which can take the following values:
9981///
9982///     _MM_MANT_SIGN_src     // sign = sign(src)
9983///     _MM_MANT_SIGN_zero    // sign = 0
9984///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9985///
9986/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9987///
9988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9989#[inline]
9990#[target_feature(enable = "avx512fp16")]
9991#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9992#[rustc_legacy_const_generics(2, 3, 4)]
9993#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
9994pub fn _mm_getmant_round_sh<
9995    const NORM: _MM_MANTISSA_NORM_ENUM,
9996    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9997    const SAE: i32,
9998>(
9999    a: __m128h,
10000    b: __m128h,
10001) -> __m128h {
10002    static_assert_uimm_bits!(NORM, 4);
10003    static_assert_uimm_bits!(SIGN, 2);
10004    static_assert_sae!(SAE);
10005    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10006}
10007
/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
/// the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5, 6)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_getmant_round_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        static_assert_sae!(SAE);
        // The instruction's imm8 operand packs the sign control into bits 3:2
        // and the normalization interval into bits 1:0.
        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
    }
}
10052
/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
/// the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4, 5)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_getmant_round_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
10094
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: delegate to the merge-masked variant with all 8 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
}
10116
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // src and k are forwarded to the underlying compiler intrinsic, which
        // performs the merge-masking.
        vrndscaleph_128(a, IMM8, src, k)
    }
}
10141
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
}
10164
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: delegate to the merge-masked variant with all 16 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
}
10186
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m256h,
) -> __m256h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // src and k are forwarded to the underlying compiler intrinsic, which
        // performs the merge-masking.
        vrndscaleph_256(a, IMM8, src, k)
    }
}
10215
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
}
10238
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: delegate to the merge-masked variant with all 32 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
}
10260
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the SAE-taking variant with _MM_FROUND_CUR_DIRECTION,
    // i.e. exceptions are not suppressed.
    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
}
10287
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
}
10310
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Unmasked form: delegate to the merge-masked variant with all 32 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
10334
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        // src and k are forwarded to the underlying compiler intrinsic, which
        // performs the merge-masking.
        vrndscaleph_512(a, IMM8, src, k, SAE)
    }
}
10365
/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
}
10392
/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
/// from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // The mask is all-ones, so the zero src vector is only a placeholder and
    // is never selected.
    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10415
/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the SAE-taking variant with _MM_FROUND_CUR_DIRECTION,
    // i.e. exceptions are not suppressed.
    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10443
/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
}
10466
/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
/// from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // The mask is all-ones, so the zero src vector is only a placeholder and
    // is never selected.
    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10492
/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        // src and k are forwarded to the underlying compiler intrinsic, which
        // performs the merge-masking of the lower element.
        vrndscalesh(a, b, src, k, IMM8, SAE)
    }
}
10525
/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
10555
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
    // Unmasked form: delegate to the merge-masked variant with all 8 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
}
10567
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // src and k are forwarded to the underlying compiler intrinsic, which
    // performs the merge-masking.
    unsafe { vscalefph_128(a, b, src, k) }
}
10579
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
}
10591
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
    // Unmasked form: delegate to the merge-masked variant with all 16 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
}
10603
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // src and k are forwarded to the underlying compiler intrinsic, which
    // performs the merge-masking.
    unsafe { vscalefph_256(a, b, src, k) }
}
10615
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
}
10627
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
    // Unmasked form: delegate to the merge-masked variant with all 32 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
}
10639
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Delegate to the rounding variant with _MM_FROUND_CUR_DIRECTION,
    // i.e. rounding follows MXCSR.RC and exceptions are not suppressed.
    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10651
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Zero-masking is expressed as merge-masking with an all-zero src vector.
    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
}
10663
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Unmasked form: delegate to the merge-masked variant with all 32 lanes
    // enabled, so src is never selected and may be left undefined.
    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
}
10685
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // The underlying intrinsic takes (a, b, src, mask, rounding) — note
        // that `src` follows the two operands, unlike the public signature.
        vscalefph_512(a, b, src, k, ROUNDING)
    }
}
10714
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}
10740
/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
    // Only mask bit 0 is meaningful for a scalar op; with it set, the
    // all-zeros `src` is never observed.
    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10753
/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10766
/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
10779
/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask bit 0 set: the all-zeros `src` is never observed.
    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10802
/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // The underlying intrinsic takes (a, b, src, mask, rounding).
        vscalefsh(a, b, src, k, ROUNDING)
    }
}
10832
/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
10859
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
    // IMM8 must fit in 8 bits; checked at compile time.
    static_assert_uimm_bits!(IMM8, 8);
    // All 8 mask bits set: the undefined `src` is never observed.
    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
}
10881
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // The underlying intrinsic takes (a, imm8, src, mask).
        vreduceph_128(a, IMM8, src, k)
    }
}
10906
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
}
10929
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // All 16 mask bits set: the undefined `src` is never observed.
    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
}
10951
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // The underlying intrinsic takes (a, imm8, src, mask).
        vreduceph_256(a, IMM8, src, k)
    }
}
10976
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
}
10999
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // All 32 mask bits set: the undefined `src` is never observed.
    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
}
11021
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the SAE-capable variant without exception suppression.
    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
}
11044
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
}
11067
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
    // Both immediates are validated at compile time.
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // All 32 mask bits set: the undefined `src` is never observed.
    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
11092
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
/// from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        // The underlying intrinsic takes (a, imm8, src, mask, sae).
        vreduceph_512(a, IMM8, src, k, SAE)
    }
}
11124
/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
}
11153
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
/// upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Only mask bit 0 is meaningful for a scalar op; with it set, the
    // all-zeros `src` is never observed.
    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
11176
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
/// a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_reduce_sh<const IMM8: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the SAE-capable variant without exception suppression.
    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
11205
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking is merge-masking with an all-zeros `src`.
    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
}
11229
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Both immediates are validated at compile time.
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Mask bit 0 set: the all-zeros `src` is never observed.
    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
11255
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        // The underlying intrinsic takes (a, b, src, mask, imm8, sae) —
        // here the immediates are passed as runtime args to the LLVM binding.
        vreducesh(a, b, src, k, IMM8, SAE)
    }
}
11289
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking is implemented as merge-masking with an all-zero `src`, so a
    // cleared mask bit 0 yields 0.0 in the low lane.
    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
11320
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_reduce_add_ph(a: __m128h) -> f16 {
    unsafe {
        // Pairwise tree reduction: fold the upper 4 lanes onto the lower 4...
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_add_ph(a, b);
        // ...then fold lanes 2..3 onto lanes 0..1...
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_add_ph(a, b);
        // ...and finish with a scalar add of the two remaining partial sums.
        simd_extract!(a, 0, f16) + simd_extract!(a, 1, f16)
    }
}
11338
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
    unsafe {
        // Split the 256-bit vector into its low (p) and high (q) 128-bit halves,
        // add them lane-wise, and reuse the 128-bit reduction for the rest.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_add_ph(_mm_add_ph(p, q))
    }
}
11354
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
    unsafe {
        // Split the 512-bit vector into its low (p) and high (q) 256-bit halves,
        // add them lane-wise, and reuse the 256-bit reduction for the rest.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
    }
}
11376
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
    unsafe {
        // Pairwise tree reduction (same shape as _mm_reduce_add_ph, with `*`):
        // fold upper 4 lanes onto lower 4, then lanes 2..3 onto 0..1, then
        // multiply the two surviving partial products as scalars.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_mul_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_mul_ph(a, b);
        simd_extract!(a, 0, f16) * simd_extract!(a, 1, f16)
    }
}
11394
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
    unsafe {
        // Split into 128-bit halves, multiply lane-wise, then reduce the half-width result.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
    }
}
11410
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
    unsafe {
        // Split into 256-bit halves, multiply lane-wise, then reduce the half-width result.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
    }
}
11432
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
    unsafe {
        // Pairwise tree reduction: fold upper 4 lanes onto lower 4, then lanes
        // 2..3 onto 0..1, then lane 1 onto lane 0.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_min_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_min_ph(a, b);
        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
        // The final step uses the scalar min intrinsic (vminsh) rather than a
        // scalar comparison, keeping the instruction's min semantics throughout.
        simd_extract!(_mm_min_sh(a, b), 0)
    }
}
11450
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
    unsafe {
        // Split into 128-bit halves, take the lane-wise minimum, then reduce that.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_min_ph(_mm_min_ph(p, q))
    }
}
11465
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
    unsafe {
        // Split into 256-bit halves, take the lane-wise minimum, then reduce that.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
    }
}
11486
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
    unsafe {
        // Pairwise tree reduction, mirroring _mm_reduce_min_ph with `max`.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_max_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_max_ph(a, b);
        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
        // Final step via the scalar max intrinsic (vmaxsh) on lanes 0 and 1.
        simd_extract!(_mm_max_sh(a, b), 0)
    }
}
11504
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
    unsafe {
        // Split into 128-bit halves, take the lane-wise maximum, then reduce that.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_max_ph(_mm_max_ph(p, q))
    }
}
11519
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
    unsafe {
        // Split into 256-bit halves, take the lane-wise maximum, then reduce that.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
    }
}
11540
// Emits a `vfpclassph` classification over a full vector register via inline asm,
// returning the resulting mask-register value. Two arms:
//   ($mask_type, $reg, $a)        — unmasked: classify all lanes.
//   ($mask_type, $mask, $reg, $a) — zero-masked: `{k}` is written under `$mask`.
// `IMM8` is captured from the calling function's const generic as an asm `const`
// operand. `options(pure, nomem, nostack)` marks the asm as a side-effect-free
// pure function of its inputs, letting the compiler CSE/eliminate duplicate calls.
macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr) => {{
        let dst: $mask_type;
        asm!(
            "vfpclassph {k}, {src}, {imm8}",
            k = lateout(kreg) dst,
            src = in($reg) $a,
            imm8 = const IMM8,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
        let dst: $mask_type;
        asm!(
            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            src = in($reg) $a,
            imm8 = const IMM8,
            options(pure, nomem, nostack)
        );
        dst
    }};
}
11566
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Unmasked asm arm over an xmm register (8 f16 lanes -> 8-bit mask).
        fpclass_asm!(__mmask8, xmm_reg, a)
    }
}
11592
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Masked asm arm: result bits are zeroed where `k1` bits are clear.
        fpclass_asm!(__mmask8, k1, xmm_reg, a)
    }
}
11619
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Unmasked asm arm over a ymm register (16 f16 lanes -> 16-bit mask).
        fpclass_asm!(__mmask16, ymm_reg, a)
    }
}
11645
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Masked asm arm: result bits are zeroed where `k1` bits are clear.
        fpclass_asm!(__mmask16, k1, ymm_reg, a)
    }
}
11672
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Unmasked asm arm over a zmm register (32 f16 lanes -> 32-bit mask).
        fpclass_asm!(__mmask32, zmm_reg, a)
    }
}
11698
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Masked asm arm: result bits are zeroed where `k1` bits are clear.
        fpclass_asm!(__mmask32, k1, zmm_reg, a)
    }
}
11725
/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
/// by imm8, and store the result in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
    // Delegate to the masked form with an all-ones mask so bit 0 is always produced.
    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
}
11748
/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Scalar form goes through the LLVM intrinsic (unlike the packed `ph`
        // variants, which use the fpclass_asm! workaround).
        vfpclasssh(a, IMM8, k1)
    }
}
11775
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Lane i comes from `b` when mask bit i is set, otherwise from `a`.
    unsafe { simd_select_bitmask(k, b, a) }
}
11787
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Lane i comes from `b` when mask bit i is set, otherwise from `a`.
    unsafe { simd_select_bitmask(k, b, a) }
}
11799
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Lane i comes from `b` when mask bit i is set, otherwise from `a`.
    unsafe { simd_select_bitmask(k, b, a) }
}
11811
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
    // f16 lanes have the same width as i16 lanes, so reuse the epi16 permute
    // by bit-casting through the integer vector type.
    _mm_castsi128_ph(_mm_permutex2var_epi16(
        _mm_castph_si128(a),
        idx,
        _mm_castph_si128(b),
    ))
}
11826
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
    // Bit-cast to the integer vector type and reuse the epi16 permute;
    // the casts are lane-width-preserving reinterpretations.
    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
        _mm256_castph_si256(a),
        idx,
        _mm256_castph_si256(b),
    ))
}
11841
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
    // Bit-cast to the integer vector type and reuse the epi16 permute;
    // the casts are lane-width-preserving reinterpretations.
    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
        _mm512_castph_si512(a),
        idx,
        _mm512_castph_si512(b),
    ))
}
11856
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
    // Reuse the epi16 variable permute via lane-preserving bit-casts.
    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
}
11867
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
    // Reuse the epi16 variable permute via lane-preserving bit-casts.
    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
}
11878
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
    // Reuse the epi16 variable permute via lane-preserving bit-casts.
    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
}
11889
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
    // Non-`_round` variant: rounding follows the current MXCSR direction.
    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
}
11901
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Convert unconditionally, then merge with `src` per mask bit.
    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
}
11914
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking = merge-masking against an all-zero source vector.
    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
}
11926
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
    // Non-`_round` variant: rounding follows the current MXCSR direction.
    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
}
11938
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
    // Convert unconditionally, then merge with `src` per mask bit.
    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
}
11951
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
    // Zero-masking = merge-masking against an all-zero source vector.
    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
}
11963
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
    // Rounding follows the current MXCSR direction (`_MM_FROUND_CUR_DIRECTION`).
    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
}
11975
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
    // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
}
11988
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
}
12000
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
    unsafe {
        // Compile-time validation of the rounding-control value.
        static_assert_rounding!(ROUNDING);
        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
    }
}
12024
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    unsafe {
        // Compile-time validation of the rounding-control value.
        static_assert_rounding!(ROUNDING);
        // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
    }
}
12053
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
    // Compile-time validation of the rounding-control value.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
}
12075
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
    // Rounding follows the current MXCSR direction (`_MM_FROUND_CUR_DIRECTION`).
    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
}
12087
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
}
12100
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
}
12112
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
    // Rounding follows the current MXCSR direction (`_MM_FROUND_CUR_DIRECTION`).
    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
}
12124
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
    // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
}
12137
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
}
12149
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
    // Rounding follows the current MXCSR direction (`_MM_FROUND_CUR_DIRECTION`).
    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
}
12161
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
    // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
}
12174
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
}
12186
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
    unsafe {
        // Compile-time validation of the rounding-control value.
        static_assert_rounding!(ROUNDING);
        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
    }
}
12210
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    unsafe {
        // Compile-time validation of the rounding-control value.
        static_assert_rounding!(ROUNDING);
        // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
    }
}
12239
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
    // Compile-time validation of the rounding-control value.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
}
12261
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
    // Delegates to the write-masked form with an all-ones mask (0xff covers
    // all eight f16 output lanes) and a zeroed source vector.
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
}
12273
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Unlike the 16-bit-source variants, this intrinsic takes `src` and the
    // mask directly and performs the write-masked blend itself; only four
    // converted lanes exist, and the upper 64 bits of the result are zeroed.
    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
}
12286
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}
12299
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
    // Narrowing convert: eight i32 inputs produce eight f16 lanes (a __m128h).
    // Rounding follows the current MXCSR direction.
    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
}
12311
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
}
12324
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
    // Zero-masking is the write-masking form with an all-zeros source; note
    // the 128-bit zero vector, since this narrowing convert yields a __m128h.
    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}
12336
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
    // Narrowing convert: sixteen i32 inputs produce sixteen f16 lanes (a __m256h).
    // Rounding follows the current MXCSR direction.
    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
}
12348
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
    // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
}
12361
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
    // NOTE(review): builds the zero vector as `f16x16::ZERO.as_m256h()` rather
    // than `_mm256_setzero_ph()` — presumably because the latter requires
    // `avx512vl`, which this `avx512fp16`-only function does not enable; confirm.
    _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a)
}
12373
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
    unsafe {
        // Compile-time validation of the rounding-control value.
        static_assert_rounding!(ROUNDING);
        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
    }
}
12397
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    unsafe {
        // Compile-time validation of the rounding-control value.
        static_assert_rounding!(ROUNDING);
        // Per-lane blend: converted element where the mask bit is set, `src` otherwise.
        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
    }
}
12426
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
    // Compile-time validation of the rounding-control value.
    static_assert_rounding!(ROUNDING);
    // NOTE(review): builds the zero vector as `f16x16::ZERO.as_m256h()` rather
    // than `_mm256_setzero_ph()` — presumably because the latter requires
    // `avx512vl`, which this `avx512fp16`-only function does not enable; confirm.
    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
}
12448
/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
    // Scalar convert into lane 0; rounding follows the current MXCSR direction.
    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
}
12461
/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
    unsafe {
        // Compile-time validation of the rounding-control value.
        static_assert_rounding!(ROUNDING);
        vcvtsi2sh(a, b, ROUNDING)
    }
}
12486
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
    // Delegates to the write-masked form with an all-ones mask (0xff covers
    // all eight f16 output lanes) and a zeroed source vector.
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
}
12498
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Unlike the 16-bit-source variants, this intrinsic takes `src` and the
    // mask directly and performs the write-masked blend itself; only four
    // converted lanes exist, and the upper 64 bits of the result are zeroed.
    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
}
12511
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is the write-masking form with an all-zeros source vector.
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
}
12524
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
    // Narrowing convert: eight u32 inputs produce eight f16 lanes (a __m128h).
    // Rounding follows the current MXCSR direction.
    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
}
12536
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // SAFETY: `simd_select_bitmask` merges the converted lanes with `src`
    // per mask bit; both operands are 8-lane f16 vectors so lane counts
    // match the 8-bit mask.
    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
}
12549
12550/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12551/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12552///
12553/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
12554#[inline]
12555#[target_feature(enable = "avx512fp16,avx512vl")]
12556#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12557#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12558pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
12559    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12560}
12561
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
    // SAFETY: the `avx512fp16` feature required by `vcvtudq2ph_512` is
    // guaranteed by the enclosing `#[target_feature]` attribute;
    // `_MM_FROUND_CUR_DIRECTION` uses the current MXCSR rounding mode.
    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
}
12573
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
    // SAFETY: lane-wise select between the converted result and `src`;
    // the 16-bit mask matches the 16 f16 lanes of a `__m256h`.
    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
}
12586
12587/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12588/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12589///
12590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
12591#[inline]
12592#[target_feature(enable = "avx512fp16")]
12593#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12594#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12595pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
12596    _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a)
12597}
12598
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: `avx512fp16` is guaranteed by the enclosing
        // `#[target_feature]`, and ROUNDING was validated above.
        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
    }
}
12622
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: lane-wise merge of converted lanes with `src` per mask bit.
        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
    }
}
12651
12652/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12653/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12654///
12655/// Rounding is done according to the rounding parameter, which can be one of:
12656///
12657/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12658/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12659/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12660/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12661/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12662///
12663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
12664#[inline]
12665#[target_feature(enable = "avx512fp16")]
12666#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12667#[rustc_legacy_const_generics(2)]
12668#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12669pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12670    static_assert_rounding!(ROUNDING);
12671    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12672}
12673
/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtusi2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
    // SAFETY: `vcvtusi2sh` requires `avx512fp16`, guaranteed by the enclosing
    // `#[target_feature]`; `_MM_FROUND_CUR_DIRECTION` uses the MXCSR rounding
    // mode for the scalar conversion.
    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
}
12686
/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: `avx512fp16` is guaranteed by the enclosing
        // `#[target_feature]`, and ROUNDING was validated above.
        vcvtusi2sh(a, b, ROUNDING)
    }
}
12711
12712/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12713/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12714///
12715/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
12716#[inline]
12717#[target_feature(enable = "avx512fp16,avx512vl")]
12718#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12719#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12720pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
12721    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12722}
12723
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // SAFETY: `vcvtqq2ph_128` requires `avx512fp16`+`avx512vl`, guaranteed by
    // the enclosing `#[target_feature]`. The masked LLVM intrinsic takes the
    // pass-through vector and mask directly.
    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
}
12736
12737/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12738/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12739/// The upper 96 bits of dst are zeroed out.
12740///
12741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
12742#[inline]
12743#[target_feature(enable = "avx512fp16,avx512vl")]
12744#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12745#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12746pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
12747    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12748}
12749
12750/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12751/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12752///
12753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
12754#[inline]
12755#[target_feature(enable = "avx512fp16,avx512vl")]
12756#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12757#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12758pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
12759    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12760}
12761
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // SAFETY: `vcvtqq2ph_256` requires `avx512fp16`+`avx512vl`, guaranteed by
    // the enclosing `#[target_feature]`.
    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
}
12774
12775/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12776/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12777/// The upper 64 bits of dst are zeroed out.
12778///
12779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
12780#[inline]
12781#[target_feature(enable = "avx512fp16,avx512vl")]
12782#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12783#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12784pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
12785    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12786}
12787
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
    // SAFETY: `avx512fp16` is guaranteed by the enclosing `#[target_feature]`;
    // `_MM_FROUND_CUR_DIRECTION` uses the current MXCSR rounding mode.
    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
}
12799
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
    // SAFETY: lane-wise merge of converted lanes with `src` per mask bit;
    // 8 i64 inputs map to the 8 f16 lanes of a `__m128h`.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
}
12812
12813/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12814/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12815///
12816/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
12817#[inline]
12818#[target_feature(enable = "avx512fp16")]
12819#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12820#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12821pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
12822    _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a)
12823}
12824
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: `avx512fp16` is guaranteed by the enclosing
        // `#[target_feature]`, and ROUNDING was validated above.
        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
    }
}
12848
/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: lane-wise merge of converted lanes with `src` per mask bit.
        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
    }
}
12877
12878/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12879/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12880///
12881/// Rounding is done according to the rounding parameter, which can be one of:
12882///
12883/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12884/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12885/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12886/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12887/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12888///
12889/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
12890#[inline]
12891#[target_feature(enable = "avx512fp16")]
12892#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12893#[rustc_legacy_const_generics(2)]
12894#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12895pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12896    static_assert_rounding!(ROUNDING);
12897    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12898}
12899
12900/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12901/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12902///
12903/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
12904#[inline]
12905#[target_feature(enable = "avx512fp16,avx512vl")]
12906#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12907#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12908pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
12909    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12910}
12911
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // SAFETY: `vcvtuqq2ph_128` requires `avx512fp16`+`avx512vl`, guaranteed
    // by the enclosing `#[target_feature]`. The masked LLVM intrinsic takes
    // the pass-through vector and mask directly.
    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
}
12924
12925/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12926/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12927/// The upper 96 bits of dst are zeroed out.
12928///
12929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
12930#[inline]
12931#[target_feature(enable = "avx512fp16,avx512vl")]
12932#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12933#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12934pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
12935    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12936}
12937
12938/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12939/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12940///
12941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
12942#[inline]
12943#[target_feature(enable = "avx512fp16,avx512vl")]
12944#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12945#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12946pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
12947    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12948}
12949
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // SAFETY: `vcvtuqq2ph_256` requires `avx512fp16`+`avx512vl`, guaranteed
    // by the enclosing `#[target_feature]`.
    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
}
12962
12963/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12964/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12965/// The upper 64 bits of dst are zeroed out.
12966///
12967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
12968#[inline]
12969#[target_feature(enable = "avx512fp16,avx512vl")]
12970#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12971#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
12972pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
12973    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12974}
12975
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
    // SAFETY: `avx512fp16` is guaranteed by the enclosing `#[target_feature]`;
    // `_MM_FROUND_CUR_DIRECTION` uses the current MXCSR rounding mode.
    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
}
12987
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
    // SAFETY: lane-wise merge of converted lanes with `src` per mask bit;
    // 8 u64 inputs map to the 8 f16 lanes of a `__m128h`.
    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
}
13000
13001/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
13002/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13003///
13004/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
13005#[inline]
13006#[target_feature(enable = "avx512fp16")]
13007#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
13008#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13009pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
13010    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
13011}
13012
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: `avx512fp16` is guaranteed by the enclosing
        // `#[target_feature]`, and ROUNDING was validated above.
        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
    }
}
13036
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: lane-wise merge of converted lanes with `src` per mask bit.
        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
    }
}
13065
13066/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
13067/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13068///
13069/// Rounding is done according to the rounding parameter, which can be one of:
13070///
13071/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13072/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13073/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13074/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13075/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13076///
13077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
13078#[inline]
13079#[target_feature(enable = "avx512fp16")]
13080#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
13081#[rustc_legacy_const_generics(2)]
13082#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13083pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
13084    static_assert_rounding!(ROUNDING);
13085    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
13086}
13087
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
    // All mask bits set: every lane is converted, so the all-zero `src` is never copied.
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}
13099
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
    // The underlying intrinsic performs both the conversion and the merge with `src` under `k`.
    unsafe { vcvtps2phx_128(a, src, k) }
}
13112
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}
13125
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
    // All mask bits set: every one of the 8 lanes is converted, so the zero `src` is never copied.
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}
13137
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
    // The underlying intrinsic performs both the conversion and the merge with `src` under `k`.
    unsafe { vcvtps2phx_256(a, src, k) }
}
13150
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}
13163
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
    // All 16 mask bits set: every lane is converted, so the zero `src` is never copied.
    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a)
}
13175
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
    // Non-`round` variant: rounding follows the current MXCSR rounding mode.
    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13188
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
}
13201
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
    static_assert_rounding!(ROUNDING);
    // All 16 mask bits set: every lane is converted, so the zero `src` is never copied.
    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
}
13223
13224/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
13225/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13226/// when the corresponding mask bit is not set).
13227///
13228/// Rounding is done according to the rounding parameter, which can be one of:
13229///
13230/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13231/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13232/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13233/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13234/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13235///
13236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
13237#[inline]
13238#[target_feature(enable = "avx512fp16")]
13239#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
13240#[rustc_legacy_const_generics(3)]
13241#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13242pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
13243    src: __m256h,
13244    k: __mmask16,
13245    a: __m512,
13246) -> __m256h {
13247    unsafe {
13248        static_assert_rounding!(ROUNDING);
13249        vcvtps2phx_512(a, src, k, ROUNDING)
13250    }
13251}
13252
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
}
13275
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
    // Mask bit 0 set: the converted element is always written, so the zero `src` is never copied.
    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13288
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
    // Non-`round` variant: rounding follows the current MXCSR rounding mode.
    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13302
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
13316
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask bit 0 set: the converted element is always written, so the zero `src` is never copied.
    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13339
13340/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13341/// floating-point elements, store the result in the lower element of dst using writemask k (the element
13342/// if copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13343/// upper elements of dst.
13344///
13345/// Rounding is done according to the rounding parameter, which can be one of:
13346///
13347/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13348/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13349/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13350/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13351/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13352///
13353/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
13354#[inline]
13355#[target_feature(enable = "avx512fp16")]
13356#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13357#[rustc_legacy_const_generics(4)]
13358#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13359pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
13360    src: __m128h,
13361    k: __mmask8,
13362    a: __m128h,
13363    b: __m128,
13364) -> __m128h {
13365    unsafe {
13366        static_assert_rounding!(ROUNDING);
13367        vcvtss2sh(a, b, src, k, ROUNDING)
13368    }
13369}
13370
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
13398
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
    // All mask bits set: every lane is converted, so the zero `src` is never copied.
    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
}
13410
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
    // The underlying intrinsic performs both the conversion and the merge with `src` under `k`.
    unsafe { vcvtpd2ph_128(a, src, k) }
}
13423
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
}
13436
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
    // All mask bits set: every lane is converted, so the zero `src` is never copied.
    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
}
13448
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
    // The underlying intrinsic performs both the conversion and the merge with `src` under `k`.
    unsafe { vcvtpd2ph_256(a, src, k) }
}
13461
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
}
13474
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
    // All 8 mask bits set: every lane is converted, so the zero `src` is never copied.
    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a)
}
13486
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
    // Non-`round` variant: rounding follows the current MXCSR rounding mode.
    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13499
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
}
13512
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // All 8 mask bits set: every lane is converted, so the zero `src` is never copied.
    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
}
13534
13535/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13536/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13537/// when the corresponding mask bit is not set).
13538///
13539/// Rounding is done according to the rounding parameter, which can be one of:
13540///
13541/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13542/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13543/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13544/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13545/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13546///
13547/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
13548#[inline]
13549#[target_feature(enable = "avx512fp16")]
13550#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13551#[rustc_legacy_const_generics(3)]
13552#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13553pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
13554    src: __m128h,
13555    k: __mmask8,
13556    a: __m512d,
13557) -> __m128h {
13558    unsafe {
13559        static_assert_rounding!(ROUNDING);
13560        vcvtpd2ph_512(a, src, k, ROUNDING)
13561    }
13562}
13563
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
}
13586
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
    // Mask bit 0 set: the converted element is always written, so the zero `src` is never copied.
    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13599
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
    // Non-`round` variant: rounding follows the current MXCSR rounding mode.
    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13613
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
13627
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask bit 0 set: the converted element is always written, so the zero `src` is never copied.
    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13650
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128d,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Delegates directly to the `vcvtsd2sh` binding; the required CPU
        // feature is enabled by the `#[target_feature]` attribute above.
        vcvtsd2sh(a, b, src, k, ROUNDING)
    }
}
13681
13682/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13683/// floating-point elements, store the result in the lower element of dst using zeromask k (the element
13684/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13685/// elements of dst.
13686///
13687/// Rounding is done according to the rounding parameter, which can be one of:
13688///
13689/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13690/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13691/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13692/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13693/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13694///
13695/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
13696#[inline]
13697#[target_feature(enable = "avx512fp16")]
13698#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13699#[rustc_legacy_const_generics(3)]
13700#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13701pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
13702    k: __mmask8,
13703    a: __m128h,
13704    b: __m128d,
13705) -> __m128h {
13706    static_assert_rounding!(ROUNDING);
13707    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13708}
13709
13710/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13711/// store the results in dst.
13712///
13713/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
13714#[inline]
13715#[target_feature(enable = "avx512fp16,avx512vl")]
13716#[cfg_attr(test, assert_instr(vcvtph2w))]
13717#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13718pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
13719    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
13720}
13721
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // `transmute` only reinterprets between 128-bit vector types of equal size.
    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
}
13734
13735/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13736/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13737///
13738/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
13739#[inline]
13740#[target_feature(enable = "avx512fp16,avx512vl")]
13741#[cfg_attr(test, assert_instr(vcvtph2w))]
13742#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13743pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13744    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
13745}
13746
13747/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13748/// store the results in dst.
13749///
13750/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
13751#[inline]
13752#[target_feature(enable = "avx512fp16,avx512vl")]
13753#[cfg_attr(test, assert_instr(vcvtph2w))]
13754#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13755pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
13756    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
13757}
13758
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // `transmute` only reinterprets between 256-bit vector types of equal size.
    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
}
13771
13772/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13773/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13774///
13775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
13776#[inline]
13777#[target_feature(enable = "avx512fp16,avx512vl")]
13778#[cfg_attr(test, assert_instr(vcvtph2w))]
13779#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13780pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13781    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
13782}
13783
13784/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13785/// store the results in dst.
13786///
13787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
13788#[inline]
13789#[target_feature(enable = "avx512fp16")]
13790#[cfg_attr(test, assert_instr(vcvtph2w))]
13791#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13792pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
13793    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13794}
13795
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above.
    // `_MM_FROUND_CUR_DIRECTION` selects rounding per the current MXCSR.RC,
    // matching the non-`_round` form of this intrinsic.
    unsafe {
        transmute(vcvtph2w_512(
            a,
            src.as_i16x32(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
13815
13816/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13817/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13818///
13819/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
13820#[inline]
13821#[target_feature(enable = "avx512fp16")]
13822#[cfg_attr(test, assert_instr(vcvtph2w))]
13823#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13824pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13825    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
13826}
13827
13828/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13829/// store the results in dst.
13830///
13831/// Rounding is done according to the rounding parameter, which can be one of:
13832///
13833/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13834/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13835/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13836/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13837/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13838///
13839/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
13840#[inline]
13841#[target_feature(enable = "avx512fp16")]
13842#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13843#[rustc_legacy_const_generics(1)]
13844#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13845pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13846    static_assert_rounding!(ROUNDING);
13847    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13848}
13849
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // the rounding mode is validated at compile time by the assertion below.
    unsafe {
        static_assert_rounding!(ROUNDING);
        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
    }
}
13878
13879/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13880/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13881///
13882/// Rounding is done according to the rounding parameter, which can be one of:
13883///
13884/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13885/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13886/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13887/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13888/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13889///
13890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
13891#[inline]
13892#[target_feature(enable = "avx512fp16")]
13893#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13894#[rustc_legacy_const_generics(2)]
13895#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13896pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13897    static_assert_rounding!(ROUNDING);
13898    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13899}
13900
13901/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13902/// and store the results in dst.
13903///
13904/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
13905#[inline]
13906#[target_feature(enable = "avx512fp16,avx512vl")]
13907#[cfg_attr(test, assert_instr(vcvtph2uw))]
13908#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13909pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
13910    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
13911}
13912
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // `transmute` only reinterprets between 128-bit vector types of equal size.
    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
}
13925
13926/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13927/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13928///
13929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
13930#[inline]
13931#[target_feature(enable = "avx512fp16,avx512vl")]
13932#[cfg_attr(test, assert_instr(vcvtph2uw))]
13933#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13934pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13935    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
13936}
13937
13938/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13939/// and store the results in dst.
13940///
13941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
13942#[inline]
13943#[target_feature(enable = "avx512fp16,avx512vl")]
13944#[cfg_attr(test, assert_instr(vcvtph2uw))]
13945#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13946pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
13947    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
13948}
13949
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // `transmute` only reinterprets between 256-bit vector types of equal size.
    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
}
13962
13963/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13964/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13965///
13966/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
13967#[inline]
13968#[target_feature(enable = "avx512fp16,avx512vl")]
13969#[cfg_attr(test, assert_instr(vcvtph2uw))]
13970#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13971pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
13972    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
13973}
13974
13975/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13976/// and store the results in dst.
13977///
13978/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
13979#[inline]
13980#[target_feature(enable = "avx512fp16")]
13981#[cfg_attr(test, assert_instr(vcvtph2uw))]
13982#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
13983pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
13984    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
13985}
13986
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above.
    // `_MM_FROUND_CUR_DIRECTION` selects rounding per the current MXCSR.RC,
    // matching the non-`_round` form of this intrinsic.
    unsafe {
        transmute(vcvtph2uw_512(
            a,
            src.as_u16x32(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14006
14007/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
14008/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14009///
14010/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
14011#[inline]
14012#[target_feature(enable = "avx512fp16")]
14013#[cfg_attr(test, assert_instr(vcvtph2uw))]
14014#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14015pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14016    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
14017}
14018
14019/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
14020/// and store the results in dst.
14021///
14022/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14023///
14024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
14025#[inline]
14026#[target_feature(enable = "avx512fp16")]
14027#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
14028#[rustc_legacy_const_generics(1)]
14029#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14030pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14031    static_assert_sae!(SAE);
14032    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14033}
14034
14035/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
14036/// and store the results in dst using writemask k (elements are copied from src when the corresponding
14037/// mask bit is not set).
14038///
14039/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14040///
14041/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
14042#[inline]
14043#[target_feature(enable = "avx512fp16")]
14044#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
14045#[rustc_legacy_const_generics(3)]
14046#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14047pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
14048    src: __m512i,
14049    k: __mmask32,
14050    a: __m512h,
14051) -> __m512i {
14052    unsafe {
14053        static_assert_sae!(SAE);
14054        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
14055    }
14056}
14057
14058/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
14059/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14060///
14061/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14062///
14063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
14064#[inline]
14065#[target_feature(enable = "avx512fp16")]
14066#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
14067#[rustc_legacy_const_generics(2)]
14068#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14069pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14070    static_assert_sae!(SAE);
14071    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14072}
14073
14074/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14075/// truncation, and store the results in dst.
14076///
14077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
14078#[inline]
14079#[target_feature(enable = "avx512fp16,avx512vl")]
14080#[cfg_attr(test, assert_instr(vcvttph2w))]
14081#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14082pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
14083    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
14084}
14085
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // `transmute` only reinterprets between 128-bit vector types of equal size.
    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
}
14098
14099/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14100/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14101/// mask bit is not set).
14102///
14103/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
14104#[inline]
14105#[target_feature(enable = "avx512fp16,avx512vl")]
14106#[cfg_attr(test, assert_instr(vcvttph2w))]
14107#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14108pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
14109    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
14110}
14111
14112/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14113/// truncation, and store the results in dst.
14114///
14115/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
14116#[inline]
14117#[target_feature(enable = "avx512fp16,avx512vl")]
14118#[cfg_attr(test, assert_instr(vcvttph2w))]
14119#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14120pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
14121    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
14122}
14123
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // `transmute` only reinterprets between 256-bit vector types of equal size.
    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
}
14136
14137/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14138/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14139/// mask bit is not set).
14140///
14141/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
14142#[inline]
14143#[target_feature(enable = "avx512fp16,avx512vl")]
14144#[cfg_attr(test, assert_instr(vcvttph2w))]
14145#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14146pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
14147    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
14148}
14149
14150/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14151/// truncation, and store the results in dst.
14152///
14153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
14154#[inline]
14155#[target_feature(enable = "avx512fp16")]
14156#[cfg_attr(test, assert_instr(vcvttph2w))]
14157#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14158pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
14159    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
14160}
14161
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above.
    // Truncating conversion does not use a rounding mode; the last argument
    // only controls exception behavior (here: current MXCSR settings).
    unsafe {
        transmute(vcvttph2w_512(
            a,
            src.as_i16x32(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14181
14182/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14183/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14184/// mask bit is not set).
14185///
14186/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
14187#[inline]
14188#[target_feature(enable = "avx512fp16")]
14189#[cfg_attr(test, assert_instr(vcvttph2w))]
14190#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14191pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
14192    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
14193}
14194
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
    static_assert_sae!(SAE);
    // Full mask: every lane is written, so the undefined src is never read.
    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
}
14210
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    // SAFETY: required CPU features are enabled via `#[target_feature]` above;
    // the SAE argument is validated at compile time by the assertion below.
    unsafe {
        static_assert_sae!(SAE);
        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
    }
}
14233
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
    static_assert_sae!(SAE);
    // Zeroing semantics come from the all-zero source operand.
    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
}
14250
14251/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14252/// truncation, and store the results in dst.
14253///
14254/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
14255#[inline]
14256#[target_feature(enable = "avx512fp16,avx512vl")]
14257#[cfg_attr(test, assert_instr(vcvttph2uw))]
14258#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14259pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
14260    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
14261}
14262
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: `transmute` only reinterprets the returned u16x8 vector as
    // `__m128i`; both are 128 bits wide.
    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
}
14275
14276/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14277/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14278/// mask bit is not set).
14279///
14280/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
14281#[inline]
14282#[target_feature(enable = "avx512fp16,avx512vl")]
14283#[cfg_attr(test, assert_instr(vcvttph2uw))]
14284#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14285pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
14286    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
14287}
14288
14289/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14290/// truncation, and store the results in dst.
14291///
14292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
14293#[inline]
14294#[target_feature(enable = "avx512fp16,avx512vl")]
14295#[cfg_attr(test, assert_instr(vcvttph2uw))]
14296#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14297pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
14298    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
14299}
14300
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    // SAFETY: `transmute` only reinterprets the returned u16x16 vector as
    // `__m256i`; both are 256 bits wide.
    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
}
14313
14314/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14315/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14316/// mask bit is not set).
14317///
14318/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14319#[inline]
14320#[target_feature(enable = "avx512fp16,avx512vl")]
14321#[cfg_attr(test, assert_instr(vcvttph2uw))]
14322#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14323pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14324    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14325}
14326
14327/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14328/// truncation, and store the results in dst.
14329///
14330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14331#[inline]
14332#[target_feature(enable = "avx512fp16")]
14333#[cfg_attr(test, assert_instr(vcvttph2uw))]
14334#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14335pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14336    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14337}
14338
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    unsafe {
        // The 512-bit stub takes an explicit SAE/rounding argument; the
        // non-`round` form passes `_MM_FROUND_CUR_DIRECTION` (no suppression).
        // SAFETY: `transmute` only reinterprets u16x32 as `__m512i` (same width).
        transmute(vcvttph2uw_512(
            a,
            src.as_u16x32(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14358
14359/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14360/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14361/// mask bit is not set).
14362///
14363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14364#[inline]
14365#[target_feature(enable = "avx512fp16")]
14366#[cfg_attr(test, assert_instr(vcvttph2uw))]
14367#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14368pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14369    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14370}
14371
14372/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14373/// truncation, and store the results in dst.
14374///
14375/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14376///
14377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14378#[inline]
14379#[target_feature(enable = "avx512fp16")]
14380#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14381#[rustc_legacy_const_generics(1)]
14382#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14383pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14384    static_assert_sae!(SAE);
14385    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14386}
14387
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    unsafe {
        static_assert_sae!(SAE);
        // SAFETY: `transmute` only reinterprets u16x32 as `__m512i` (same width).
        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
    }
}
14410
14411/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14412/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14413/// mask bit is not set).
14414///
14415/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14416///
14417/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14418#[inline]
14419#[target_feature(enable = "avx512fp16")]
14420#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14421#[rustc_legacy_const_generics(2)]
14422#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14423pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14424    static_assert_sae!(SAE);
14425    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14426}
14427
14428/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14429/// results in dst.
14430///
14431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
14432#[inline]
14433#[target_feature(enable = "avx512fp16,avx512vl")]
14434#[cfg_attr(test, assert_instr(vcvtph2dq))]
14435#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14436pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14437    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14438}
14439
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: `transmute` only reinterprets the returned i32x4 vector as
    // `__m128i`; both are 128 bits wide.
    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
}
14451
14452/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14453/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14454///
14455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14456#[inline]
14457#[target_feature(enable = "avx512fp16,avx512vl")]
14458#[cfg_attr(test, assert_instr(vcvtph2dq))]
14459#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14460pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14461    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14462}
14463
14464/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14465/// results in dst.
14466///
14467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14468#[inline]
14469#[target_feature(enable = "avx512fp16,avx512vl")]
14470#[cfg_attr(test, assert_instr(vcvtph2dq))]
14471#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14472pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14473    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14474}
14475
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: `transmute` only reinterprets the returned i32x8 vector as
    // `__m256i`; both are 256 bits wide.
    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
}
14487
14488/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14489/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14490///
14491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14492#[inline]
14493#[target_feature(enable = "avx512fp16,avx512vl")]
14494#[cfg_attr(test, assert_instr(vcvtph2dq))]
14495#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14496pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14497    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14498}
14499
14500/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14501/// results in dst.
14502///
14503/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14504#[inline]
14505#[target_feature(enable = "avx512fp16")]
14506#[cfg_attr(test, assert_instr(vcvtph2dq))]
14507#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14508pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14509    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14510}
14511
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    unsafe {
        // The 512-bit stub takes an explicit rounding argument; the non-`round`
        // form uses the current MXCSR rounding (`_MM_FROUND_CUR_DIRECTION`).
        // SAFETY: `transmute` only reinterprets i32x16 as `__m512i` (same width).
        transmute(vcvtph2dq_512(
            a,
            src.as_i32x16(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14530
14531/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14532/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14533///
14534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14535#[inline]
14536#[target_feature(enable = "avx512fp16")]
14537#[cfg_attr(test, assert_instr(vcvtph2dq))]
14538#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14539pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14540    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14541}
14542
14543/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14544/// results in dst.
14545///
14546/// Rounding is done according to the rounding parameter, which can be one of:
14547///
14548/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14549/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14550/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14551/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14552/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14553///
14554/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
14555#[inline]
14556#[target_feature(enable = "avx512fp16")]
14557#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14558#[rustc_legacy_const_generics(1)]
14559#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14560pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14561    static_assert_rounding!(ROUNDING);
14562    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14563}
14564
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `transmute` only reinterprets i32x16 as `__m512i` (same width).
        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
    }
}
14592
14593/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14594/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14595///
14596/// Rounding is done according to the rounding parameter, which can be one of:
14597///
14598/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14599/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14600/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14601/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14602/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14603///
14604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14605#[inline]
14606#[target_feature(enable = "avx512fp16")]
14607#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14608#[rustc_legacy_const_generics(2)]
14609#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14610pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14611    static_assert_rounding!(ROUNDING);
14612    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14613}
14614
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2si))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
    // SAFETY: delegates to the `vcvtsh2si` stub; `_MM_FROUND_CUR_DIRECTION`
    // selects the current MXCSR rounding mode.
    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
}
14626
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
/// the result in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
    unsafe {
        // Compile-time check that ROUNDING is one of the valid embedded-rounding values.
        static_assert_rounding!(ROUNDING);
        vcvtsh2si32(a, ROUNDING)
    }
}
14650
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
}
14662
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: `transmute` only reinterprets the returned u32x4 vector as
    // `__m128i`; both are 128 bits wide.
    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
}
14674
14675/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14676/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14677///
14678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14679#[inline]
14680#[target_feature(enable = "avx512fp16,avx512vl")]
14681#[cfg_attr(test, assert_instr(vcvtph2udq))]
14682#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14683pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14684    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14685}
14686
14687/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14688/// the results in dst.
14689///
14690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14691#[inline]
14692#[target_feature(enable = "avx512fp16,avx512vl")]
14693#[cfg_attr(test, assert_instr(vcvtph2udq))]
14694#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14695pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14696    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14697}
14698
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: `transmute` only reinterprets the returned u32x8 vector as
    // `__m256i`; both are 256 bits wide.
    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
}
14710
14711/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14712/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14713///
14714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14715#[inline]
14716#[target_feature(enable = "avx512fp16,avx512vl")]
14717#[cfg_attr(test, assert_instr(vcvtph2udq))]
14718#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14719pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14720    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14721}
14722
14723/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14724/// the results in dst.
14725///
14726/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
14727#[inline]
14728#[target_feature(enable = "avx512fp16")]
14729#[cfg_attr(test, assert_instr(vcvtph2udq))]
14730#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14731pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14732    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14733}
14734
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    unsafe {
        // The 512-bit stub takes an explicit rounding argument; the non-`round`
        // form uses the current MXCSR rounding (`_MM_FROUND_CUR_DIRECTION`).
        // SAFETY: `transmute` only reinterprets u32x16 as `__m512i` (same width).
        transmute(vcvtph2udq_512(
            a,
            src.as_u32x16(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14753
14754/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14755/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14756///
14757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14758#[inline]
14759#[target_feature(enable = "avx512fp16")]
14760#[cfg_attr(test, assert_instr(vcvtph2udq))]
14761#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14762pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14763    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14764}
14765
14766/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14767/// the results in dst.
14768///
14769/// Rounding is done according to the rounding parameter, which can be one of:
14770///
14771/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14772/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14773/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14774/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14775/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14776///
14777/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14778#[inline]
14779#[target_feature(enable = "avx512fp16")]
14780#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14781#[rustc_legacy_const_generics(1)]
14782#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14783pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14784    static_assert_rounding!(ROUNDING);
14785    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14786}
14787
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: `transmute` only reinterprets u32x16 as `__m512i` (same width).
        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
    }
}
14815
14816/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14817/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14818///
14819/// Rounding is done according to the rounding parameter, which can be one of:
14820///
14821/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14822/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14823/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14824/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14825/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14826///
14827/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
14828#[inline]
14829#[target_feature(enable = "avx512fp16")]
14830#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14831#[rustc_legacy_const_generics(2)]
14832#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14833pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14834    static_assert_rounding!(ROUNDING);
14835    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14836}
14837
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2usi))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
    // SAFETY: delegates to the `vcvtsh2usi` stub; `_MM_FROUND_CUR_DIRECTION`
    // selects the current MXCSR rounding mode.
    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
}
14849
14850/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14851/// the result in dst.
14852///
14853/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14854///
14855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
14856#[inline]
14857#[target_feature(enable = "avx512fp16")]
14858#[cfg_attr(test, assert_instr(vcvtsh2usi, SAE = 8))]
14859#[rustc_legacy_const_generics(1)]
14860#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14861pub fn _mm_cvt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
14862    unsafe {
14863        static_assert_rounding!(SAE);
14864        vcvtsh2usi32(a, SAE)
14865    }
14866}
14867
14868/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14869/// store the results in dst.
14870///
14871/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
14872#[inline]
14873#[target_feature(enable = "avx512fp16,avx512vl")]
14874#[cfg_attr(test, assert_instr(vcvttph2dq))]
14875#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14876pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
14877    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
14878}
14879
14880/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14881/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14882///
14883/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
14884#[inline]
14885#[target_feature(enable = "avx512fp16,avx512vl")]
14886#[cfg_attr(test, assert_instr(vcvttph2dq))]
14887#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14888pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14889    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
14890}
14891
14892/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14893/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14894///
14895/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
14896#[inline]
14897#[target_feature(enable = "avx512fp16,avx512vl")]
14898#[cfg_attr(test, assert_instr(vcvttph2dq))]
14899#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14900pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14901    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
14902}
14903
14904/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14905/// store the results in dst.
14906///
14907/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
14908#[inline]
14909#[target_feature(enable = "avx512fp16,avx512vl")]
14910#[cfg_attr(test, assert_instr(vcvttph2dq))]
14911#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14912pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
14913    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
14914}
14915
14916/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14917/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14918///
14919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
14920#[inline]
14921#[target_feature(enable = "avx512fp16,avx512vl")]
14922#[cfg_attr(test, assert_instr(vcvttph2dq))]
14923#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14924pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14925    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
14926}
14927
14928/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14929/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14930///
14931/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
14932#[inline]
14933#[target_feature(enable = "avx512fp16,avx512vl")]
14934#[cfg_attr(test, assert_instr(vcvttph2dq))]
14935#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14936pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14937    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
14938}
14939
14940/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14941/// store the results in dst.
14942///
14943/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
14944#[inline]
14945#[target_feature(enable = "avx512fp16")]
14946#[cfg_attr(test, assert_instr(vcvttph2dq))]
14947#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14948pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
14949    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14950}
14951
14952/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14953/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14954///
14955/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
14956#[inline]
14957#[target_feature(enable = "avx512fp16")]
14958#[cfg_attr(test, assert_instr(vcvttph2dq))]
14959#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14960pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14961    unsafe {
14962        transmute(vcvttph2dq_512(
14963            a,
14964            src.as_i32x16(),
14965            k,
14966            _MM_FROUND_CUR_DIRECTION,
14967        ))
14968    }
14969}
14970
14971/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14972/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14973///
14974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
14975#[inline]
14976#[target_feature(enable = "avx512fp16")]
14977#[cfg_attr(test, assert_instr(vcvttph2dq))]
14978#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14979pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14980    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
14981}
14982
14983/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14984/// store the results in dst.
14985///
14986/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14987///
14988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
14989#[inline]
14990#[target_feature(enable = "avx512fp16")]
14991#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14992#[rustc_legacy_const_generics(1)]
14993#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
14994pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
14995    static_assert_sae!(SAE);
14996    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14997}
14998
14999/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
15000/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15001///
15002/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15003///
15004/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
15005#[inline]
15006#[target_feature(enable = "avx512fp16")]
15007#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
15008#[rustc_legacy_const_generics(3)]
15009#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15010pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
15011    src: __m512i,
15012    k: __mmask16,
15013    a: __m256h,
15014) -> __m512i {
15015    unsafe {
15016        static_assert_sae!(SAE);
15017        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
15018    }
15019}
15020
15021/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
15022/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15023///
15024/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15025///
15026/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
15027#[inline]
15028#[target_feature(enable = "avx512fp16")]
15029#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
15030#[rustc_legacy_const_generics(2)]
15031#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15032pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
15033    static_assert_sae!(SAE);
15034    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
15035}
15036
15037/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
15038/// the result in dst.
15039///
15040/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
15041#[inline]
15042#[target_feature(enable = "avx512fp16")]
15043#[cfg_attr(test, assert_instr(vcvttsh2si))]
15044#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15045pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
15046    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
15047}
15048
15049/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
15050/// the result in dst.
15051///
15052/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15053///
15054/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
15055#[inline]
15056#[target_feature(enable = "avx512fp16")]
15057#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
15058#[rustc_legacy_const_generics(1)]
15059#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15060pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
15061    unsafe {
15062        static_assert_sae!(SAE);
15063        vcvttsh2si32(a, SAE)
15064    }
15065}
15066
15067/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15068/// store the results in dst.
15069///
15070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
15071#[inline]
15072#[target_feature(enable = "avx512fp16,avx512vl")]
15073#[cfg_attr(test, assert_instr(vcvttph2udq))]
15074#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15075pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
15076    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
15077}
15078
15079/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15080/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15081///
15082/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
15083#[inline]
15084#[target_feature(enable = "avx512fp16,avx512vl")]
15085#[cfg_attr(test, assert_instr(vcvttph2udq))]
15086#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15087pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15088    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
15089}
15090
15091/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15092/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15093///
15094/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
15095#[inline]
15096#[target_feature(enable = "avx512fp16,avx512vl")]
15097#[cfg_attr(test, assert_instr(vcvttph2udq))]
15098#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15099pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
15100    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
15101}
15102
15103/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15104/// store the results in dst.
15105///
15106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
15107#[inline]
15108#[target_feature(enable = "avx512fp16,avx512vl")]
15109#[cfg_attr(test, assert_instr(vcvttph2udq))]
15110#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15111pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
15112    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
15113}
15114
15115/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15116/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15117///
15118/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
15119#[inline]
15120#[target_feature(enable = "avx512fp16,avx512vl")]
15121#[cfg_attr(test, assert_instr(vcvttph2udq))]
15122#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15123pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15124    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
15125}
15126
15127/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15128/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15129///
15130/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
15131#[inline]
15132#[target_feature(enable = "avx512fp16,avx512vl")]
15133#[cfg_attr(test, assert_instr(vcvttph2udq))]
15134#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15135pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
15136    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
15137}
15138
15139/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15140/// store the results in dst.
15141///
15142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
15143#[inline]
15144#[target_feature(enable = "avx512fp16")]
15145#[cfg_attr(test, assert_instr(vcvttph2udq))]
15146#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15147pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
15148    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
15149}
15150
15151/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15152/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15153///
15154/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
15155#[inline]
15156#[target_feature(enable = "avx512fp16")]
15157#[cfg_attr(test, assert_instr(vcvttph2udq))]
15158#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15159pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
15160    unsafe {
15161        transmute(vcvttph2udq_512(
15162            a,
15163            src.as_u32x16(),
15164            k,
15165            _MM_FROUND_CUR_DIRECTION,
15166        ))
15167    }
15168}
15169
15170/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15171/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15172///
15173/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
15174#[inline]
15175#[target_feature(enable = "avx512fp16")]
15176#[cfg_attr(test, assert_instr(vcvttph2udq))]
15177#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15178pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
15179    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
15180}
15181
15182/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15183/// store the results in dst.
15184///
15185/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15186///
15187/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
15188#[inline]
15189#[target_feature(enable = "avx512fp16")]
15190#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
15191#[rustc_legacy_const_generics(1)]
15192#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15193pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
15194    static_assert_sae!(SAE);
15195    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
15196}
15197
15198/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15199/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15200///
15201/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15202///
15203/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
15204#[inline]
15205#[target_feature(enable = "avx512fp16")]
15206#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
15207#[rustc_legacy_const_generics(3)]
15208#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15209pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
15210    src: __m512i,
15211    k: __mmask16,
15212    a: __m256h,
15213) -> __m512i {
15214    unsafe {
15215        static_assert_sae!(SAE);
15216        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
15217    }
15218}
15219
15220/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15221/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15222///
15223/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15224///
15225/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
15226#[inline]
15227#[target_feature(enable = "avx512fp16")]
15228#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
15229#[rustc_legacy_const_generics(2)]
15230#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15231pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
15232    static_assert_sae!(SAE);
15233    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
15234}
15235
15236/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
15237/// the result in dst.
15238///
15239/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
15240#[inline]
15241#[target_feature(enable = "avx512fp16")]
15242#[cfg_attr(test, assert_instr(vcvttsh2usi))]
15243#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15244pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
15245    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
15246}
15247
15248/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
15249/// the result in dst.
15250///
15251/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15252///
15253/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
15254#[inline]
15255#[target_feature(enable = "avx512fp16")]
15256#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
15257#[rustc_legacy_const_generics(1)]
15258#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15259pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
15260    unsafe {
15261        static_assert_sae!(SAE);
15262        vcvttsh2usi32(a, SAE)
15263    }
15264}
15265
15266/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15267/// store the results in dst.
15268///
15269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
15270#[inline]
15271#[target_feature(enable = "avx512fp16,avx512vl")]
15272#[cfg_attr(test, assert_instr(vcvtph2qq))]
15273#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15274pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
15275    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
15276}
15277
15278/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15279/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15280///
15281/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
15282#[inline]
15283#[target_feature(enable = "avx512fp16,avx512vl")]
15284#[cfg_attr(test, assert_instr(vcvtph2qq))]
15285#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15286pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15287    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
15288}
15289
15290/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15291/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15292///
15293/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
15294#[inline]
15295#[target_feature(enable = "avx512fp16,avx512vl")]
15296#[cfg_attr(test, assert_instr(vcvtph2qq))]
15297#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15298pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15299    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
15300}
15301
15302/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15303/// store the results in dst.
15304///
15305/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
15306#[inline]
15307#[target_feature(enable = "avx512fp16,avx512vl")]
15308#[cfg_attr(test, assert_instr(vcvtph2qq))]
15309#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15310pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
15311    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
15312}
15313
15314/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15315/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15316///
15317/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
15318#[inline]
15319#[target_feature(enable = "avx512fp16,avx512vl")]
15320#[cfg_attr(test, assert_instr(vcvtph2qq))]
15321#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15322pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15323    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
15324}
15325
15326/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15327/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15328///
15329/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
15330#[inline]
15331#[target_feature(enable = "avx512fp16,avx512vl")]
15332#[cfg_attr(test, assert_instr(vcvtph2qq))]
15333#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15334pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15335    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
15336}
15337
15338/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15339/// store the results in dst.
15340///
15341/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
15342#[inline]
15343#[target_feature(enable = "avx512fp16")]
15344#[cfg_attr(test, assert_instr(vcvtph2qq))]
15345#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15346pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
15347    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
15348}
15349
15350/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15351/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15352///
15353/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
15354#[inline]
15355#[target_feature(enable = "avx512fp16")]
15356#[cfg_attr(test, assert_instr(vcvtph2qq))]
15357#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15358pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15359    unsafe {
15360        transmute(vcvtph2qq_512(
15361            a,
15362            src.as_i64x8(),
15363            k,
15364            _MM_FROUND_CUR_DIRECTION,
15365        ))
15366    }
15367}
15368
15369/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15370/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15371///
15372/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
15373#[inline]
15374#[target_feature(enable = "avx512fp16")]
15375#[cfg_attr(test, assert_instr(vcvtph2qq))]
15376#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15377pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15378    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
15379}
15380
15381/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15382/// store the results in dst.
15383///
15384/// Rounding is done according to the rounding parameter, which can be one of:
15385///
15386/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15387/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15388/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15389/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15390/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15391///
15392/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
15393#[inline]
15394#[target_feature(enable = "avx512fp16")]
15395#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15396#[rustc_legacy_const_generics(1)]
15397#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15398pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15399    static_assert_rounding!(ROUNDING);
15400    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15401}
15402
15403/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15404/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15405///
15406/// Rounding is done according to the rounding parameter, which can be one of:
15407///
15408/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15409/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15410/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15411/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15412/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15413///
15414/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
15415#[inline]
15416#[target_feature(enable = "avx512fp16")]
15417#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15418#[rustc_legacy_const_generics(3)]
15419#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
15420pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
15421    src: __m512i,
15422    k: __mmask8,
15423    a: __m128h,
15424) -> __m512i {
15425    unsafe {
15426        static_assert_rounding!(ROUNDING);
15427        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
15428    }
15429}
15430
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
}
15452
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
}
15464
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
}
15476
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
}
15488
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
}
15500
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
}
15512
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
}
15524
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
}
15536
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    unsafe {
        // _MM_FROUND_CUR_DIRECTION: round according to the current MXCSR.RC mode;
        // lanes with a clear bit in `k` keep the corresponding element of `src`.
        transmute(vcvtph2uqq_512(
            a,
            src.as_u64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15555
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
}
15567
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
    // Compile-time check that ROUNDING is a valid rounding-control immediate.
    static_assert_rounding!(ROUNDING);
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
}
15589
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    unsafe {
        // Compile-time check that ROUNDING is a valid rounding-control immediate.
        static_assert_rounding!(ROUNDING);
        // Lanes with a clear bit in `k` keep the corresponding element of `src`.
        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
    }
}
15617
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
}
15639
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
}
15651
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
}
15663
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
}
15675
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
}
15687
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
}
15699
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
}
15711
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
}
15723
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    unsafe {
        // Truncating conversion; the rounding argument only controls exception
        // suppression here (_MM_FROUND_CUR_DIRECTION = default exception behavior).
        transmute(vcvttph2qq_512(
            a,
            src.as_i64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15742
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
}
15754
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
    // Compile-time check that SAE is a valid suppress-all-exceptions immediate.
    static_assert_sae!(SAE);
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
}
15770
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    unsafe {
        // Compile-time check that SAE is a valid suppress-all-exceptions immediate.
        static_assert_sae!(SAE);
        // Lanes with a clear bit in `k` keep the corresponding element of `src`.
        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
    }
}
15792
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
    static_assert_sae!(SAE);
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
}
15808
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
}
15820
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
}
15832
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
}
15844
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
}
15856
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
}
15868
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
}
15880
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
}
15892
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    unsafe {
        // Truncating conversion; the rounding argument only controls exception
        // suppression here (_MM_FROUND_CUR_DIRECTION = default exception behavior).
        transmute(vcvttph2uqq_512(
            a,
            src.as_u64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15911
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
}
15923
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
    // Compile-time check that SAE is a valid suppress-all-exceptions immediate.
    static_assert_sae!(SAE);
    // All-ones mask: every lane is written, so the undefined source is never observed.
    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
}
15939
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    unsafe {
        // Compile-time check that SAE is a valid suppress-all-exceptions immediate.
        static_assert_sae!(SAE);
        // Lanes with a clear bit in `k` keep the corresponding element of `src`.
        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
    }
}
15961
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
    static_assert_sae!(SAE);
    // Zero source: lanes with a clear bit in `k` become 0.
    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
}
15977
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
    // All-ones mask: every lane is written, so the zero source is never observed.
    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
}
15989
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { vcvtph2psx_128(a, src, k) }
}
16002
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
    // Zero source: lanes with a clear bit in `k` become 0.0.
    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
}
16015
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
    // All-ones mask: every lane is written, so the zero source is never observed.
    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
}
16027
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
    // Lanes with a clear bit in `k` keep the corresponding element of `src`.
    unsafe { vcvtph2psx_256(a, src, k) }
}
16040
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
    // Zero-masking is the write-masked form with an all-zero src vector.
    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
}
16053
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
    // All-ones mask (0xffff covers all 16 f32 lanes) writes every element,
    // so the zeroed src is never observed.
    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
}
16065
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
    // The 512-bit LLVM intrinsic also takes an SAE operand; pass
    // _MM_FROUND_CUR_DIRECTION for the non-rounding-suffixed intrinsic.
    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16078
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
    // Zero-masking is the write-masked form with an all-zero src vector.
    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
}
16091
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // All-ones mask (0xffff) makes this an unmasked conversion.
    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
}
16107
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
    src: __m512,
    k: __mmask16,
    a: __m256h,
) -> __m512 {
    unsafe {
        // Compile-time check that SAE is a valid suppress-all-exceptions value.
        static_assert_sae!(SAE);
        vcvtph2psx_512(a, src, k, SAE)
    }
}
16130
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // Zero-masking is the write-masked form with an all-zero src vector.
    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
}
16147
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
    // With an all-ones mask the src argument is never read, so `a` doubles
    // as a harmless placeholder for it.
    _mm_mask_cvtsh_ss(a, 0xff, a, b)
}
16160
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    // Only mask bit 0 matters for a scalar conversion; SAE operand is the
    // current rounding direction for the non-round-suffixed intrinsic.
    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16174
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    // Zero-masking: src needs only lane 0 zeroed, hence _mm_set_ss(0.0).
    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
}
16188
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
/// from a to the upper elements of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // All-ones mask: src is never read, so an undefined vector is fine.
    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
}
16205
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
/// upper elements of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128h,
) -> __m128 {
    unsafe {
        // Compile-time check that SAE is a valid suppress-all-exceptions value.
        static_assert_sae!(SAE);
        vcvtsh2ss(a, b, src, k, SAE)
    }
}
16230
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
/// of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // Zero-masking: src needs only lane 0 zeroed, hence _mm_set_ss(0.0).
    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
}
16248
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
    // Only the low 2 mask bits matter for 2 f64 lanes; 0xff is all-ones,
    // so every lane is written and the zeroed src is never observed.
    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
}
16260
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
    // SAFETY: #[target_feature] guarantees avx512fp16+avx512vl are enabled
    // for the LLVM vcvtph2pd.128 intrinsic call.
    unsafe { vcvtph2pd_128(a, src, k) }
}
16273
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
    // Zero-masking is the write-masked form with an all-zero src vector.
    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
}
16286
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
    // All-ones mask (0xff covers all 4 f64 lanes) writes every element,
    // so the zeroed src is never observed.
    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
}
16298
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
    // SAFETY: #[target_feature] guarantees avx512fp16+avx512vl are enabled
    // for the LLVM vcvtph2pd.256 intrinsic call.
    unsafe { vcvtph2pd_256(a, src, k) }
}
16311
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
    // Zero-masking is the write-masked form with an all-zero src vector.
    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
}
16324
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
    // All-ones mask (0xff covers all 8 f64 lanes) writes every element,
    // so the zeroed src is never observed.
    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
}
16336
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
    // The 512-bit LLVM intrinsic also takes an SAE operand; pass
    // _MM_FROUND_CUR_DIRECTION for the non-rounding-suffixed intrinsic.
    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16349
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
    // Zero-masking is the write-masked form with an all-zero src vector.
    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
}
16362
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // All-ones mask (0xff) makes this an unmasked conversion.
    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
}
16378
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m128h,
) -> __m512d {
    unsafe {
        // Compile-time check that SAE is a valid suppress-all-exceptions value.
        static_assert_sae!(SAE);
        vcvtph2pd_512(a, src, k, SAE)
    }
}
16401
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // Zero-masking is the write-masked form with an all-zero src vector.
    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
}
16418
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper element
/// from a to the upper element of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
    // With an all-ones mask the src argument is never read, so `a` doubles
    // as a harmless placeholder for it.
    _mm_mask_cvtsh_sd(a, 0xff, a, b)
}
16431
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
    // Only mask bit 0 matters for a scalar conversion; SAE operand is the
    // current rounding direction for the non-round-suffixed intrinsic.
    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16445
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd))]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
    // Zero-masking: src needs only lane 0 zeroed, hence _mm_set_sd(0.0).
    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
}
16458
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
/// to the upper element of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // All-ones mask: src is never read, so `a` serves as a placeholder.
    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
}
16475
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
/// of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128h,
) -> __m128d {
    unsafe {
        // Compile-time check that SAE is a valid suppress-all-exceptions value.
        static_assert_sae!(SAE);
        vcvtsh2sd(a, b, src, k, SAE)
    }
}
16500
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
    // Compile-time check that SAE is a valid suppress-all-exceptions value.
    static_assert_sae!(SAE);
    // Zero-masking: src needs only lane 0 zeroed, hence _mm_set_sd(0.0).
    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
}
16517
/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsh_h(a: __m128h) -> f16 {
    // SAFETY: lane 0 is always in bounds for an 8-lane __m128h.
    unsafe { simd_extract!(a, 0) }
}
16528
/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsh_h(a: __m256h) -> f16 {
    // SAFETY: lane 0 is always in bounds for a 16-lane __m256h.
    unsafe { simd_extract!(a, 0) }
}
16539
/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_cvtsh_h(a: __m512h) -> f16 {
    // SAFETY: lane 0 is always in bounds for a 32-lane __m512h.
    unsafe { simd_extract!(a, 0) }
}
16550
/// Copy the lower 16-bit integer in a to dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
    // Reinterpret the 128-bit vector as 8 x i16 and take lane 0.
    // SAFETY: lane 0 is always in bounds for an i16x8.
    unsafe { simd_extract!(a.as_i16x8(), 0) }
}
16561
/// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[stable(feature = "stdarch_x86_avx512fp16", since = "1.94.0")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi16_si128(a: i16) -> __m128i {
    // Insert `a` into lane 0 of an all-zero i16x8, so the upper 7 lanes
    // stay zero; then reinterpret the result as __m128i.
    // SAFETY: lane 0 is in bounds and both types are 128 bits wide.
    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
}
16572
16573#[allow(improper_ctypes)]
16574unsafe extern "C" {
16575    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
16576    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
16577    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
16578    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
16579
16580    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16581    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16582    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16583    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16584    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16585    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16586    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16587    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16588
16589    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16590    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16591    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16592    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16593    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16594    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16595    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16596    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16597
16598    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16599    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16600    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16601    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16602    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16603    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16604    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16605    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16606
16607    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16608    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16609    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16610    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16611    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16612    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16613    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16614    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16615
16616    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16617    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16618    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16619    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16620    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16621    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16622    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16623    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16624    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16625    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16626    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16627    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16628    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16629    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16630    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16631    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16632
16633    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16634    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16635    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16636    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16637    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16638    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16639    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16640    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16641    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16642    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16643    -> __m512;
16644    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16645    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16646    -> __m512;
16647    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16648    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16649    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16650    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16651
16652    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16653    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16654    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16655    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16656
16657    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16658    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16659
16660    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
16661    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16662    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
16663    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16664    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
16665    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16666    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
16667    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16668
16669    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
16670    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16671    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
16672    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16673    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
16674    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16675    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
16676    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16677
16678    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
16679    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
16680    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
16681    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16682
16683    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
16684    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
16685    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
16686    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
16687    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
16688    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16689    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
16690    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16691
16692    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
16693    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
16694    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
16695    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
16696    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
16697    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16698    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
16699    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16700
16701    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
16702    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16703    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
16704    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16705    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
16706    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16707    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
16708    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16709
16710    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
16711    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16712    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
16713    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16714    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
16715    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16716    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
16717    fn vgetmantsh(
16718        a: __m128h,
16719        b: __m128h,
16720        imm8: i32,
16721        src: __m128h,
16722        k: __mmask8,
16723        sae: i32,
16724    ) -> __m128h;
16725
16726    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
16727    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16728    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
16729    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16730    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
16731    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16732    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
16733    fn vrndscalesh(
16734        a: __m128h,
16735        b: __m128h,
16736        src: __m128h,
16737        k: __mmask8,
16738        imm8: i32,
16739        sae: i32,
16740    ) -> __m128h;
16741
16742    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
16743    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16744    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
16745    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16746    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
16747    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
16748    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
16749    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16750
16751    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
16752    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16753    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
16754    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16755    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
16756    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16757    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
16758    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
16759    -> __m128h;
16760
16761    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
16762    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16763
16764    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
16765    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
16766    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
16767    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
16768    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
16769    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
16770    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"]
16771    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
16772    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"]
16773    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
16774    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"]
16775    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;
16776
16777    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
16778    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
16779    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
16780    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
16781    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
16782    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
16783    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
16784    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
16785    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
16786    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
16787    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"]
16788    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
16789    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"]
16790    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
16791    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
16792    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;
16793
16794    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
16795    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
16796    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
16797    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
16798    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
16799    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
16800    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
16801    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
16802    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
16803    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
16804    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"]
16805    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16806
16807    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
16808    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
16809    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
16810    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
16811    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
16812    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
16813    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
16814    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16815
16816    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
16817    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
16818    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
16819    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
16820    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
16821    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16822    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
16823    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16824
16825    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
16826    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16827    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
16828    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16829    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
16830    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
16831    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
16832    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16833    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
16834    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16835    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
16836    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16837
16838    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
16839    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16840    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
16841    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16842    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
16843    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
16844    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
16845    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16846    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
16847    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16848    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
16849    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16850
16851    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
16852    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16853    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
16854    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16855    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
16856    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
16857    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
16858    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
16859    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
16860    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16861    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
16862    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16863    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
16864    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
16865    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
16866    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;
16867
16868    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
16869    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16870    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
16871    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16872    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
16873    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
16874    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
16875    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
16876    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
16877    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16878    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
16879    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16880    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
16881    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
16882    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
16883    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16884
16885    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
16886    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16887    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
16888    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16889    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
16890    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
16891    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
16892    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16893    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
16894    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16895    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
16896    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;
16897
16898    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
16899    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16900    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
16901    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16902    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
16903    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
16904    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
16905    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16906    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
16907    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16908    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
16909    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;
16910
16911    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
16912    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
16913    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
16914    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
16915    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
16916    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
16917    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
16918    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;
16919
16920    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
16921    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
16922    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
16923    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
16924    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
16925    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
16926    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
16927    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16928
16929}
16930
16931#[cfg(test)]
16932mod tests {
16933    use crate::core_arch::assert_eq_const as assert_eq;
16934    use crate::core_arch::x86::*;
16935    use crate::ptr::{addr_of, addr_of_mut};
16936    use stdarch_test::simd_test;
16937
16938    #[target_feature(enable = "avx512fp16")]
16939    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
16940    const fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
16941        _mm_setr_ph(re, im, re, im, re, im, re, im)
16942    }
16943
16944    #[target_feature(enable = "avx512fp16")]
16945    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
16946    const fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
16947        _mm256_setr_ph(
16948            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16949        )
16950    }
16951
16952    #[target_feature(enable = "avx512fp16")]
16953    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
16954    const fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
16955        _mm512_setr_ph(
16956            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16957            re, im, re, im, re, im, re, im, re, im,
16958        )
16959    }
16960
16961    #[simd_test(enable = "avx512fp16,avx512vl")]
16962    const fn test_mm_set_ph() {
16963        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16964        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16965        assert_eq_m128h(r, e);
16966    }
16967
16968    #[simd_test(enable = "avx512fp16,avx512vl")]
16969    const fn test_mm256_set_ph() {
16970        let r = _mm256_set_ph(
16971            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16972        );
16973        let e = _mm256_setr_ph(
16974            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16975        );
16976        assert_eq_m256h(r, e);
16977    }
16978
16979    #[simd_test(enable = "avx512fp16")]
16980    const fn test_mm512_set_ph() {
16981        let r = _mm512_set_ph(
16982            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16983            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16984            31.0, 32.0,
16985        );
16986        let e = _mm512_setr_ph(
16987            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16988            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16989            3.0, 2.0, 1.0,
16990        );
16991        assert_eq_m512h(r, e);
16992    }
16993
16994    #[simd_test(enable = "avx512fp16,avx512vl")]
16995    const fn test_mm_set_sh() {
16996        let r = _mm_set_sh(1.0);
16997        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16998        assert_eq_m128h(r, e);
16999    }
17000
17001    #[simd_test(enable = "avx512fp16,avx512vl")]
17002    const fn test_mm_set1_ph() {
17003        let r = _mm_set1_ph(1.0);
17004        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
17005        assert_eq_m128h(r, e);
17006    }
17007
17008    #[simd_test(enable = "avx512fp16,avx512vl")]
17009    const fn test_mm256_set1_ph() {
17010        let r = _mm256_set1_ph(1.0);
17011        let e = _mm256_set_ph(
17012            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
17013        );
17014        assert_eq_m256h(r, e);
17015    }
17016
17017    #[simd_test(enable = "avx512fp16")]
17018    const fn test_mm512_set1_ph() {
17019        let r = _mm512_set1_ph(1.0);
17020        let e = _mm512_set_ph(
17021            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
17022            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
17023        );
17024        assert_eq_m512h(r, e);
17025    }
17026
17027    #[simd_test(enable = "avx512fp16,avx512vl")]
17028    const fn test_mm_setr_ph() {
17029        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17030        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17031        assert_eq_m128h(r, e);
17032    }
17033
17034    #[simd_test(enable = "avx512fp16,avx512vl")]
17035    const fn test_mm256_setr_ph() {
17036        let r = _mm256_setr_ph(
17037            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17038        );
17039        let e = _mm256_set_ph(
17040            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17041        );
17042        assert_eq_m256h(r, e);
17043    }
17044
17045    #[simd_test(enable = "avx512fp16")]
17046    const fn test_mm512_setr_ph() {
17047        let r = _mm512_setr_ph(
17048            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17049            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17050            31.0, 32.0,
17051        );
17052        let e = _mm512_set_ph(
17053            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17054            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17055            3.0, 2.0, 1.0,
17056        );
17057        assert_eq_m512h(r, e);
17058    }
17059
17060    #[simd_test(enable = "avx512fp16,avx512vl")]
17061    const fn test_mm_setzero_ph() {
17062        let r = _mm_setzero_ph();
17063        let e = _mm_set1_ph(0.0);
17064        assert_eq_m128h(r, e);
17065    }
17066
17067    #[simd_test(enable = "avx512fp16,avx512vl")]
17068    const fn test_mm256_setzero_ph() {
17069        let r = _mm256_setzero_ph();
17070        let e = _mm256_set1_ph(0.0);
17071        assert_eq_m256h(r, e);
17072    }
17073
17074    #[simd_test(enable = "avx512fp16")]
17075    const fn test_mm512_setzero_ph() {
17076        let r = _mm512_setzero_ph();
17077        let e = _mm512_set1_ph(0.0);
17078        assert_eq_m512h(r, e);
17079    }
17080
17081    #[simd_test(enable = "avx512fp16,avx512vl")]
17082    const fn test_mm_castsi128_ph() {
17083        let a = _mm_set1_epi16(0x3c00);
17084        let r = _mm_castsi128_ph(a);
17085        let e = _mm_set1_ph(1.0);
17086        assert_eq_m128h(r, e);
17087    }
17088
17089    #[simd_test(enable = "avx512fp16,avx512vl")]
17090    const fn test_mm256_castsi256_ph() {
17091        let a = _mm256_set1_epi16(0x3c00);
17092        let r = _mm256_castsi256_ph(a);
17093        let e = _mm256_set1_ph(1.0);
17094        assert_eq_m256h(r, e);
17095    }
17096
17097    #[simd_test(enable = "avx512fp16")]
17098    const fn test_mm512_castsi512_ph() {
17099        let a = _mm512_set1_epi16(0x3c00);
17100        let r = _mm512_castsi512_ph(a);
17101        let e = _mm512_set1_ph(1.0);
17102        assert_eq_m512h(r, e);
17103    }
17104
17105    #[simd_test(enable = "avx512fp16")]
17106    const fn test_mm_castph_si128() {
17107        let a = _mm_set1_ph(1.0);
17108        let r = _mm_castph_si128(a);
17109        let e = _mm_set1_epi16(0x3c00);
17110        assert_eq_m128i(r, e);
17111    }
17112
17113    #[simd_test(enable = "avx512fp16")]
17114    const fn test_mm256_castph_si256() {
17115        let a = _mm256_set1_ph(1.0);
17116        let r = _mm256_castph_si256(a);
17117        let e = _mm256_set1_epi16(0x3c00);
17118        assert_eq_m256i(r, e);
17119    }
17120
17121    #[simd_test(enable = "avx512fp16")]
17122    const fn test_mm512_castph_si512() {
17123        let a = _mm512_set1_ph(1.0);
17124        let r = _mm512_castph_si512(a);
17125        let e = _mm512_set1_epi16(0x3c00);
17126        assert_eq_m512i(r, e);
17127    }
17128
17129    #[simd_test(enable = "avx512fp16,avx512vl")]
17130    const fn test_mm_castps_ph() {
17131        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
17132        let r = _mm_castps_ph(a);
17133        let e = _mm_set1_ph(1.0);
17134        assert_eq_m128h(r, e);
17135    }
17136
17137    #[simd_test(enable = "avx512fp16,avx512vl")]
17138    const fn test_mm256_castps_ph() {
17139        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
17140        let r = _mm256_castps_ph(a);
17141        let e = _mm256_set1_ph(1.0);
17142        assert_eq_m256h(r, e);
17143    }
17144
17145    #[simd_test(enable = "avx512fp16")]
17146    const fn test_mm512_castps_ph() {
17147        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
17148        let r = _mm512_castps_ph(a);
17149        let e = _mm512_set1_ph(1.0);
17150        assert_eq_m512h(r, e);
17151    }
17152
17153    #[simd_test(enable = "avx512fp16")]
17154    const fn test_mm_castph_ps() {
17155        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
17156        let r = _mm_castph_ps(a);
17157        let e = _mm_set1_ps(1.0);
17158        assert_eq_m128(r, e);
17159    }
17160
17161    #[simd_test(enable = "avx512fp16")]
17162    const fn test_mm256_castph_ps() {
17163        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
17164        let r = _mm256_castph_ps(a);
17165        let e = _mm256_set1_ps(1.0);
17166        assert_eq_m256(r, e);
17167    }
17168
17169    #[simd_test(enable = "avx512fp16")]
17170    const fn test_mm512_castph_ps() {
17171        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
17172        let r = _mm512_castph_ps(a);
17173        let e = _mm512_set1_ps(1.0);
17174        assert_eq_m512(r, e);
17175    }
17176
17177    #[simd_test(enable = "avx512fp16,avx512vl")]
17178    const fn test_mm_castpd_ph() {
17179        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
17180        let r = _mm_castpd_ph(a);
17181        let e = _mm_set1_ph(1.0);
17182        assert_eq_m128h(r, e);
17183    }
17184
17185    #[simd_test(enable = "avx512fp16,avx512vl")]
17186    const fn test_mm256_castpd_ph() {
17187        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
17188        let r = _mm256_castpd_ph(a);
17189        let e = _mm256_set1_ph(1.0);
17190        assert_eq_m256h(r, e);
17191    }
17192
17193    #[simd_test(enable = "avx512fp16")]
17194    const fn test_mm512_castpd_ph() {
17195        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
17196        let r = _mm512_castpd_ph(a);
17197        let e = _mm512_set1_ph(1.0);
17198        assert_eq_m512h(r, e);
17199    }
17200
17201    #[simd_test(enable = "avx512fp16")]
17202    const fn test_mm_castph_pd() {
17203        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
17204        let r = _mm_castph_pd(a);
17205        let e = _mm_set1_pd(1.0);
17206        assert_eq_m128d(r, e);
17207    }
17208
17209    #[simd_test(enable = "avx512fp16")]
17210    const fn test_mm256_castph_pd() {
17211        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
17212        let r = _mm256_castph_pd(a);
17213        let e = _mm256_set1_pd(1.0);
17214        assert_eq_m256d(r, e);
17215    }
17216
17217    #[simd_test(enable = "avx512fp16")]
17218    const fn test_mm512_castph_pd() {
17219        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
17220        let r = _mm512_castph_pd(a);
17221        let e = _mm512_set1_pd(1.0);
17222        assert_eq_m512d(r, e);
17223    }
17224
17225    #[simd_test(enable = "avx512fp16,avx512vl")]
17226    const fn test_mm256_castph256_ph128() {
17227        let a = _mm256_setr_ph(
17228            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17229        );
17230        let r = _mm256_castph256_ph128(a);
17231        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17232        assert_eq_m128h(r, e);
17233    }
17234
17235    #[simd_test(enable = "avx512fp16,avx512vl")]
17236    const fn test_mm512_castph512_ph128() {
17237        let a = _mm512_setr_ph(
17238            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
17239            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
17240        );
17241        let r = _mm512_castph512_ph128(a);
17242        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17243        assert_eq_m128h(r, e);
17244    }
17245
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm512_castph512_ph256() {
        // Truncating cast: the result is the low 16 of the 32 input elements.
        let a = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );
        let r = _mm512_castph512_ph256(a);
        let e = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        assert_eq_m256h(r, e);
    }
17258
17259    #[simd_test(enable = "avx512fp16,avx512vl")]
17260    const fn test_mm256_castph128_ph256() {
17261        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17262        let r = _mm256_castph128_ph256(a);
17263        assert_eq_m128h(_mm256_castph256_ph128(r), a);
17264    }
17265
17266    #[simd_test(enable = "avx512fp16,avx512vl")]
17267    const fn test_mm512_castph128_ph512() {
17268        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17269        let r = _mm512_castph128_ph512(a);
17270        assert_eq_m128h(_mm512_castph512_ph128(r), a);
17271    }
17272
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm512_castph256_ph512() {
        // The widening cast leaves the upper half undefined, so only the low
        // 256 bits are checked, via a round-trip back down.
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_castph256_ph512(a);
        assert_eq_m256h(_mm512_castph512_ph256(r), a);
    }
17281
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_zextph128_ph256() {
        // Unlike the plain cast, zext defines the upper 8 elements: they are zero.
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_zextph128_ph256(a);
        let e = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m256h(r, e);
    }
17291
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_zextph128_ph512() {
        // Unlike the plain cast, zext defines the upper 24 elements: they are zero.
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_zextph128_ph512(a);
        let e = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512h(r, e);
    }
17302
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_zextph256_ph512() {
        // Unlike the plain cast, zext defines the upper 16 elements: they are zero.
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_zextph256_ph512(a);
        let e = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512h(r, e);
    }
17315
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cmp_ph_mask() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
        // _mm_set_ph lists element 7 first, so the four equal pairs occupy
        // elements 7..4 and set the high four mask bits: 0b11110000.
        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b11110000);
    }
17323
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cmp_ph_mask() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
        // Only bits set in both the writemask (0b01010101) and the raw
        // comparison result (0b11110000) survive: 0b01010000.
        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
        assert_eq!(r, 0b01010000);
    }
17331
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cmp_ph_mask() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        // Every second group of four in b is negated, so equal/unequal groups
        // of four lanes alternate, giving the 0b1111000011110000 pattern.
        let b = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0,
        );
        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b1111000011110000);
    }
17344
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cmp_ph_mask() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0,
        );
        // Result is the raw comparison (0b1111000011110000) ANDed with the writemask.
        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
        assert_eq!(r, 0b0101000001010000);
    }
17357
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        // Every second group of four in b is negated, so equal/unequal groups
        // of four lanes alternate across all 32 mask bits.
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }
17373
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        // Result is the raw comparison pattern ANDed with the alternating writemask.
        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
        assert_eq!(r, 0b01010000010100000101000001010000);
    }
17389
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cmp_round_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        // Same data as test_mm512_cmp_ph_mask; SAE (_MM_FROUND_NO_EXC) must not
        // change the comparison outcome.
        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }
17405
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cmp_round_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        // Same data as test_mm512_mask_cmp_ph_mask; the comparison result is
        // ANDed with the writemask, and SAE must not change the outcome.
        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        assert_eq!(r, 0b01010000010100000101000001010000);
    }
17425
17426    #[simd_test(enable = "avx512fp16")]
17427    fn test_mm_cmp_round_sh_mask() {
17428        let a = _mm_set_sh(1.0);
17429        let b = _mm_set_sh(1.0);
17430        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17431        assert_eq!(r, 1);
17432    }
17433
17434    #[simd_test(enable = "avx512fp16")]
17435    fn test_mm_mask_cmp_round_sh_mask() {
17436        let a = _mm_set_sh(1.0);
17437        let b = _mm_set_sh(1.0);
17438        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
17439        assert_eq!(r, 0);
17440    }
17441
17442    #[simd_test(enable = "avx512fp16")]
17443    fn test_mm_cmp_sh_mask() {
17444        let a = _mm_set_sh(1.0);
17445        let b = _mm_set_sh(1.0);
17446        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
17447        assert_eq!(r, 1);
17448    }
17449
17450    #[simd_test(enable = "avx512fp16")]
17451    fn test_mm_mask_cmp_sh_mask() {
17452        let a = _mm_set_sh(1.0);
17453        let b = _mm_set_sh(1.0);
17454        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
17455        assert_eq!(r, 0);
17456    }
17457
17458    #[simd_test(enable = "avx512fp16")]
17459    fn test_mm_comi_round_sh() {
17460        let a = _mm_set_sh(1.0);
17461        let b = _mm_set_sh(1.0);
17462        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17463        assert_eq!(r, 1);
17464    }
17465
17466    #[simd_test(enable = "avx512fp16")]
17467    fn test_mm_comi_sh() {
17468        let a = _mm_set_sh(1.0);
17469        let b = _mm_set_sh(1.0);
17470        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17471        assert_eq!(r, 1);
17472    }
17473
17474    #[simd_test(enable = "avx512fp16")]
17475    fn test_mm_comieq_sh() {
17476        let a = _mm_set_sh(1.0);
17477        let b = _mm_set_sh(1.0);
17478        let r = _mm_comieq_sh(a, b);
17479        assert_eq!(r, 1);
17480    }
17481
17482    #[simd_test(enable = "avx512fp16")]
17483    fn test_mm_comige_sh() {
17484        let a = _mm_set_sh(2.0);
17485        let b = _mm_set_sh(1.0);
17486        let r = _mm_comige_sh(a, b);
17487        assert_eq!(r, 1);
17488    }
17489
17490    #[simd_test(enable = "avx512fp16")]
17491    fn test_mm_comigt_sh() {
17492        let a = _mm_set_sh(2.0);
17493        let b = _mm_set_sh(1.0);
17494        let r = _mm_comigt_sh(a, b);
17495        assert_eq!(r, 1);
17496    }
17497
17498    #[simd_test(enable = "avx512fp16")]
17499    fn test_mm_comile_sh() {
17500        let a = _mm_set_sh(1.0);
17501        let b = _mm_set_sh(2.0);
17502        let r = _mm_comile_sh(a, b);
17503        assert_eq!(r, 1);
17504    }
17505
17506    #[simd_test(enable = "avx512fp16")]
17507    fn test_mm_comilt_sh() {
17508        let a = _mm_set_sh(1.0);
17509        let b = _mm_set_sh(2.0);
17510        let r = _mm_comilt_sh(a, b);
17511        assert_eq!(r, 1);
17512    }
17513
17514    #[simd_test(enable = "avx512fp16")]
17515    fn test_mm_comineq_sh() {
17516        let a = _mm_set_sh(1.0);
17517        let b = _mm_set_sh(2.0);
17518        let r = _mm_comineq_sh(a, b);
17519        assert_eq!(r, 1);
17520    }
17521
17522    #[simd_test(enable = "avx512fp16")]
17523    fn test_mm_ucomieq_sh() {
17524        let a = _mm_set_sh(1.0);
17525        let b = _mm_set_sh(1.0);
17526        let r = _mm_ucomieq_sh(a, b);
17527        assert_eq!(r, 1);
17528    }
17529
17530    #[simd_test(enable = "avx512fp16")]
17531    fn test_mm_ucomige_sh() {
17532        let a = _mm_set_sh(2.0);
17533        let b = _mm_set_sh(1.0);
17534        let r = _mm_ucomige_sh(a, b);
17535        assert_eq!(r, 1);
17536    }
17537
17538    #[simd_test(enable = "avx512fp16")]
17539    fn test_mm_ucomigt_sh() {
17540        let a = _mm_set_sh(2.0);
17541        let b = _mm_set_sh(1.0);
17542        let r = _mm_ucomigt_sh(a, b);
17543        assert_eq!(r, 1);
17544    }
17545
17546    #[simd_test(enable = "avx512fp16")]
17547    fn test_mm_ucomile_sh() {
17548        let a = _mm_set_sh(1.0);
17549        let b = _mm_set_sh(2.0);
17550        let r = _mm_ucomile_sh(a, b);
17551        assert_eq!(r, 1);
17552    }
17553
17554    #[simd_test(enable = "avx512fp16")]
17555    fn test_mm_ucomilt_sh() {
17556        let a = _mm_set_sh(1.0);
17557        let b = _mm_set_sh(2.0);
17558        let r = _mm_ucomilt_sh(a, b);
17559        assert_eq!(r, 1);
17560    }
17561
17562    #[simd_test(enable = "avx512fp16")]
17563    fn test_mm_ucomineq_sh() {
17564        let a = _mm_set_sh(1.0);
17565        let b = _mm_set_sh(2.0);
17566        let r = _mm_ucomineq_sh(a, b);
17567        assert_eq!(r, 1);
17568    }
17569
17570    #[simd_test(enable = "avx512fp16,avx512vl")]
17571    const fn test_mm_load_ph() {
17572        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17573        let b = unsafe { _mm_load_ph(addr_of!(a).cast()) };
17574        assert_eq_m128h(a, b);
17575    }
17576
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_load_ph() {
        // Aligned load straight back out of the vector's own storage.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = unsafe { _mm256_load_ph(addr_of!(a).cast()) };
        assert_eq_m256h(a, b);
    }
17585
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_load_ph() {
        // Aligned load straight back out of the vector's own storage.
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = unsafe { _mm512_load_ph(addr_of!(a).cast()) };
        assert_eq_m512h(a, b);
    }
17596
17597    #[simd_test(enable = "avx512fp16,avx512vl")]
17598    const fn test_mm_load_sh() {
17599        let a = _mm_set_sh(1.0);
17600        let b = unsafe { _mm_load_sh(addr_of!(a).cast()) };
17601        assert_eq_m128h(a, b);
17602    }
17603
17604    #[simd_test(enable = "avx512fp16,avx512vl")]
17605    fn test_mm_mask_load_sh() {
17606        let a = _mm_set_sh(1.0);
17607        let src = _mm_set_sh(2.);
17608        let b = unsafe { _mm_mask_load_sh(src, 1, addr_of!(a).cast()) };
17609        assert_eq_m128h(a, b);
17610        let b = unsafe { _mm_mask_load_sh(src, 0, addr_of!(a).cast()) };
17611        assert_eq_m128h(src, b);
17612    }
17613
17614    #[simd_test(enable = "avx512fp16,avx512vl")]
17615    fn test_mm_maskz_load_sh() {
17616        let a = _mm_set_sh(1.0);
17617        let b = unsafe { _mm_maskz_load_sh(1, addr_of!(a).cast()) };
17618        assert_eq_m128h(a, b);
17619        let b = unsafe { _mm_maskz_load_sh(0, addr_of!(a).cast()) };
17620        assert_eq_m128h(_mm_setzero_ph(), b);
17621    }
17622
17623    #[simd_test(enable = "avx512fp16,avx512vl")]
17624    const fn test_mm_loadu_ph() {
17625        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
17626        let r = unsafe { _mm_loadu_ph(array.as_ptr()) };
17627        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17628        assert_eq_m128h(r, e);
17629    }
17630
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_loadu_ph() {
        // Unaligned load from a plain array; element order matches _mm256_setr_ph.
        let array = [
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ];
        let r = unsafe { _mm256_loadu_ph(array.as_ptr()) };
        let e = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
17642
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_loadu_ph() {
        // Unaligned load from a plain array; element order matches _mm512_setr_ph.
        let array = [
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        ];
        let r = unsafe { _mm512_loadu_ph(array.as_ptr()) };
        let e = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
17658
17659    #[simd_test(enable = "avx512fp16,avx512vl")]
17660    const fn test_mm_move_sh() {
17661        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17662        let b = _mm_set_sh(9.0);
17663        let r = _mm_move_sh(a, b);
17664        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
17665        assert_eq_m128h(r, e);
17666    }
17667
17668    #[simd_test(enable = "avx512fp16,avx512vl")]
17669    const fn test_mm_mask_move_sh() {
17670        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17671        let b = _mm_set_sh(9.0);
17672        let src = _mm_set_sh(10.0);
17673        let r = _mm_mask_move_sh(src, 0, a, b);
17674        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
17675        assert_eq_m128h(r, e);
17676    }
17677
17678    #[simd_test(enable = "avx512fp16,avx512vl")]
17679    const fn test_mm_maskz_move_sh() {
17680        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17681        let b = _mm_set_sh(9.0);
17682        let r = _mm_maskz_move_sh(0, a, b);
17683        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
17684        assert_eq_m128h(r, e);
17685    }
17686
17687    #[simd_test(enable = "avx512fp16,avx512vl")]
17688    const fn test_mm_store_ph() {
17689        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17690        let mut b = _mm_setzero_ph();
17691        unsafe {
17692            _mm_store_ph(addr_of_mut!(b).cast(), a);
17693        }
17694        assert_eq_m128h(a, b);
17695    }
17696
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_store_ph() {
        // Aligned store into another vector's storage, then compare.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut b = _mm256_setzero_ph();
        unsafe {
            _mm256_store_ph(addr_of_mut!(b).cast(), a);
        }
        assert_eq_m256h(a, b);
    }
17708
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_store_ph() {
        // Aligned store into another vector's storage, then compare.
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let mut b = _mm512_setzero_ph();
        unsafe {
            _mm512_store_ph(addr_of_mut!(b).cast(), a);
        }
        assert_eq_m512h(a, b);
    }
17722
17723    #[simd_test(enable = "avx512fp16,avx512vl")]
17724    const fn test_mm_store_sh() {
17725        let a = _mm_set_sh(1.0);
17726        let mut b = _mm_setzero_ph();
17727        unsafe {
17728            _mm_store_sh(addr_of_mut!(b).cast(), a);
17729        }
17730        assert_eq_m128h(a, b);
17731    }
17732
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_store_sh() {
        let a = _mm_set_sh(1.0);
        let mut b = _mm_setzero_ph();
        // Mask bit clear: nothing is written, the destination stays zero.
        unsafe {
            _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
        }
        assert_eq_m128h(_mm_setzero_ph(), b);
        // Mask bit set: the low element of a is stored.
        unsafe {
            _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
        }
        assert_eq_m128h(a, b);
    }
17746
17747    #[simd_test(enable = "avx512fp16,avx512vl")]
17748    const fn test_mm_storeu_ph() {
17749        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17750        let mut array = [0.0; 8];
17751        unsafe {
17752            _mm_storeu_ph(array.as_mut_ptr(), a);
17753        }
17754        assert_eq_m128h(a, unsafe { _mm_loadu_ph(array.as_ptr()) });
17755    }
17756
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_storeu_ph() {
        // Unaligned store into a plain array; the round-trip must reproduce a.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut array = [0.0; 16];
        unsafe {
            _mm256_storeu_ph(array.as_mut_ptr(), a);
        }
        assert_eq_m256h(a, unsafe { _mm256_loadu_ph(array.as_ptr()) });
    }
17768
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_storeu_ph() {
        // Unaligned store into a plain array; the round-trip must reproduce a.
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let mut array = [0.0; 32];
        unsafe {
            _mm512_storeu_ph(array.as_mut_ptr(), a);
        }
        assert_eq_m512h(a, unsafe { _mm512_loadu_ph(array.as_ptr()) });
    }
17782
17783    #[simd_test(enable = "avx512fp16,avx512vl")]
17784    const fn test_mm_add_ph() {
17785        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17786        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17787        let r = _mm_add_ph(a, b);
17788        let e = _mm_set1_ph(9.0);
17789        assert_eq_m128h(r, e);
17790    }
17791
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_add_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        // Lanes whose mask bit is set get a + b (always 9 here); the rest keep src.
        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
        assert_eq_m128h(r, e);
    }
17801
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_add_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        // Lanes whose mask bit is set get a + b (always 9 here); the rest are zeroed.
        let r = _mm_maskz_add_ph(0b01010101, a, b);
        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
        assert_eq_m128h(r, e);
    }
17810
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_add_ph() {
        // b is a reversed, so every lane pair sums to 17.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_add_ph(a, b);
        let e = _mm256_set1_ph(17.0);
        assert_eq_m256h(r, e);
    }
17823
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        // Lanes whose mask bit is set get a + b (always 17 here); the rest keep src.
        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
        );
        assert_eq_m256h(r, e);
    }
17841
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        // Lanes whose mask bit is set get a + b (always 17 here); the rest are zeroed.
        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
        );
        assert_eq_m256h(r, e);
    }
17856
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_add_ph() {
        // b is a reversed, so every lane pair sums to 33.
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_add_ph(a, b);
        let e = _mm512_set1_ph(33.0);
        assert_eq_m512h(r, e);
    }
17873
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        // Lanes whose mask bit is set get a + b (always 33 here); the rest keep src.
        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }
17897
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        // Lanes whose mask bit is set get a + b (always 33 here); the rest are zeroed.
        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }
17917
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        // All sums are exact integers, so the explicit rounding mode cannot
        // change the uniform 33.0 result.
        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(33.0);
        assert_eq_m512h(r, e);
    }
17934
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        // Lanes whose mask bit is set get a + b (always 33 here); the rest keep src.
        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }
17963
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        // Lanes whose mask bit is set get a + b (always 33 here); the rest are zeroed.
        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }
17987
17988    #[simd_test(enable = "avx512fp16,avx512vl")]
17989    fn test_mm_add_round_sh() {
17990        let a = _mm_set_sh(1.0);
17991        let b = _mm_set_sh(2.0);
17992        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17993        let e = _mm_set_sh(3.0);
17994        assert_eq_m128h(r, e);
17995    }
17996
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_add_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        // Mask bit clear: the low element passes through from src.
        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        // Mask bit set: the low element is a + b.
        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(3.0);
        assert_eq_m128h(r, e);
    }
18013
18014    #[simd_test(enable = "avx512fp16,avx512vl")]
18015    fn test_mm_maskz_add_round_sh() {
18016        let a = _mm_set_sh(1.0);
18017        let b = _mm_set_sh(2.0);
18018        let r =
18019            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18020        let e = _mm_set_sh(0.0);
18021        assert_eq_m128h(r, e);
18022        let r =
18023            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18024        let e = _mm_set_sh(3.0);
18025        assert_eq_m128h(r, e);
18026    }
18027
18028    #[simd_test(enable = "avx512fp16,avx512vl")]
18029    const fn test_mm_add_sh() {
18030        let a = _mm_set_sh(1.0);
18031        let b = _mm_set_sh(2.0);
18032        let r = _mm_add_sh(a, b);
18033        let e = _mm_set_sh(3.0);
18034        assert_eq_m128h(r, e);
18035    }
18036
18037    #[simd_test(enable = "avx512fp16,avx512vl")]
18038    const fn test_mm_mask_add_sh() {
18039        let a = _mm_set_sh(1.0);
18040        let b = _mm_set_sh(2.0);
18041        let src = _mm_set_sh(4.0);
18042        let r = _mm_mask_add_sh(src, 0, a, b);
18043        let e = _mm_set_sh(4.0);
18044        assert_eq_m128h(r, e);
18045        let r = _mm_mask_add_sh(src, 1, a, b);
18046        let e = _mm_set_sh(3.0);
18047        assert_eq_m128h(r, e);
18048    }
18049
18050    #[simd_test(enable = "avx512fp16,avx512vl")]
18051    const fn test_mm_maskz_add_sh() {
18052        let a = _mm_set_sh(1.0);
18053        let b = _mm_set_sh(2.0);
18054        let r = _mm_maskz_add_sh(0, a, b);
18055        let e = _mm_set_sh(0.0);
18056        assert_eq_m128h(r, e);
18057        let r = _mm_maskz_add_sh(1, a, b);
18058        let e = _mm_set_sh(3.0);
18059        assert_eq_m128h(r, e);
18060    }
18061
18062    #[simd_test(enable = "avx512fp16,avx512vl")]
18063    const fn test_mm_sub_ph() {
18064        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18065        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18066        let r = _mm_sub_ph(a, b);
18067        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
18068        assert_eq_m128h(r, e);
18069    }
18070
18071    #[simd_test(enable = "avx512fp16,avx512vl")]
18072    const fn test_mm_mask_sub_ph() {
18073        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18074        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18075        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18076        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
18077        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
18078        assert_eq_m128h(r, e);
18079    }
18080
18081    #[simd_test(enable = "avx512fp16,avx512vl")]
18082    const fn test_mm_maskz_sub_ph() {
18083        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18084        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18085        let r = _mm_maskz_sub_ph(0b01010101, a, b);
18086        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
18087        assert_eq_m128h(r, e);
18088    }
18089
18090    #[simd_test(enable = "avx512fp16,avx512vl")]
18091    const fn test_mm256_sub_ph() {
18092        let a = _mm256_set_ph(
18093            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18094        );
18095        let b = _mm256_set_ph(
18096            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18097        );
18098        let r = _mm256_sub_ph(a, b);
18099        let e = _mm256_set_ph(
18100            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
18101            15.0,
18102        );
18103        assert_eq_m256h(r, e);
18104    }
18105
18106    #[simd_test(enable = "avx512fp16,avx512vl")]
18107    const fn test_mm256_mask_sub_ph() {
18108        let a = _mm256_set_ph(
18109            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18110        );
18111        let b = _mm256_set_ph(
18112            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18113        );
18114        let src = _mm256_set_ph(
18115            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
18116        );
18117        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
18118        let e = _mm256_set_ph(
18119            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
18120        );
18121        assert_eq_m256h(r, e);
18122    }
18123
18124    #[simd_test(enable = "avx512fp16,avx512vl")]
18125    const fn test_mm256_maskz_sub_ph() {
18126        let a = _mm256_set_ph(
18127            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18128        );
18129        let b = _mm256_set_ph(
18130            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18131        );
18132        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
18133        let e = _mm256_set_ph(
18134            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
18135        );
18136        assert_eq_m256h(r, e);
18137    }
18138
18139    #[simd_test(enable = "avx512fp16")]
18140    const fn test_mm512_sub_ph() {
18141        let a = _mm512_set_ph(
18142            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18143            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18144            31.0, 32.0,
18145        );
18146        let b = _mm512_set_ph(
18147            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18148            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18149            3.0, 2.0, 1.0,
18150        );
18151        let r = _mm512_sub_ph(a, b);
18152        let e = _mm512_set_ph(
18153            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
18154            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
18155            23.0, 25.0, 27.0, 29.0, 31.0,
18156        );
18157        assert_eq_m512h(r, e);
18158    }
18159
18160    #[simd_test(enable = "avx512fp16")]
18161    const fn test_mm512_mask_sub_ph() {
18162        let a = _mm512_set_ph(
18163            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18164            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18165            31.0, 32.0,
18166        );
18167        let b = _mm512_set_ph(
18168            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18169            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18170            3.0, 2.0, 1.0,
18171        );
18172        let src = _mm512_set_ph(
18173            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18174            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18175        );
18176        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
18177        let e = _mm512_set_ph(
18178            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
18179            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
18180        );
18181        assert_eq_m512h(r, e);
18182    }
18183
18184    #[simd_test(enable = "avx512fp16")]
18185    const fn test_mm512_maskz_sub_ph() {
18186        let a = _mm512_set_ph(
18187            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18188            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18189            31.0, 32.0,
18190        );
18191        let b = _mm512_set_ph(
18192            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18193            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18194            3.0, 2.0, 1.0,
18195        );
18196        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
18197        let e = _mm512_set_ph(
18198            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
18199            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
18200        );
18201        assert_eq_m512h(r, e);
18202    }
18203
18204    #[simd_test(enable = "avx512fp16")]
18205    fn test_mm512_sub_round_ph() {
18206        let a = _mm512_set_ph(
18207            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18208            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18209            31.0, 32.0,
18210        );
18211        let b = _mm512_set_ph(
18212            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18213            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18214            3.0, 2.0, 1.0,
18215        );
18216        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18217        let e = _mm512_set_ph(
18218            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
18219            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
18220            23.0, 25.0, 27.0, 29.0, 31.0,
18221        );
18222        assert_eq_m512h(r, e);
18223    }
18224
18225    #[simd_test(enable = "avx512fp16")]
18226    fn test_mm512_mask_sub_round_ph() {
18227        let a = _mm512_set_ph(
18228            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18229            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18230            31.0, 32.0,
18231        );
18232        let b = _mm512_set_ph(
18233            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18234            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18235            3.0, 2.0, 1.0,
18236        );
18237        let src = _mm512_set_ph(
18238            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18239            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18240        );
18241        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18242            src,
18243            0b01010101010101010101010101010101,
18244            a,
18245            b,
18246        );
18247        let e = _mm512_set_ph(
18248            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
18249            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
18250        );
18251        assert_eq_m512h(r, e);
18252    }
18253
18254    #[simd_test(enable = "avx512fp16")]
18255    fn test_mm512_maskz_sub_round_ph() {
18256        let a = _mm512_set_ph(
18257            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18258            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18259            31.0, 32.0,
18260        );
18261        let b = _mm512_set_ph(
18262            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18263            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18264            3.0, 2.0, 1.0,
18265        );
18266        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18267            0b01010101010101010101010101010101,
18268            a,
18269            b,
18270        );
18271        let e = _mm512_set_ph(
18272            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
18273            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
18274        );
18275        assert_eq_m512h(r, e);
18276    }
18277
18278    #[simd_test(enable = "avx512fp16,avx512vl")]
18279    fn test_mm_sub_round_sh() {
18280        let a = _mm_set_sh(1.0);
18281        let b = _mm_set_sh(2.0);
18282        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18283        let e = _mm_set_sh(-1.0);
18284        assert_eq_m128h(r, e);
18285    }
18286
18287    #[simd_test(enable = "avx512fp16,avx512vl")]
18288    fn test_mm_mask_sub_round_sh() {
18289        let a = _mm_set_sh(1.0);
18290        let b = _mm_set_sh(2.0);
18291        let src = _mm_set_sh(4.0);
18292        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18293            src, 0, a, b,
18294        );
18295        let e = _mm_set_sh(4.0);
18296        assert_eq_m128h(r, e);
18297        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18298            src, 1, a, b,
18299        );
18300        let e = _mm_set_sh(-1.0);
18301        assert_eq_m128h(r, e);
18302    }
18303
18304    #[simd_test(enable = "avx512fp16,avx512vl")]
18305    fn test_mm_maskz_sub_round_sh() {
18306        let a = _mm_set_sh(1.0);
18307        let b = _mm_set_sh(2.0);
18308        let r =
18309            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18310        let e = _mm_set_sh(0.0);
18311        assert_eq_m128h(r, e);
18312        let r =
18313            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18314        let e = _mm_set_sh(-1.0);
18315        assert_eq_m128h(r, e);
18316    }
18317
18318    #[simd_test(enable = "avx512fp16,avx512vl")]
18319    const fn test_mm_sub_sh() {
18320        let a = _mm_set_sh(1.0);
18321        let b = _mm_set_sh(2.0);
18322        let r = _mm_sub_sh(a, b);
18323        let e = _mm_set_sh(-1.0);
18324        assert_eq_m128h(r, e);
18325    }
18326
18327    #[simd_test(enable = "avx512fp16,avx512vl")]
18328    const fn test_mm_mask_sub_sh() {
18329        let a = _mm_set_sh(1.0);
18330        let b = _mm_set_sh(2.0);
18331        let src = _mm_set_sh(4.0);
18332        let r = _mm_mask_sub_sh(src, 0, a, b);
18333        let e = _mm_set_sh(4.0);
18334        assert_eq_m128h(r, e);
18335        let r = _mm_mask_sub_sh(src, 1, a, b);
18336        let e = _mm_set_sh(-1.0);
18337        assert_eq_m128h(r, e);
18338    }
18339
18340    #[simd_test(enable = "avx512fp16,avx512vl")]
18341    const fn test_mm_maskz_sub_sh() {
18342        let a = _mm_set_sh(1.0);
18343        let b = _mm_set_sh(2.0);
18344        let r = _mm_maskz_sub_sh(0, a, b);
18345        let e = _mm_set_sh(0.0);
18346        assert_eq_m128h(r, e);
18347        let r = _mm_maskz_sub_sh(1, a, b);
18348        let e = _mm_set_sh(-1.0);
18349        assert_eq_m128h(r, e);
18350    }
18351
18352    #[simd_test(enable = "avx512fp16,avx512vl")]
18353    const fn test_mm_mul_ph() {
18354        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18355        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18356        let r = _mm_mul_ph(a, b);
18357        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
18358        assert_eq_m128h(r, e);
18359    }
18360
18361    #[simd_test(enable = "avx512fp16,avx512vl")]
18362    const fn test_mm_mask_mul_ph() {
18363        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18364        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18365        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18366        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
18367        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
18368        assert_eq_m128h(r, e);
18369    }
18370
18371    #[simd_test(enable = "avx512fp16,avx512vl")]
18372    const fn test_mm_maskz_mul_ph() {
18373        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18374        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18375        let r = _mm_maskz_mul_ph(0b01010101, a, b);
18376        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
18377        assert_eq_m128h(r, e);
18378    }
18379
18380    #[simd_test(enable = "avx512fp16,avx512vl")]
18381    const fn test_mm256_mul_ph() {
18382        let a = _mm256_set_ph(
18383            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18384        );
18385        let b = _mm256_set_ph(
18386            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18387        );
18388        let r = _mm256_mul_ph(a, b);
18389        let e = _mm256_set_ph(
18390            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
18391            30.0, 16.0,
18392        );
18393        assert_eq_m256h(r, e);
18394    }
18395
18396    #[simd_test(enable = "avx512fp16,avx512vl")]
18397    const fn test_mm256_mask_mul_ph() {
18398        let a = _mm256_set_ph(
18399            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18400        );
18401        let b = _mm256_set_ph(
18402            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18403        );
18404        let src = _mm256_set_ph(
18405            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
18406        );
18407        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
18408        let e = _mm256_set_ph(
18409            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
18410        );
18411        assert_eq_m256h(r, e);
18412    }
18413
18414    #[simd_test(enable = "avx512fp16,avx512vl")]
18415    const fn test_mm256_maskz_mul_ph() {
18416        let a = _mm256_set_ph(
18417            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18418        );
18419        let b = _mm256_set_ph(
18420            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18421        );
18422        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
18423        let e = _mm256_set_ph(
18424            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
18425        );
18426        assert_eq_m256h(r, e);
18427    }
18428
18429    #[simd_test(enable = "avx512fp16")]
18430    const fn test_mm512_mul_ph() {
18431        let a = _mm512_set_ph(
18432            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18433            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18434            31.0, 32.0,
18435        );
18436        let b = _mm512_set_ph(
18437            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18438            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18439            3.0, 2.0, 1.0,
18440        );
18441        let r = _mm512_mul_ph(a, b);
18442        let e = _mm512_set_ph(
18443            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18444            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18445            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18446        );
18447        assert_eq_m512h(r, e);
18448    }
18449
18450    #[simd_test(enable = "avx512fp16")]
18451    const fn test_mm512_mask_mul_ph() {
18452        let a = _mm512_set_ph(
18453            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18454            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18455            31.0, 32.0,
18456        );
18457        let b = _mm512_set_ph(
18458            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18459            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18460            3.0, 2.0, 1.0,
18461        );
18462        let src = _mm512_set_ph(
18463            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18464            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18465        );
18466        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
18467        let e = _mm512_set_ph(
18468            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18469            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18470        );
18471        assert_eq_m512h(r, e);
18472    }
18473
18474    #[simd_test(enable = "avx512fp16")]
18475    const fn test_mm512_maskz_mul_ph() {
18476        let a = _mm512_set_ph(
18477            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18478            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18479            31.0, 32.0,
18480        );
18481        let b = _mm512_set_ph(
18482            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18483            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18484            3.0, 2.0, 1.0,
18485        );
18486        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
18487        let e = _mm512_set_ph(
18488            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18489            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18490        );
18491        assert_eq_m512h(r, e);
18492    }
18493
18494    #[simd_test(enable = "avx512fp16")]
18495    fn test_mm512_mul_round_ph() {
18496        let a = _mm512_set_ph(
18497            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18498            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18499            31.0, 32.0,
18500        );
18501        let b = _mm512_set_ph(
18502            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18503            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18504            3.0, 2.0, 1.0,
18505        );
18506        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18507        let e = _mm512_set_ph(
18508            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18509            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18510            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18511        );
18512        assert_eq_m512h(r, e);
18513    }
18514
18515    #[simd_test(enable = "avx512fp16")]
18516    fn test_mm512_mask_mul_round_ph() {
18517        let a = _mm512_set_ph(
18518            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18519            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18520            31.0, 32.0,
18521        );
18522        let b = _mm512_set_ph(
18523            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18524            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18525            3.0, 2.0, 1.0,
18526        );
18527        let src = _mm512_set_ph(
18528            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18529            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18530        );
18531        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18532            src,
18533            0b01010101010101010101010101010101,
18534            a,
18535            b,
18536        );
18537        let e = _mm512_set_ph(
18538            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18539            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18540        );
18541        assert_eq_m512h(r, e);
18542    }
18543
18544    #[simd_test(enable = "avx512fp16")]
18545    fn test_mm512_maskz_mul_round_ph() {
18546        let a = _mm512_set_ph(
18547            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18548            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18549            31.0, 32.0,
18550        );
18551        let b = _mm512_set_ph(
18552            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18553            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18554            3.0, 2.0, 1.0,
18555        );
18556        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18557            0b01010101010101010101010101010101,
18558            a,
18559            b,
18560        );
18561        let e = _mm512_set_ph(
18562            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18563            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18564        );
18565        assert_eq_m512h(r, e);
18566    }
18567
18568    #[simd_test(enable = "avx512fp16,avx512vl")]
18569    fn test_mm_mul_round_sh() {
18570        let a = _mm_set_sh(1.0);
18571        let b = _mm_set_sh(2.0);
18572        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18573        let e = _mm_set_sh(2.0);
18574        assert_eq_m128h(r, e);
18575    }
18576
18577    #[simd_test(enable = "avx512fp16,avx512vl")]
18578    fn test_mm_mask_mul_round_sh() {
18579        let a = _mm_set_sh(1.0);
18580        let b = _mm_set_sh(2.0);
18581        let src = _mm_set_sh(4.0);
18582        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18583            src, 0, a, b,
18584        );
18585        let e = _mm_set_sh(4.0);
18586        assert_eq_m128h(r, e);
18587        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18588            src, 1, a, b,
18589        );
18590        let e = _mm_set_sh(2.0);
18591        assert_eq_m128h(r, e);
18592    }
18593
18594    #[simd_test(enable = "avx512fp16,avx512vl")]
18595    fn test_mm_maskz_mul_round_sh() {
18596        let a = _mm_set_sh(1.0);
18597        let b = _mm_set_sh(2.0);
18598        let r =
18599            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18600        let e = _mm_set_sh(0.0);
18601        assert_eq_m128h(r, e);
18602        let r =
18603            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18604        let e = _mm_set_sh(2.0);
18605        assert_eq_m128h(r, e);
18606    }
18607
18608    #[simd_test(enable = "avx512fp16,avx512vl")]
18609    const fn test_mm_mul_sh() {
18610        let a = _mm_set_sh(1.0);
18611        let b = _mm_set_sh(2.0);
18612        let r = _mm_mul_sh(a, b);
18613        let e = _mm_set_sh(2.0);
18614        assert_eq_m128h(r, e);
18615    }
18616
18617    #[simd_test(enable = "avx512fp16,avx512vl")]
18618    const fn test_mm_mask_mul_sh() {
18619        let a = _mm_set_sh(1.0);
18620        let b = _mm_set_sh(2.0);
18621        let src = _mm_set_sh(4.0);
18622        let r = _mm_mask_mul_sh(src, 0, a, b);
18623        let e = _mm_set_sh(4.0);
18624        assert_eq_m128h(r, e);
18625        let r = _mm_mask_mul_sh(src, 1, a, b);
18626        let e = _mm_set_sh(2.0);
18627        assert_eq_m128h(r, e);
18628    }
18629
18630    #[simd_test(enable = "avx512fp16,avx512vl")]
18631    const fn test_mm_maskz_mul_sh() {
18632        let a = _mm_set_sh(1.0);
18633        let b = _mm_set_sh(2.0);
18634        let r = _mm_maskz_mul_sh(0, a, b);
18635        let e = _mm_set_sh(0.0);
18636        assert_eq_m128h(r, e);
18637        let r = _mm_maskz_mul_sh(1, a, b);
18638        let e = _mm_set_sh(2.0);
18639        assert_eq_m128h(r, e);
18640    }
18641
18642    #[simd_test(enable = "avx512fp16,avx512vl")]
18643    const fn test_mm_div_ph() {
18644        let a = _mm_set1_ph(1.0);
18645        let b = _mm_set1_ph(2.0);
18646        let r = _mm_div_ph(a, b);
18647        let e = _mm_set1_ph(0.5);
18648        assert_eq_m128h(r, e);
18649    }
18650
18651    #[simd_test(enable = "avx512fp16,avx512vl")]
18652    const fn test_mm_mask_div_ph() {
18653        let a = _mm_set1_ph(1.0);
18654        let b = _mm_set1_ph(2.0);
18655        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18656        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18657        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18658        assert_eq_m128h(r, e);
18659    }
18660
18661    #[simd_test(enable = "avx512fp16,avx512vl")]
18662    const fn test_mm_maskz_div_ph() {
18663        let a = _mm_set1_ph(1.0);
18664        let b = _mm_set1_ph(2.0);
18665        let r = _mm_maskz_div_ph(0b01010101, a, b);
18666        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18667        assert_eq_m128h(r, e);
18668    }
18669
18670    #[simd_test(enable = "avx512fp16,avx512vl")]
18671    const fn test_mm256_div_ph() {
18672        let a = _mm256_set1_ph(1.0);
18673        let b = _mm256_set1_ph(2.0);
18674        let r = _mm256_div_ph(a, b);
18675        let e = _mm256_set1_ph(0.5);
18676        assert_eq_m256h(r, e);
18677    }
18678
    // --- Masked FP16 packed division -----------------------------------
    // Lanes whose mask bit is set receive 1.0 / 2.0 = 0.5; the rest keep
    // the matching `src` lane (mask variant) or are zeroed (maskz variant).
    // `_mm*_set_ph` lists elements from the highest index down to e0, so
    // the LAST argument is lane 0 and pairs with mask bit 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_div_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let src = _mm256_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0,
        );
        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    // Zero-masking variant: deselected lanes become 0.0 instead of
    // inheriting from a source vector.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_div_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    // Unmasked 512-bit divide: every lane computes 1.0 / 2.0.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_div_ph(a, b);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }

    // 512-bit merge-masked divide: 32 lanes, alternating mask bits.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit zero-masked divide.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18742
    // --- FP16 packed division with explicit rounding -------------------
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` selects round-to-
    // nearest-even with exceptions suppressed; 1.0 / 2.0 is exact, so the
    // expected values are the same as in the non-rounding variants.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }

    // Merge-masked rounding divide: cleared mask bits keep `src` lanes.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    // Zero-masked rounding divide: cleared mask bits produce 0.0.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18789
    // --- Scalar FP16 divide with explicit rounding ---------------------
    // Only lane 0 is divided; `_mm_set_sh` leaves the upper lanes as 0.0
    // here so the whole-vector compare still pins them. The masked
    // variants check both mask = 0 (blend from `src` / zero) and
    // mask = 1 (quotient written).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        // mask bit clear: lane 0 is taken from src
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        // mask bit set: lane 0 is the quotient
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        // mask bit clear: lane 0 is zeroed
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        // mask bit set: lane 0 is the quotient
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18829
18830    #[simd_test(enable = "avx512fp16,avx512vl")]
18831    const fn test_mm_div_sh() {
18832        let a = _mm_set_sh(1.0);
18833        let b = _mm_set_sh(2.0);
18834        let r = _mm_div_sh(a, b);
18835        let e = _mm_set_sh(0.5);
18836        assert_eq_m128h(r, e);
18837    }
18838
18839    #[simd_test(enable = "avx512fp16,avx512vl")]
18840    const fn test_mm_mask_div_sh() {
18841        let a = _mm_set_sh(1.0);
18842        let b = _mm_set_sh(2.0);
18843        let src = _mm_set_sh(4.0);
18844        let r = _mm_mask_div_sh(src, 0, a, b);
18845        let e = _mm_set_sh(4.0);
18846        assert_eq_m128h(r, e);
18847        let r = _mm_mask_div_sh(src, 1, a, b);
18848        let e = _mm_set_sh(0.5);
18849        assert_eq_m128h(r, e);
18850    }
18851
18852    #[simd_test(enable = "avx512fp16,avx512vl")]
18853    const fn test_mm_maskz_div_sh() {
18854        let a = _mm_set_sh(1.0);
18855        let b = _mm_set_sh(2.0);
18856        let r = _mm_maskz_div_sh(0, a, b);
18857        let e = _mm_set_sh(0.0);
18858        assert_eq_m128h(r, e);
18859        let r = _mm_maskz_div_sh(1, a, b);
18860        let e = _mm_set_sh(0.5);
18861        assert_eq_m128h(r, e);
18862    }
18863
    // --- Packed complex FP16 multiply (`*_mul_pch`) --------------------
    // Each 32-bit lane pair holds one complex number (re, im). With
    // a = b = 0 + 1i every pair computes (0 + 1i) * (0 + 1i) = -1 + 0i.
    // One mask bit covers a whole complex pair (two f16 lanes).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_mul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        // cleared mask bits keep both halves of the `src` pair
        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        // cleared mask bits zero both halves of the pair
        let r = _mm_maskz_mul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_mul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
18964
    // --- 512-bit complex multiply with explicit rounding ---------------
    // Same i * i = -1 data as the non-rounding tests; the products are
    // exact, so the rounding mode does not change the expected values.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19012
    // Scalar complex multiply with explicit rounding: lane pair 0 is
    // (0 + 1i) * (0 + 1i) = -1 + 0i; lanes 2..7 pass through from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19021
19022    #[simd_test(enable = "avx512fp16,avx512vl")]
19023    fn test_mm_mask_mul_round_sch() {
19024        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19025        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19026        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19027        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19028            src, 0, a, b,
19029        );
19030        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19031        assert_eq_m128h(r, e);
19032    }
19033
19034    #[simd_test(enable = "avx512fp16,avx512vl")]
19035    fn test_mm_maskz_mul_round_sch() {
19036        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19037        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19038        let r =
19039            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19040        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19041        assert_eq_m128h(r, e);
19042    }
19043
19044    #[simd_test(enable = "avx512fp16,avx512vl")]
19045    fn test_mm_mul_sch() {
19046        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19047        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19048        let r = _mm_mul_sch(a, b);
19049        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19050        assert_eq_m128h(r, e);
19051    }
19052
19053    #[simd_test(enable = "avx512fp16,avx512vl")]
19054    fn test_mm_mask_mul_sch() {
19055        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19056        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19057        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19058        let r = _mm_mask_mul_sch(src, 0, a, b);
19059        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19060        assert_eq_m128h(r, e);
19061    }
19062
19063    #[simd_test(enable = "avx512fp16,avx512vl")]
19064    fn test_mm_maskz_mul_sch() {
19065        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19066        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19067        let r = _mm_maskz_mul_sch(0, a, b);
19068        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19069        assert_eq_m128h(r, e);
19070    }
19071
    // --- Packed complex FP16 multiply, `fmul` spelling -----------------
    // These mirror the `mul_pch` tests: same 0 + 1i inputs, same -1 + 0i
    // expected products. NOTE(review): `_mm*_fmul_pch` appears to be an
    // alternate name for the same complex multiply — confirm alias status
    // against the Intel intrinsics guide.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_fmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_fmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_fmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19172
    // --- 512-bit `fmul` complex multiply with explicit rounding --------
    // Same exact i * i = -1 data; the rounding mode cannot change the
    // expected values.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19220
    // Scalar `fmul` complex multiply with explicit rounding: lane pair 0
    // is (0 + 1i) * (0 + 1i) = -1 + 0i; lanes 2..7 pass through from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19229
19230    #[simd_test(enable = "avx512fp16,avx512vl")]
19231    fn test_mm_mask_fmul_round_sch() {
19232        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19233        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19234        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19235        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19236            src, 0, a, b,
19237        );
19238        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19239        assert_eq_m128h(r, e);
19240    }
19241
19242    #[simd_test(enable = "avx512fp16,avx512vl")]
19243    fn test_mm_maskz_fmul_round_sch() {
19244        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19245        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19246        let r =
19247            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19248        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19249        assert_eq_m128h(r, e);
19250    }
19251
19252    #[simd_test(enable = "avx512fp16,avx512vl")]
19253    fn test_mm_fmul_sch() {
19254        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19255        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19256        let r = _mm_fmul_sch(a, b);
19257        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19258        assert_eq_m128h(r, e);
19259    }
19260
19261    #[simd_test(enable = "avx512fp16,avx512vl")]
19262    fn test_mm_mask_fmul_sch() {
19263        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19264        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19265        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19266        let r = _mm_mask_fmul_sch(src, 0, a, b);
19267        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19268        assert_eq_m128h(r, e);
19269    }
19270
19271    #[simd_test(enable = "avx512fp16,avx512vl")]
19272    fn test_mm_maskz_fmul_sch() {
19273        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19274        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19275        let r = _mm_maskz_fmul_sch(0, a, b);
19276        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19277        assert_eq_m128h(r, e);
19278    }
19279
    // --- Packed complex FP16 conjugate multiply (`*_cmul_pch`) ---------
    // `cmul` multiplies `a` by the complex conjugate of `b`:
    // (0 + 1i) * conj(0 - 1i) = (0 + 1i) * (0 + 1i) = -1 + 0i, so with
    // b = 0 - 1i these expect the same -1 + 0i pairs as the `mul` tests.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_cmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        // one mask bit per complex pair; cleared bits keep the src pair
        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_maskz_cmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_cmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19380
    // --- 512-bit conjugate multiply with explicit rounding -------------
    // Same (0 + 1i) * conj(0 - 1i) = -1 + 0i data as the non-rounding
    // `cmul` tests; the products are exact.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19428
19429    #[simd_test(enable = "avx512fp16,avx512vl")]
19430    fn test_mm_cmul_sch() {
19431        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19432        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19433        let r = _mm_cmul_sch(a, b);
19434        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19435        assert_eq_m128h(r, e);
19436    }
19437
19438    #[simd_test(enable = "avx512fp16,avx512vl")]
19439    fn test_mm_mask_cmul_sch() {
19440        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19441        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19442        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19443        let r = _mm_mask_cmul_sch(src, 0, a, b);
19444        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19445        assert_eq_m128h(r, e);
19446    }
19447
19448    #[simd_test(enable = "avx512fp16,avx512vl")]
19449    fn test_mm_maskz_cmul_sch() {
19450        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19451        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19452        let r = _mm_maskz_cmul_sch(0, a, b);
19453        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19454        assert_eq_m128h(r, e);
19455    }
19456
19457    #[simd_test(enable = "avx512fp16,avx512vl")]
19458    fn test_mm_cmul_round_sch() {
19459        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19460        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19461        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19462        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19463        assert_eq_m128h(r, e);
19464    }
19465
19466    #[simd_test(enable = "avx512fp16,avx512vl")]
19467    fn test_mm_mask_cmul_round_sch() {
19468        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19469        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19470        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19471        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19472            src, 0, a, b,
19473        );
19474        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19475        assert_eq_m128h(r, e);
19476    }
19477
19478    #[simd_test(enable = "avx512fp16,avx512vl")]
19479    fn test_mm_maskz_cmul_round_sch() {
19480        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19481        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19482        let r =
19483            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19484        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19485        assert_eq_m128h(r, e);
19486    }
19487
19488    #[simd_test(enable = "avx512fp16,avx512vl")]
19489    fn test_mm_fcmul_pch() {
19490        let a = _mm_set1_pch(0.0, 1.0);
19491        let b = _mm_set1_pch(0.0, -1.0);
19492        let r = _mm_fcmul_pch(a, b);
19493        let e = _mm_set1_pch(-1.0, 0.0);
19494        assert_eq_m128h(r, e);
19495    }
19496
19497    #[simd_test(enable = "avx512fp16,avx512vl")]
19498    fn test_mm_mask_fcmul_pch() {
19499        let a = _mm_set1_pch(0.0, 1.0);
19500        let b = _mm_set1_pch(0.0, -1.0);
19501        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19502        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
19503        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
19504        assert_eq_m128h(r, e);
19505    }
19506
19507    #[simd_test(enable = "avx512fp16,avx512vl")]
19508    fn test_mm_maskz_fcmul_pch() {
19509        let a = _mm_set1_pch(0.0, 1.0);
19510        let b = _mm_set1_pch(0.0, -1.0);
19511        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
19512        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
19513        assert_eq_m128h(r, e);
19514    }
19515
19516    #[simd_test(enable = "avx512fp16,avx512vl")]
19517    fn test_mm256_fcmul_pch() {
19518        let a = _mm256_set1_pch(0.0, 1.0);
19519        let b = _mm256_set1_pch(0.0, -1.0);
19520        let r = _mm256_fcmul_pch(a, b);
19521        let e = _mm256_set1_pch(-1.0, 0.0);
19522        assert_eq_m256h(r, e);
19523    }
19524
19525    #[simd_test(enable = "avx512fp16,avx512vl")]
19526    fn test_mm256_mask_fcmul_pch() {
19527        let a = _mm256_set1_pch(0.0, 1.0);
19528        let b = _mm256_set1_pch(0.0, -1.0);
19529        let src = _mm256_setr_ph(
19530            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19531        );
19532        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
19533        let e = _mm256_setr_ph(
19534            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19535        );
19536        assert_eq_m256h(r, e);
19537    }
19538
19539    #[simd_test(enable = "avx512fp16,avx512vl")]
19540    fn test_mm256_maskz_fcmul_pch() {
19541        let a = _mm256_set1_pch(0.0, 1.0);
19542        let b = _mm256_set1_pch(0.0, -1.0);
19543        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
19544        let e = _mm256_setr_ph(
19545            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19546        );
19547        assert_eq_m256h(r, e);
19548    }
19549
19550    #[simd_test(enable = "avx512fp16")]
19551    fn test_mm512_fcmul_pch() {
19552        let a = _mm512_set1_pch(0.0, 1.0);
19553        let b = _mm512_set1_pch(0.0, -1.0);
19554        let r = _mm512_fcmul_pch(a, b);
19555        let e = _mm512_set1_pch(-1.0, 0.0);
19556        assert_eq_m512h(r, e);
19557    }
19558
19559    #[simd_test(enable = "avx512fp16")]
19560    fn test_mm512_mask_fcmul_pch() {
19561        let a = _mm512_set1_pch(0.0, 1.0);
19562        let b = _mm512_set1_pch(0.0, -1.0);
19563        let src = _mm512_setr_ph(
19564            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19565            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19566            32.0, 33.0,
19567        );
19568        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
19569        let e = _mm512_setr_ph(
19570            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19571            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19572            33.0,
19573        );
19574        assert_eq_m512h(r, e);
19575    }
19576
19577    #[simd_test(enable = "avx512fp16")]
19578    fn test_mm512_maskz_fcmul_pch() {
19579        let a = _mm512_set1_pch(0.0, 1.0);
19580        let b = _mm512_set1_pch(0.0, -1.0);
19581        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
19582        let e = _mm512_setr_ph(
19583            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19584            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19585        );
19586        assert_eq_m512h(r, e);
19587    }
19588
19589    #[simd_test(enable = "avx512fp16")]
19590    fn test_mm512_fcmul_round_pch() {
19591        let a = _mm512_set1_pch(0.0, 1.0);
19592        let b = _mm512_set1_pch(0.0, -1.0);
19593        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19594        let e = _mm512_set1_pch(-1.0, 0.0);
19595        assert_eq_m512h(r, e);
19596    }
19597
19598    #[simd_test(enable = "avx512fp16")]
19599    fn test_mm512_mask_fcmul_round_pch() {
19600        let a = _mm512_set1_pch(0.0, 1.0);
19601        let b = _mm512_set1_pch(0.0, -1.0);
19602        let src = _mm512_setr_ph(
19603            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19604            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19605            32.0, 33.0,
19606        );
19607        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19608            src,
19609            0b0101010101010101,
19610            a,
19611            b,
19612        );
19613        let e = _mm512_setr_ph(
19614            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19615            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19616            33.0,
19617        );
19618        assert_eq_m512h(r, e);
19619    }
19620
19621    #[simd_test(enable = "avx512fp16")]
19622    fn test_mm512_maskz_fcmul_round_pch() {
19623        let a = _mm512_set1_pch(0.0, 1.0);
19624        let b = _mm512_set1_pch(0.0, -1.0);
19625        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19626            0b0101010101010101,
19627            a,
19628            b,
19629        );
19630        let e = _mm512_setr_ph(
19631            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19632            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19633        );
19634        assert_eq_m512h(r, e);
19635    }
19636
19637    #[simd_test(enable = "avx512fp16,avx512vl")]
19638    fn test_mm_fcmul_sch() {
19639        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19640        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19641        let r = _mm_fcmul_sch(a, b);
19642        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19643        assert_eq_m128h(r, e);
19644    }
19645
19646    #[simd_test(enable = "avx512fp16,avx512vl")]
19647    fn test_mm_mask_fcmul_sch() {
19648        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19649        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19650        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19651        let r = _mm_mask_fcmul_sch(src, 0, a, b);
19652        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19653        assert_eq_m128h(r, e);
19654    }
19655
19656    #[simd_test(enable = "avx512fp16,avx512vl")]
19657    fn test_mm_maskz_fcmul_sch() {
19658        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19659        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19660        let r = _mm_maskz_fcmul_sch(0, a, b);
19661        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19662        assert_eq_m128h(r, e);
19663    }
19664
19665    #[simd_test(enable = "avx512fp16,avx512vl")]
19666    fn test_mm_fcmul_round_sch() {
19667        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19668        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19669        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19670        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19671        assert_eq_m128h(r, e);
19672    }
19673
19674    #[simd_test(enable = "avx512fp16,avx512vl")]
19675    fn test_mm_mask_fcmul_round_sch() {
19676        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19677        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19678        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19679        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19680            src, 0, a, b,
19681        );
19682        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19683        assert_eq_m128h(r, e);
19684    }
19685
19686    #[simd_test(enable = "avx512fp16,avx512vl")]
19687    fn test_mm_maskz_fcmul_round_sch() {
19688        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19689        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19690        let r =
19691            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19692        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19693        assert_eq_m128h(r, e);
19694    }
19695
19696    #[simd_test(enable = "avx512fp16,avx512vl")]
19697    const fn test_mm_abs_ph() {
19698        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
19699        let r = _mm_abs_ph(a);
19700        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
19701        assert_eq_m128h(r, e);
19702    }
19703
19704    #[simd_test(enable = "avx512fp16,avx512vl")]
19705    const fn test_mm256_abs_ph() {
19706        let a = _mm256_set_ph(
19707            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19708            -14.0,
19709        );
19710        let r = _mm256_abs_ph(a);
19711        let e = _mm256_set_ph(
19712            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19713        );
19714        assert_eq_m256h(r, e);
19715    }
19716
19717    #[simd_test(enable = "avx512fp16")]
19718    const fn test_mm512_abs_ph() {
19719        let a = _mm512_set_ph(
19720            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19721            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
19722            27.0, -28.0, 29.0, -30.0,
19723        );
19724        let r = _mm512_abs_ph(a);
19725        let e = _mm512_set_ph(
19726            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19727            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
19728            29.0, 30.0,
19729        );
19730        assert_eq_m512h(r, e);
19731    }
19732
19733    #[simd_test(enable = "avx512fp16,avx512vl")]
19734    const fn test_mm_conj_pch() {
19735        let a = _mm_set1_pch(0.0, 1.0);
19736        let r = _mm_conj_pch(a);
19737        let e = _mm_set1_pch(0.0, -1.0);
19738        assert_eq_m128h(r, e);
19739    }
19740
19741    #[simd_test(enable = "avx512fp16,avx512vl")]
19742    const fn test_mm_mask_conj_pch() {
19743        let a = _mm_set1_pch(0.0, 1.0);
19744        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19745        let r = _mm_mask_conj_pch(src, 0b0101, a);
19746        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
19747        assert_eq_m128h(r, e);
19748    }
19749
19750    #[simd_test(enable = "avx512fp16,avx512vl")]
19751    const fn test_mm_maskz_conj_pch() {
19752        let a = _mm_set1_pch(0.0, 1.0);
19753        let r = _mm_maskz_conj_pch(0b0101, a);
19754        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
19755        assert_eq_m128h(r, e);
19756    }
19757
19758    #[simd_test(enable = "avx512fp16,avx512vl")]
19759    const fn test_mm256_conj_pch() {
19760        let a = _mm256_set1_pch(0.0, 1.0);
19761        let r = _mm256_conj_pch(a);
19762        let e = _mm256_set1_pch(0.0, -1.0);
19763        assert_eq_m256h(r, e);
19764    }
19765
19766    #[simd_test(enable = "avx512fp16,avx512vl")]
19767    const fn test_mm256_mask_conj_pch() {
19768        let a = _mm256_set1_pch(0.0, 1.0);
19769        let src = _mm256_setr_ph(
19770            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19771        );
19772        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
19773        let e = _mm256_setr_ph(
19774            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19775        );
19776        assert_eq_m256h(r, e);
19777    }
19778
19779    #[simd_test(enable = "avx512fp16,avx512vl")]
19780    const fn test_mm256_maskz_conj_pch() {
19781        let a = _mm256_set1_pch(0.0, 1.0);
19782        let r = _mm256_maskz_conj_pch(0b01010101, a);
19783        let e = _mm256_setr_ph(
19784            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19785        );
19786        assert_eq_m256h(r, e);
19787    }
19788
19789    #[simd_test(enable = "avx512fp16")]
19790    const fn test_mm512_conj_pch() {
19791        let a = _mm512_set1_pch(0.0, 1.0);
19792        let r = _mm512_conj_pch(a);
19793        let e = _mm512_set1_pch(0.0, -1.0);
19794        assert_eq_m512h(r, e);
19795    }
19796
19797    #[simd_test(enable = "avx512fp16")]
19798    const fn test_mm512_mask_conj_pch() {
19799        let a = _mm512_set1_pch(0.0, 1.0);
19800        let src = _mm512_setr_ph(
19801            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19802            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19803            32.0, 33.0,
19804        );
19805        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
19806        let e = _mm512_setr_ph(
19807            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19808            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
19809            33.0,
19810        );
19811        assert_eq_m512h(r, e);
19812    }
19813
19814    #[simd_test(enable = "avx512fp16")]
19815    const fn test_mm512_maskz_conj_pch() {
19816        let a = _mm512_set1_pch(0.0, 1.0);
19817        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
19818        let e = _mm512_setr_ph(
19819            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19820            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19821        );
19822        assert_eq_m512h(r, e);
19823    }
19824
19825    #[simd_test(enable = "avx512fp16,avx512vl")]
19826    fn test_mm_fmadd_pch() {
19827        let a = _mm_set1_pch(0.0, 1.0);
19828        let b = _mm_set1_pch(0.0, 2.0);
19829        let c = _mm_set1_pch(0.0, 3.0);
19830        let r = _mm_fmadd_pch(a, b, c);
19831        let e = _mm_set1_pch(-2.0, 3.0);
19832        assert_eq_m128h(r, e);
19833    }
19834
19835    #[simd_test(enable = "avx512fp16,avx512vl")]
19836    fn test_mm_mask_fmadd_pch() {
19837        let a = _mm_set1_pch(0.0, 1.0);
19838        let b = _mm_set1_pch(0.0, 2.0);
19839        let c = _mm_set1_pch(0.0, 3.0);
19840        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
19841        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
19842        assert_eq_m128h(r, e);
19843    }
19844
19845    #[simd_test(enable = "avx512fp16,avx512vl")]
19846    fn test_mm_mask3_fmadd_pch() {
19847        let a = _mm_set1_pch(0.0, 1.0);
19848        let b = _mm_set1_pch(0.0, 2.0);
19849        let c = _mm_set1_pch(0.0, 3.0);
19850        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
19851        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
19852        assert_eq_m128h(r, e);
19853    }
19854
19855    #[simd_test(enable = "avx512fp16,avx512vl")]
19856    fn test_mm_maskz_fmadd_pch() {
19857        let a = _mm_set1_pch(0.0, 1.0);
19858        let b = _mm_set1_pch(0.0, 2.0);
19859        let c = _mm_set1_pch(0.0, 3.0);
19860        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
19861        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
19862        assert_eq_m128h(r, e);
19863    }
19864
19865    #[simd_test(enable = "avx512fp16,avx512vl")]
19866    fn test_mm256_fmadd_pch() {
19867        let a = _mm256_set1_pch(0.0, 1.0);
19868        let b = _mm256_set1_pch(0.0, 2.0);
19869        let c = _mm256_set1_pch(0.0, 3.0);
19870        let r = _mm256_fmadd_pch(a, b, c);
19871        let e = _mm256_set1_pch(-2.0, 3.0);
19872        assert_eq_m256h(r, e);
19873    }
19874
19875    #[simd_test(enable = "avx512fp16,avx512vl")]
19876    fn test_mm256_mask_fmadd_pch() {
19877        let a = _mm256_set1_pch(0.0, 1.0);
19878        let b = _mm256_set1_pch(0.0, 2.0);
19879        let c = _mm256_set1_pch(0.0, 3.0);
19880        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
19881        let e = _mm256_setr_ph(
19882            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19883        );
19884        assert_eq_m256h(r, e);
19885    }
19886
19887    #[simd_test(enable = "avx512fp16,avx512vl")]
19888    fn test_mm256_mask3_fmadd_pch() {
19889        let a = _mm256_set1_pch(0.0, 1.0);
19890        let b = _mm256_set1_pch(0.0, 2.0);
19891        let c = _mm256_set1_pch(0.0, 3.0);
19892        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
19893        let e = _mm256_setr_ph(
19894            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19895        );
19896        assert_eq_m256h(r, e);
19897    }
19898
19899    #[simd_test(enable = "avx512fp16,avx512vl")]
19900    fn test_mm256_maskz_fmadd_pch() {
19901        let a = _mm256_set1_pch(0.0, 1.0);
19902        let b = _mm256_set1_pch(0.0, 2.0);
19903        let c = _mm256_set1_pch(0.0, 3.0);
19904        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
19905        let e = _mm256_setr_ph(
19906            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19907        );
19908        assert_eq_m256h(r, e);
19909    }
19910
19911    #[simd_test(enable = "avx512fp16")]
19912    fn test_mm512_fmadd_pch() {
19913        let a = _mm512_set1_pch(0.0, 1.0);
19914        let b = _mm512_set1_pch(0.0, 2.0);
19915        let c = _mm512_set1_pch(0.0, 3.0);
19916        let r = _mm512_fmadd_pch(a, b, c);
19917        let e = _mm512_set1_pch(-2.0, 3.0);
19918        assert_eq_m512h(r, e);
19919    }
19920
19921    #[simd_test(enable = "avx512fp16")]
19922    fn test_mm512_mask_fmadd_pch() {
19923        let a = _mm512_set1_pch(0.0, 1.0);
19924        let b = _mm512_set1_pch(0.0, 2.0);
19925        let c = _mm512_set1_pch(0.0, 3.0);
19926        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
19927        let e = _mm512_setr_ph(
19928            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19929            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19930        );
19931        assert_eq_m512h(r, e);
19932    }
19933
19934    #[simd_test(enable = "avx512fp16")]
19935    fn test_mm512_mask3_fmadd_pch() {
19936        let a = _mm512_set1_pch(0.0, 1.0);
19937        let b = _mm512_set1_pch(0.0, 2.0);
19938        let c = _mm512_set1_pch(0.0, 3.0);
19939        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
19940        let e = _mm512_setr_ph(
19941            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19942            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19943        );
19944        assert_eq_m512h(r, e);
19945    }
19946
19947    #[simd_test(enable = "avx512fp16")]
19948    fn test_mm512_maskz_fmadd_pch() {
19949        let a = _mm512_set1_pch(0.0, 1.0);
19950        let b = _mm512_set1_pch(0.0, 2.0);
19951        let c = _mm512_set1_pch(0.0, 3.0);
19952        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
19953        let e = _mm512_setr_ph(
19954            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19955            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19956        );
19957        assert_eq_m512h(r, e);
19958    }
19959
19960    #[simd_test(enable = "avx512fp16")]
19961    fn test_mm512_fmadd_round_pch() {
19962        let a = _mm512_set1_pch(0.0, 1.0);
19963        let b = _mm512_set1_pch(0.0, 2.0);
19964        let c = _mm512_set1_pch(0.0, 3.0);
19965        let r =
19966            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19967        let e = _mm512_set1_pch(-2.0, 3.0);
19968        assert_eq_m512h(r, e);
19969    }
19970
19971    #[simd_test(enable = "avx512fp16")]
19972    fn test_mm512_mask_fmadd_round_pch() {
19973        let a = _mm512_set1_pch(0.0, 1.0);
19974        let b = _mm512_set1_pch(0.0, 2.0);
19975        let c = _mm512_set1_pch(0.0, 3.0);
19976        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19977            a,
19978            0b0101010101010101,
19979            b,
19980            c,
19981        );
19982        let e = _mm512_setr_ph(
19983            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19984            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19985        );
19986        assert_eq_m512h(r, e);
19987    }
19988
19989    #[simd_test(enable = "avx512fp16")]
19990    fn test_mm512_mask3_fmadd_round_pch() {
19991        let a = _mm512_set1_pch(0.0, 1.0);
19992        let b = _mm512_set1_pch(0.0, 2.0);
19993        let c = _mm512_set1_pch(0.0, 3.0);
19994        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19995            a,
19996            b,
19997            c,
19998            0b0101010101010101,
19999        );
20000        let e = _mm512_setr_ph(
20001            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
20002            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
20003        );
20004        assert_eq_m512h(r, e);
20005    }
20006
20007    #[simd_test(enable = "avx512fp16")]
20008    fn test_mm512_maskz_fmadd_round_pch() {
20009        let a = _mm512_set1_pch(0.0, 1.0);
20010        let b = _mm512_set1_pch(0.0, 2.0);
20011        let c = _mm512_set1_pch(0.0, 3.0);
20012        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20013            0b0101010101010101,
20014            a,
20015            b,
20016            c,
20017        );
20018        let e = _mm512_setr_ph(
20019            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
20020            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
20021        );
20022        assert_eq_m512h(r, e);
20023    }
20024
20025    #[simd_test(enable = "avx512fp16,avx512vl")]
20026    fn test_mm_fmadd_sch() {
20027        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20028        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20029        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20030        let r = _mm_fmadd_sch(a, b, c);
20031        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20032        assert_eq_m128h(r, e);
20033    }
20034
20035    #[simd_test(enable = "avx512fp16,avx512vl")]
20036    fn test_mm_mask_fmadd_sch() {
20037        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20038        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20039        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20040        let r = _mm_mask_fmadd_sch(a, 0, b, c);
20041        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20042        assert_eq_m128h(r, e);
20043        let r = _mm_mask_fmadd_sch(a, 1, b, c);
20044        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20045        assert_eq_m128h(r, e);
20046    }
20047
20048    #[simd_test(enable = "avx512fp16,avx512vl")]
20049    fn test_mm_mask3_fmadd_sch() {
20050        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20051        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20052        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20053        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
20054        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20055        assert_eq_m128h(r, e);
20056        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
20057        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20058        assert_eq_m128h(r, e);
20059    }
20060
20061    #[simd_test(enable = "avx512fp16,avx512vl")]
20062    fn test_mm_maskz_fmadd_sch() {
20063        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20064        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20065        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20066        let r = _mm_maskz_fmadd_sch(0, a, b, c);
20067        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20068        assert_eq_m128h(r, e);
20069        let r = _mm_maskz_fmadd_sch(1, a, b, c);
20070        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20071        assert_eq_m128h(r, e);
20072    }
20073
20074    #[simd_test(enable = "avx512fp16,avx512vl")]
20075    fn test_mm_fmadd_round_sch() {
20076        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20077        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20078        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20079        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20080        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20081        assert_eq_m128h(r, e);
20082    }
20083
20084    #[simd_test(enable = "avx512fp16,avx512vl")]
20085    fn test_mm_mask_fmadd_round_sch() {
20086        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20087        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20088        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20089        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20090            a, 0, b, c,
20091        );
20092        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20093        assert_eq_m128h(r, e);
20094        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20095            a, 1, b, c,
20096        );
20097        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20098        assert_eq_m128h(r, e);
20099    }
20100
20101    #[simd_test(enable = "avx512fp16,avx512vl")]
20102    fn test_mm_mask3_fmadd_round_sch() {
20103        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20104        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20105        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20106        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20107            a, b, c, 0,
20108        );
20109        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20110        assert_eq_m128h(r, e);
20111        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20112            a, b, c, 1,
20113        );
20114        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20115        assert_eq_m128h(r, e);
20116    }
20117
20118    #[simd_test(enable = "avx512fp16,avx512vl")]
20119    fn test_mm_maskz_fmadd_round_sch() {
20120        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20121        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20122        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20123        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20124            0, a, b, c,
20125        );
20126        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20127        assert_eq_m128h(r, e);
20128        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20129            1, a, b, c,
20130        );
20131        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20132        assert_eq_m128h(r, e);
20133    }
20134
20135    #[simd_test(enable = "avx512fp16,avx512vl")]
20136    fn test_mm_fcmadd_pch() {
20137        let a = _mm_set1_pch(0.0, 1.0);
20138        let b = _mm_set1_pch(0.0, 2.0);
20139        let c = _mm_set1_pch(0.0, 3.0);
20140        let r = _mm_fcmadd_pch(a, b, c);
20141        let e = _mm_set1_pch(2.0, 3.0);
20142        assert_eq_m128h(r, e);
20143    }
20144
20145    #[simd_test(enable = "avx512fp16,avx512vl")]
20146    fn test_mm_mask_fcmadd_pch() {
20147        let a = _mm_set1_pch(0.0, 1.0);
20148        let b = _mm_set1_pch(0.0, 2.0);
20149        let c = _mm_set1_pch(0.0, 3.0);
20150        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
20151        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
20152        assert_eq_m128h(r, e);
20153    }
20154
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fcmadd_pch() {
        // Same computation as the mask variant, but unselected complex lanes
        // keep `c` = 0+3i instead of `a`.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
        assert_eq_m128h(r, e);
    }
20164
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fcmadd_pch() {
        // Zero-masked variant: unselected complex lanes are zeroed.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
20174
20175    #[simd_test(enable = "avx512fp16,avx512vl")]
20176    fn test_mm256_fcmadd_pch() {
20177        let a = _mm256_set1_pch(0.0, 1.0);
20178        let b = _mm256_set1_pch(0.0, 2.0);
20179        let c = _mm256_set1_pch(0.0, 3.0);
20180        let r = _mm256_fcmadd_pch(a, b, c);
20181        let e = _mm256_set1_pch(2.0, 3.0);
20182        assert_eq_m256h(r, e);
20183    }
20184
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_fcmadd_pch() {
        // Even complex lanes (mask 0b01010101) get a*conj(b)+c = 2+3i;
        // odd lanes keep `a` = 0+1i.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
20196
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask3_fcmadd_pch() {
        // mask3 variant: unselected complex lanes keep `c` = 0+3i.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m256h(r, e);
    }
20208
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_fcmadd_pch() {
        // Zero-masked variant: unselected complex lanes are zeroed.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
20220
20221    #[simd_test(enable = "avx512fp16")]
20222    fn test_mm512_fcmadd_pch() {
20223        let a = _mm512_set1_pch(0.0, 1.0);
20224        let b = _mm512_set1_pch(0.0, 2.0);
20225        let c = _mm512_set1_pch(0.0, 3.0);
20226        let r = _mm512_fcmadd_pch(a, b, c);
20227        let e = _mm512_set1_pch(2.0, 3.0);
20228        assert_eq_m512h(r, e);
20229    }
20230
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fcmadd_pch() {
        // Even complex lanes get a*conj(b)+c = 2+3i; odd lanes keep `a` = 0+1i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
20243
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fcmadd_pch() {
        // mask3 variant: unselected complex lanes keep `c` = 0+3i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }
20256
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fcmadd_pch() {
        // Zero-masked variant: unselected complex lanes are zeroed.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
20269
20270    #[simd_test(enable = "avx512fp16")]
20271    fn test_mm512_fcmadd_round_pch() {
20272        let a = _mm512_set1_pch(0.0, 1.0);
20273        let b = _mm512_set1_pch(0.0, 2.0);
20274        let c = _mm512_set1_pch(0.0, 3.0);
20275        let r =
20276            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20277        let e = _mm512_set1_pch(2.0, 3.0);
20278        assert_eq_m512h(r, e);
20279    }
20280
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fcmadd_round_pch() {
        // Rounded variant of mask_fcmadd: even complex lanes get 2+3i,
        // odd lanes keep `a` = 0+1i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b0101010101010101,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
20298
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fcmadd_round_pch() {
        // Rounded mask3 variant: unselected complex lanes keep `c` = 0+3i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b0101010101010101,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }
20316
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fcmadd_round_pch() {
        // Rounded zero-masked variant: unselected complex lanes are zeroed.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
20334
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fcmadd_sch() {
        // Scalar conjugate complex FMA: lanes 0-1 hold a*conj(b)+c
        // = (0+1i)*(0-2i) + (0+3i) = 2+3i; upper lanes are copied from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_sch(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20344
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fcmadd_sch() {
        // Scalar conjugate complex FMA with merge masking; upper lanes always
        // come from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        // Mask bit 0 clear: lanes 0-1 keep `a`.
        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lanes 0-1 hold a*conj(b)+c = 2+3i.
        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20357
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fcmadd_sch() {
        // mask3 scalar variant: upper lanes come from `c`, and when the mask
        // bit is clear, lanes 0-1 also keep `c`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        // Mask bit set: lanes 0-1 hold a*conj(b)+c = 2+3i.
        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }
20370
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fcmadd_sch() {
        // Zero-masked scalar variant: lanes 0-1 are zeroed when the mask bit
        // is clear; upper lanes always come from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        // Mask bit set: lanes 0-1 hold a*conj(b)+c = 2+3i.
        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20383
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fcmadd_round_sch() {
        // Rounded scalar conjugate complex FMA: lanes 0-1 = a*conj(b)+c = 2+3i;
        // upper lanes copied from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20393
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fcmadd_round_sch() {
        // Rounded, merge-masked scalar variant; upper lanes always from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        // Mask bit 0 clear: lanes 0-1 keep `a`.
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lanes 0-1 hold a*conj(b)+c = 2+3i.
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20410
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fcmadd_round_sch() {
        // Rounded mask3 scalar variant; upper lanes always come from `c`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        // Mask bit 0 clear: lanes 0-1 also keep `c`.
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lanes 0-1 hold a*conj(b)+c = 2+3i.
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }
20427
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fcmadd_round_sch() {
        // Rounded zero-masked scalar variant; upper lanes always from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        // Mask bit 0 clear: lanes 0-1 are zeroed.
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lanes 0-1 hold a*conj(b)+c = 2+3i.
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20444
20445    #[simd_test(enable = "avx512fp16,avx512vl")]
20446    const fn test_mm_fmadd_ph() {
20447        let a = _mm_set1_ph(1.0);
20448        let b = _mm_set1_ph(2.0);
20449        let c = _mm_set1_ph(3.0);
20450        let r = _mm_fmadd_ph(a, b, c);
20451        let e = _mm_set1_ph(5.0);
20452        assert_eq_m128h(r, e);
20453    }
20454
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmadd_ph() {
        // Merge-masked FMA: masked-in lanes get 1*2+3 = 5, others keep `a` = 1.
        // `_mm_set_ph` takes arguments high-to-low, so the last value is lane 0.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
        assert_eq_m128h(r, e);
    }
20464
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmadd_ph() {
        // mask3 FMA: masked-out lanes keep `c` = 3 instead of `a`.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
        assert_eq_m128h(r, e);
    }
20474
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmadd_ph() {
        // Zero-masked FMA: masked-out lanes are zeroed.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
        assert_eq_m128h(r, e);
    }
20484
20485    #[simd_test(enable = "avx512fp16,avx512vl")]
20486    const fn test_mm256_fmadd_ph() {
20487        let a = _mm256_set1_ph(1.0);
20488        let b = _mm256_set1_ph(2.0);
20489        let c = _mm256_set1_ph(3.0);
20490        let r = _mm256_fmadd_ph(a, b, c);
20491        let e = _mm256_set1_ph(5.0);
20492        assert_eq_m256h(r, e);
20493    }
20494
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmadd_ph() {
        // Merge-masked FMA: masked-in lanes get 5, others keep `a` = 1.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
20506
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmadd_ph() {
        // mask3 FMA: masked-out lanes keep `c` = 3.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
20518
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmadd_ph() {
        // Zero-masked FMA: masked-out lanes are zeroed.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
20530
20531    #[simd_test(enable = "avx512fp16")]
20532    const fn test_mm512_fmadd_ph() {
20533        let a = _mm512_set1_ph(1.0);
20534        let b = _mm512_set1_ph(2.0);
20535        let c = _mm512_set1_ph(3.0);
20536        let r = _mm512_fmadd_ph(a, b, c);
20537        let e = _mm512_set1_ph(5.0);
20538        assert_eq_m512h(r, e);
20539    }
20540
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmadd_ph() {
        // Merge-masked FMA: masked-in lanes get 5, others keep `a` = 1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20553
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmadd_ph() {
        // mask3 FMA: masked-out lanes keep `c` = 3.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20566
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fmadd_ph() {
        // Zero-masked FMA: masked-out lanes are zeroed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20579
20580    #[simd_test(enable = "avx512fp16")]
20581    fn test_mm512_fmadd_round_ph() {
20582        let a = _mm512_set1_ph(1.0);
20583        let b = _mm512_set1_ph(2.0);
20584        let c = _mm512_set1_ph(3.0);
20585        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20586        let e = _mm512_set1_ph(5.0);
20587        assert_eq_m512h(r, e);
20588    }
20589
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmadd_round_ph() {
        // Rounded merge-masked FMA: masked-in lanes get 5, others keep `a` = 1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20607
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmadd_round_ph() {
        // Rounded mask3 FMA: masked-out lanes keep `c` = 3.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20625
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmadd_round_ph() {
        // Rounded zero-masked FMA: masked-out lanes are zeroed.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20643
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_fmadd_sh() {
        // Scalar FMA: lane 0 = 1*2 + 3 = 5; lanes 1..8 are copied from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_sh(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20653
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmadd_sh() {
        // Merge-masked scalar FMA; upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 keeps `a` = 1.
        let r = _mm_mask_fmadd_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 = 1*2 + 3 = 5.
        let r = _mm_mask_fmadd_sh(a, 1, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20666
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmadd_sh() {
        // mask3 scalar FMA; upper lanes come from `c`, and a clear mask bit
        // leaves lane 0 as `c` too.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 = 1*2 + 3 = 5.
        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
20679
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmadd_sh() {
        // Zero-masked scalar FMA; upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 is zeroed.
        let r = _mm_maskz_fmadd_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 = 1*2 + 3 = 5.
        let r = _mm_maskz_fmadd_sh(1, a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20692
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fmadd_round_sh() {
        // Rounded scalar FMA: lane 0 = 1*2 + 3 = 5; upper lanes from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20702
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fmadd_round_sh() {
        // Rounded merge-masked scalar FMA; upper lanes always from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 keeps `a` = 1.
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 = 1*2 + 3 = 5.
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20719
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fmadd_round_sh() {
        // Rounded mask3 scalar FMA; upper lanes always come from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 also keeps `c` = 3.
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 = 1*2 + 3 = 5.
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
20736
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fmadd_round_sh() {
        // Rounded zero-masked scalar FMA; upper lanes always from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 is zeroed.
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 = 1*2 + 3 = 5.
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20753
20754    #[simd_test(enable = "avx512fp16,avx512vl")]
20755    const fn test_mm_fmsub_ph() {
20756        let a = _mm_set1_ph(1.0);
20757        let b = _mm_set1_ph(2.0);
20758        let c = _mm_set1_ph(3.0);
20759        let r = _mm_fmsub_ph(a, b, c);
20760        let e = _mm_set1_ph(-1.0);
20761        assert_eq_m128h(r, e);
20762    }
20763
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmsub_ph() {
        // Merge-masked FMS: masked-in lanes get 1*2-3 = -1, others keep `a` = 1.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
        assert_eq_m128h(r, e);
    }
20773
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmsub_ph() {
        // mask3 FMS: masked-out lanes keep `c` = 3.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
        assert_eq_m128h(r, e);
    }
20783
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmsub_ph() {
        // Zero-masked FMS: masked-out lanes are zeroed.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
        assert_eq_m128h(r, e);
    }
20793
20794    #[simd_test(enable = "avx512fp16,avx512vl")]
20795    const fn test_mm256_fmsub_ph() {
20796        let a = _mm256_set1_ph(1.0);
20797        let b = _mm256_set1_ph(2.0);
20798        let c = _mm256_set1_ph(3.0);
20799        let r = _mm256_fmsub_ph(a, b, c);
20800        let e = _mm256_set1_ph(-1.0);
20801        assert_eq_m256h(r, e);
20802    }
20803
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmsub_ph() {
        // Merge-masked FMS: masked-in lanes get -1, others keep `a` = 1.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
20815
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmsub_ph() {
        // mask3 FMS: masked-out lanes keep `c` = 3.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
20827
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmsub_ph() {
        // Zero-masked FMS: masked-out lanes are zeroed.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
20839
20840    #[simd_test(enable = "avx512fp16")]
20841    const fn test_mm512_fmsub_ph() {
20842        let a = _mm512_set1_ph(1.0);
20843        let b = _mm512_set1_ph(2.0);
20844        let c = _mm512_set1_ph(3.0);
20845        let r = _mm512_fmsub_ph(a, b, c);
20846        let e = _mm512_set1_ph(-1.0);
20847        assert_eq_m512h(r, e);
20848    }
20849
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmsub_ph() {
        // Merge-masked FMS: masked-in lanes get -1, others keep `a` = 1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20862
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmsub_ph() {
        // mask3 FMS: masked-out lanes keep `c` = 3.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
20875
20876    #[simd_test(enable = "avx512fp16")]
20877    const fn test_mm512_maskz_fmsub_ph() {
20878        let a = _mm512_set1_ph(1.0);
20879        let b = _mm512_set1_ph(2.0);
20880        let c = _mm512_set1_ph(3.0);
20881        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
20882        let e = _mm512_set_ph(
20883            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20884            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20885        );
20886        assert_eq_m512h(r, e);
20887    }
20888
20889    #[simd_test(enable = "avx512fp16")]
20890    fn test_mm512_fmsub_round_ph() {
20891        let a = _mm512_set1_ph(1.0);
20892        let b = _mm512_set1_ph(2.0);
20893        let c = _mm512_set1_ph(3.0);
20894        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20895        let e = _mm512_set1_ph(-1.0);
20896        assert_eq_m512h(r, e);
20897    }
20898
20899    #[simd_test(enable = "avx512fp16")]
20900    fn test_mm512_mask_fmsub_round_ph() {
20901        let a = _mm512_set1_ph(1.0);
20902        let b = _mm512_set1_ph(2.0);
20903        let c = _mm512_set1_ph(3.0);
20904        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20905            a,
20906            0b01010101010101010101010101010101,
20907            b,
20908            c,
20909        );
20910        let e = _mm512_set_ph(
20911            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20912            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20913        );
20914        assert_eq_m512h(r, e);
20915    }
20916
20917    #[simd_test(enable = "avx512fp16")]
20918    fn test_mm512_mask3_fmsub_round_ph() {
20919        let a = _mm512_set1_ph(1.0);
20920        let b = _mm512_set1_ph(2.0);
20921        let c = _mm512_set1_ph(3.0);
20922        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20923            a,
20924            b,
20925            c,
20926            0b01010101010101010101010101010101,
20927        );
20928        let e = _mm512_set_ph(
20929            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20930            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20931        );
20932        assert_eq_m512h(r, e);
20933    }
20934
20935    #[simd_test(enable = "avx512fp16")]
20936    fn test_mm512_maskz_fmsub_round_ph() {
20937        let a = _mm512_set1_ph(1.0);
20938        let b = _mm512_set1_ph(2.0);
20939        let c = _mm512_set1_ph(3.0);
20940        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20941            0b01010101010101010101010101010101,
20942            a,
20943            b,
20944            c,
20945        );
20946        let e = _mm512_set_ph(
20947            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20948            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20949        );
20950        assert_eq_m512h(r, e);
20951    }
20952
20953    #[simd_test(enable = "avx512fp16,avx512vl")]
20954    const fn test_mm_fmsub_sh() {
20955        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20956        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20957        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20958        let r = _mm_fmsub_sh(a, b, c);
20959        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20960        assert_eq_m128h(r, e);
20961    }
20962
20963    #[simd_test(enable = "avx512fp16,avx512vl")]
20964    const fn test_mm_mask_fmsub_sh() {
20965        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20966        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20967        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20968        let r = _mm_mask_fmsub_sh(a, 0, b, c);
20969        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20970        assert_eq_m128h(r, e);
20971        let r = _mm_mask_fmsub_sh(a, 1, b, c);
20972        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20973        assert_eq_m128h(r, e);
20974    }
20975
20976    #[simd_test(enable = "avx512fp16,avx512vl")]
20977    const fn test_mm_mask3_fmsub_sh() {
20978        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20979        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20980        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20981        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
20982        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20983        assert_eq_m128h(r, e);
20984        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
20985        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20986        assert_eq_m128h(r, e);
20987    }
20988
20989    #[simd_test(enable = "avx512fp16,avx512vl")]
20990    const fn test_mm_maskz_fmsub_sh() {
20991        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20992        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20993        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20994        let r = _mm_maskz_fmsub_sh(0, a, b, c);
20995        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20996        assert_eq_m128h(r, e);
20997        let r = _mm_maskz_fmsub_sh(1, a, b, c);
20998        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20999        assert_eq_m128h(r, e);
21000    }
21001
21002    #[simd_test(enable = "avx512fp16,avx512vl")]
21003    fn test_mm_fmsub_round_sh() {
21004        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21005        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21006        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21007        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21008        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
21009        assert_eq_m128h(r, e);
21010    }
21011
21012    #[simd_test(enable = "avx512fp16,avx512vl")]
21013    fn test_mm_mask_fmsub_round_sh() {
21014        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21015        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21016        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21017        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21018            a, 0, b, c,
21019        );
21020        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21021        assert_eq_m128h(r, e);
21022        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21023            a, 1, b, c,
21024        );
21025        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
21026        assert_eq_m128h(r, e);
21027    }
21028
21029    #[simd_test(enable = "avx512fp16,avx512vl")]
21030    fn test_mm_mask3_fmsub_round_sh() {
21031        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21032        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21033        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21034        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21035            a, b, c, 0,
21036        );
21037        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21038        assert_eq_m128h(r, e);
21039        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21040            a, b, c, 1,
21041        );
21042        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
21043        assert_eq_m128h(r, e);
21044    }
21045
21046    #[simd_test(enable = "avx512fp16,avx512vl")]
21047    fn test_mm_maskz_fmsub_round_sh() {
21048        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21049        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21050        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21051        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21052            0, a, b, c,
21053        );
21054        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21055        assert_eq_m128h(r, e);
21056        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21057            1, a, b, c,
21058        );
21059        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
21060        assert_eq_m128h(r, e);
21061    }
21062
21063    #[simd_test(enable = "avx512fp16,avx512vl")]
21064    const fn test_mm_fnmadd_ph() {
21065        let a = _mm_set1_ph(1.0);
21066        let b = _mm_set1_ph(2.0);
21067        let c = _mm_set1_ph(3.0);
21068        let r = _mm_fnmadd_ph(a, b, c);
21069        let e = _mm_set1_ph(1.0);
21070        assert_eq_m128h(r, e);
21071    }
21072
21073    #[simd_test(enable = "avx512fp16,avx512vl")]
21074    const fn test_mm_mask_fnmadd_ph() {
21075        let a = _mm_set1_ph(1.0);
21076        let b = _mm_set1_ph(2.0);
21077        let c = _mm_set1_ph(3.0);
21078        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
21079        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
21080        assert_eq_m128h(r, e);
21081    }
21082
21083    #[simd_test(enable = "avx512fp16,avx512vl")]
21084    const fn test_mm_mask3_fnmadd_ph() {
21085        let a = _mm_set1_ph(1.0);
21086        let b = _mm_set1_ph(2.0);
21087        let c = _mm_set1_ph(3.0);
21088        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
21089        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
21090        assert_eq_m128h(r, e);
21091    }
21092
21093    #[simd_test(enable = "avx512fp16,avx512vl")]
21094    const fn test_mm_maskz_fnmadd_ph() {
21095        let a = _mm_set1_ph(1.0);
21096        let b = _mm_set1_ph(2.0);
21097        let c = _mm_set1_ph(3.0);
21098        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
21099        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
21100        assert_eq_m128h(r, e);
21101    }
21102
21103    #[simd_test(enable = "avx512fp16,avx512vl")]
21104    const fn test_mm256_fnmadd_ph() {
21105        let a = _mm256_set1_ph(1.0);
21106        let b = _mm256_set1_ph(2.0);
21107        let c = _mm256_set1_ph(3.0);
21108        let r = _mm256_fnmadd_ph(a, b, c);
21109        let e = _mm256_set1_ph(1.0);
21110        assert_eq_m256h(r, e);
21111    }
21112
21113    #[simd_test(enable = "avx512fp16,avx512vl")]
21114    const fn test_mm256_mask_fnmadd_ph() {
21115        let a = _mm256_set1_ph(1.0);
21116        let b = _mm256_set1_ph(2.0);
21117        let c = _mm256_set1_ph(3.0);
21118        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
21119        let e = _mm256_set_ph(
21120            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21121        );
21122        assert_eq_m256h(r, e);
21123    }
21124
21125    #[simd_test(enable = "avx512fp16,avx512vl")]
21126    const fn test_mm256_mask3_fnmadd_ph() {
21127        let a = _mm256_set1_ph(1.0);
21128        let b = _mm256_set1_ph(2.0);
21129        let c = _mm256_set1_ph(3.0);
21130        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
21131        let e = _mm256_set_ph(
21132            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
21133        );
21134        assert_eq_m256h(r, e);
21135    }
21136
21137    #[simd_test(enable = "avx512fp16,avx512vl")]
21138    const fn test_mm256_maskz_fnmadd_ph() {
21139        let a = _mm256_set1_ph(1.0);
21140        let b = _mm256_set1_ph(2.0);
21141        let c = _mm256_set1_ph(3.0);
21142        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
21143        let e = _mm256_set_ph(
21144            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
21145        );
21146        assert_eq_m256h(r, e);
21147    }
21148
21149    #[simd_test(enable = "avx512fp16")]
21150    const fn test_mm512_fnmadd_ph() {
21151        let a = _mm512_set1_ph(1.0);
21152        let b = _mm512_set1_ph(2.0);
21153        let c = _mm512_set1_ph(3.0);
21154        let r = _mm512_fnmadd_ph(a, b, c);
21155        let e = _mm512_set1_ph(1.0);
21156        assert_eq_m512h(r, e);
21157    }
21158
21159    #[simd_test(enable = "avx512fp16")]
21160    const fn test_mm512_mask_fnmadd_ph() {
21161        let a = _mm512_set1_ph(1.0);
21162        let b = _mm512_set1_ph(2.0);
21163        let c = _mm512_set1_ph(3.0);
21164        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
21165        let e = _mm512_set_ph(
21166            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21167            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21168        );
21169        assert_eq_m512h(r, e);
21170    }
21171
21172    #[simd_test(enable = "avx512fp16")]
21173    const fn test_mm512_mask3_fnmadd_ph() {
21174        let a = _mm512_set1_ph(1.0);
21175        let b = _mm512_set1_ph(2.0);
21176        let c = _mm512_set1_ph(3.0);
21177        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
21178        let e = _mm512_set_ph(
21179            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
21180            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
21181        );
21182        assert_eq_m512h(r, e);
21183    }
21184
21185    #[simd_test(enable = "avx512fp16")]
21186    const fn test_mm512_maskz_fnmadd_ph() {
21187        let a = _mm512_set1_ph(1.0);
21188        let b = _mm512_set1_ph(2.0);
21189        let c = _mm512_set1_ph(3.0);
21190        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
21191        let e = _mm512_set_ph(
21192            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
21193            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
21194        );
21195        assert_eq_m512h(r, e);
21196    }
21197
21198    #[simd_test(enable = "avx512fp16")]
21199    fn test_mm512_fnmadd_round_ph() {
21200        let a = _mm512_set1_ph(1.0);
21201        let b = _mm512_set1_ph(2.0);
21202        let c = _mm512_set1_ph(3.0);
21203        let r =
21204            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21205        let e = _mm512_set1_ph(1.0);
21206        assert_eq_m512h(r, e);
21207    }
21208
21209    #[simd_test(enable = "avx512fp16")]
21210    fn test_mm512_mask_fnmadd_round_ph() {
21211        let a = _mm512_set1_ph(1.0);
21212        let b = _mm512_set1_ph(2.0);
21213        let c = _mm512_set1_ph(3.0);
21214        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21215            a,
21216            0b01010101010101010101010101010101,
21217            b,
21218            c,
21219        );
21220        let e = _mm512_set_ph(
21221            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21222            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21223        );
21224        assert_eq_m512h(r, e);
21225    }
21226
21227    #[simd_test(enable = "avx512fp16")]
21228    fn test_mm512_mask3_fnmadd_round_ph() {
21229        let a = _mm512_set1_ph(1.0);
21230        let b = _mm512_set1_ph(2.0);
21231        let c = _mm512_set1_ph(3.0);
21232        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21233            a,
21234            b,
21235            c,
21236            0b01010101010101010101010101010101,
21237        );
21238        let e = _mm512_set_ph(
21239            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
21240            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
21241        );
21242        assert_eq_m512h(r, e);
21243    }
21244
21245    #[simd_test(enable = "avx512fp16")]
21246    fn test_mm512_maskz_fnmadd_round_ph() {
21247        let a = _mm512_set1_ph(1.0);
21248        let b = _mm512_set1_ph(2.0);
21249        let c = _mm512_set1_ph(3.0);
21250        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21251            0b01010101010101010101010101010101,
21252            a,
21253            b,
21254            c,
21255        );
21256        let e = _mm512_set_ph(
21257            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
21258            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
21259        );
21260        assert_eq_m512h(r, e);
21261    }
21262
21263    #[simd_test(enable = "avx512fp16,avx512vl")]
21264    const fn test_mm_fnmadd_sh() {
21265        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21266        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21267        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21268        let r = _mm_fnmadd_sh(a, b, c);
21269        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21270        assert_eq_m128h(r, e);
21271    }
21272
21273    #[simd_test(enable = "avx512fp16,avx512vl")]
21274    const fn test_mm_mask_fnmadd_sh() {
21275        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21276        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21277        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21278        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
21279        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21280        assert_eq_m128h(r, e);
21281        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
21282        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21283        assert_eq_m128h(r, e);
21284    }
21285
21286    #[simd_test(enable = "avx512fp16,avx512vl")]
21287    const fn test_mm_mask3_fnmadd_sh() {
21288        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21289        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21290        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21291        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
21292        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21293        assert_eq_m128h(r, e);
21294        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
21295        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21296        assert_eq_m128h(r, e);
21297    }
21298
21299    #[simd_test(enable = "avx512fp16,avx512vl")]
21300    const fn test_mm_maskz_fnmadd_sh() {
21301        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21302        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21303        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21304        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
21305        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21306        assert_eq_m128h(r, e);
21307        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
21308        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21309        assert_eq_m128h(r, e);
21310    }
21311
21312    #[simd_test(enable = "avx512fp16,avx512vl")]
21313    fn test_mm_fnmadd_round_sh() {
21314        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21315        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21316        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21317        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21318        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21319        assert_eq_m128h(r, e);
21320    }
21321
21322    #[simd_test(enable = "avx512fp16,avx512vl")]
21323    fn test_mm_mask_fnmadd_round_sh() {
21324        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21325        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21326        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21327        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21328            a, 0, b, c,
21329        );
21330        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21331        assert_eq_m128h(r, e);
21332        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21333            a, 1, b, c,
21334        );
21335        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21336        assert_eq_m128h(r, e);
21337    }
21338
21339    #[simd_test(enable = "avx512fp16,avx512vl")]
21340    fn test_mm_mask3_fnmadd_round_sh() {
21341        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21342        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21343        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21344        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21345            a, b, c, 0,
21346        );
21347        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21348        assert_eq_m128h(r, e);
21349        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21350            a, b, c, 1,
21351        );
21352        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21353        assert_eq_m128h(r, e);
21354    }
21355
21356    #[simd_test(enable = "avx512fp16,avx512vl")]
21357    fn test_mm_maskz_fnmadd_round_sh() {
21358        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21359        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21360        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21361        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21362            0, a, b, c,
21363        );
21364        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21365        assert_eq_m128h(r, e);
21366        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21367            1, a, b, c,
21368        );
21369        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21370        assert_eq_m128h(r, e);
21371    }
21372
21373    #[simd_test(enable = "avx512fp16,avx512vl")]
21374    const fn test_mm_fnmsub_ph() {
21375        let a = _mm_set1_ph(1.0);
21376        let b = _mm_set1_ph(2.0);
21377        let c = _mm_set1_ph(3.0);
21378        let r = _mm_fnmsub_ph(a, b, c);
21379        let e = _mm_set1_ph(-5.0);
21380        assert_eq_m128h(r, e);
21381    }
21382
21383    #[simd_test(enable = "avx512fp16,avx512vl")]
21384    const fn test_mm_mask_fnmsub_ph() {
21385        let a = _mm_set1_ph(1.0);
21386        let b = _mm_set1_ph(2.0);
21387        let c = _mm_set1_ph(3.0);
21388        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
21389        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
21390        assert_eq_m128h(r, e);
21391    }
21392
21393    #[simd_test(enable = "avx512fp16,avx512vl")]
21394    const fn test_mm_mask3_fnmsub_ph() {
21395        let a = _mm_set1_ph(1.0);
21396        let b = _mm_set1_ph(2.0);
21397        let c = _mm_set1_ph(3.0);
21398        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
21399        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
21400        assert_eq_m128h(r, e);
21401    }
21402
21403    #[simd_test(enable = "avx512fp16,avx512vl")]
21404    const fn test_mm_maskz_fnmsub_ph() {
21405        let a = _mm_set1_ph(1.0);
21406        let b = _mm_set1_ph(2.0);
21407        let c = _mm_set1_ph(3.0);
21408        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
21409        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
21410        assert_eq_m128h(r, e);
21411    }
21412
21413    #[simd_test(enable = "avx512fp16,avx512vl")]
21414    const fn test_mm256_fnmsub_ph() {
21415        let a = _mm256_set1_ph(1.0);
21416        let b = _mm256_set1_ph(2.0);
21417        let c = _mm256_set1_ph(3.0);
21418        let r = _mm256_fnmsub_ph(a, b, c);
21419        let e = _mm256_set1_ph(-5.0);
21420        assert_eq_m256h(r, e);
21421    }
21422
21423    #[simd_test(enable = "avx512fp16,avx512vl")]
21424    const fn test_mm256_mask_fnmsub_ph() {
21425        let a = _mm256_set1_ph(1.0);
21426        let b = _mm256_set1_ph(2.0);
21427        let c = _mm256_set1_ph(3.0);
21428        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
21429        let e = _mm256_set_ph(
21430            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21431        );
21432        assert_eq_m256h(r, e);
21433    }
21434
21435    #[simd_test(enable = "avx512fp16,avx512vl")]
21436    const fn test_mm256_mask3_fnmsub_ph() {
21437        let a = _mm256_set1_ph(1.0);
21438        let b = _mm256_set1_ph(2.0);
21439        let c = _mm256_set1_ph(3.0);
21440        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
21441        let e = _mm256_set_ph(
21442            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21443        );
21444        assert_eq_m256h(r, e);
21445    }
21446
21447    #[simd_test(enable = "avx512fp16,avx512vl")]
21448    const fn test_mm256_maskz_fnmsub_ph() {
21449        let a = _mm256_set1_ph(1.0);
21450        let b = _mm256_set1_ph(2.0);
21451        let c = _mm256_set1_ph(3.0);
21452        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
21453        let e = _mm256_set_ph(
21454            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21455        );
21456        assert_eq_m256h(r, e);
21457    }
21458
21459    #[simd_test(enable = "avx512fp16")]
21460    const fn test_mm512_fnmsub_ph() {
21461        let a = _mm512_set1_ph(1.0);
21462        let b = _mm512_set1_ph(2.0);
21463        let c = _mm512_set1_ph(3.0);
21464        let r = _mm512_fnmsub_ph(a, b, c);
21465        let e = _mm512_set1_ph(-5.0);
21466        assert_eq_m512h(r, e);
21467    }
21468
21469    #[simd_test(enable = "avx512fp16")]
21470    const fn test_mm512_mask_fnmsub_ph() {
21471        let a = _mm512_set1_ph(1.0);
21472        let b = _mm512_set1_ph(2.0);
21473        let c = _mm512_set1_ph(3.0);
21474        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
21475        let e = _mm512_set_ph(
21476            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21477            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21478        );
21479        assert_eq_m512h(r, e);
21480    }
21481
21482    #[simd_test(enable = "avx512fp16")]
21483    const fn test_mm512_mask3_fnmsub_ph() {
21484        let a = _mm512_set1_ph(1.0);
21485        let b = _mm512_set1_ph(2.0);
21486        let c = _mm512_set1_ph(3.0);
21487        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
21488        let e = _mm512_set_ph(
21489            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21490            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21491        );
21492        assert_eq_m512h(r, e);
21493    }
21494
21495    #[simd_test(enable = "avx512fp16")]
21496    const fn test_mm512_maskz_fnmsub_ph() {
21497        let a = _mm512_set1_ph(1.0);
21498        let b = _mm512_set1_ph(2.0);
21499        let c = _mm512_set1_ph(3.0);
21500        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
21501        let e = _mm512_set_ph(
21502            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21503            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21504        );
21505        assert_eq_m512h(r, e);
21506    }
21507
21508    #[simd_test(enable = "avx512fp16")]
21509    fn test_mm512_fnmsub_round_ph() {
21510        let a = _mm512_set1_ph(1.0);
21511        let b = _mm512_set1_ph(2.0);
21512        let c = _mm512_set1_ph(3.0);
21513        let r =
21514            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21515        let e = _mm512_set1_ph(-5.0);
21516        assert_eq_m512h(r, e);
21517    }
21518
21519    #[simd_test(enable = "avx512fp16")]
21520    fn test_mm512_mask_fnmsub_round_ph() {
21521        let a = _mm512_set1_ph(1.0);
21522        let b = _mm512_set1_ph(2.0);
21523        let c = _mm512_set1_ph(3.0);
21524        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21525            a,
21526            0b01010101010101010101010101010101,
21527            b,
21528            c,
21529        );
21530        let e = _mm512_set_ph(
21531            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21532            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21533        );
21534        assert_eq_m512h(r, e);
21535    }
21536
21537    #[simd_test(enable = "avx512fp16")]
21538    fn test_mm512_mask3_fnmsub_round_ph() {
21539        let a = _mm512_set1_ph(1.0);
21540        let b = _mm512_set1_ph(2.0);
21541        let c = _mm512_set1_ph(3.0);
21542        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21543            a,
21544            b,
21545            c,
21546            0b01010101010101010101010101010101,
21547        );
21548        let e = _mm512_set_ph(
21549            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21550            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21551        );
21552        assert_eq_m512h(r, e);
21553    }
21554
21555    #[simd_test(enable = "avx512fp16")]
21556    fn test_mm512_maskz_fnmsub_round_ph() {
21557        let a = _mm512_set1_ph(1.0);
21558        let b = _mm512_set1_ph(2.0);
21559        let c = _mm512_set1_ph(3.0);
21560        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21561            0b01010101010101010101010101010101,
21562            a,
21563            b,
21564            c,
21565        );
21566        let e = _mm512_set_ph(
21567            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21568            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21569        );
21570        assert_eq_m512h(r, e);
21571    }
21572
21573    #[simd_test(enable = "avx512fp16,avx512vl")]
21574    const fn test_mm_fnmsub_sh() {
21575        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21576        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21577        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21578        let r = _mm_fnmsub_sh(a, b, c);
21579        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21580        assert_eq_m128h(r, e);
21581    }
21582
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fnmsub_sh() {
        // Masked scalar fnmsub: mask bit 0 clear -> lane 0 keeps a0 (1.0);
        // set -> lane 0 = -(1 * 2) - 3 = -5. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21595
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fnmsub_sh() {
        // mask3 variant: mask bit 0 clear -> lane 0 keeps c0 (3.0);
        // set -> lane 0 = -(1 * 2) - 3 = -5. Upper lanes always come from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
21608
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fnmsub_sh() {
        // maskz variant: mask bit 0 clear -> lane 0 is zeroed;
        // set -> lane 0 = -(1 * 2) - 3 = -5. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21621
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fnmsub_round_sh() {
        // Scalar fnmsub with explicit rounding (nearest, no exceptions):
        // lane 0 = -(1 * 2) - 3 = -5; upper lanes copied from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21631
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fnmsub_round_sh() {
        // Masked scalar fnmsub with rounding: mask bit 0 clear -> lane 0 keeps
        // a0 (1.0); set -> lane 0 = -(1 * 2) - 3 = -5. Upper lanes from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21648
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask3_fnmsub_round_sh() {
        // mask3 scalar fnmsub with rounding: mask bit 0 clear -> lane 0 keeps
        // c0 (3.0); set -> lane 0 = -(1 * 2) - 3 = -5. Upper lanes from `c`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }
21665
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_fnmsub_round_sh() {
        // maskz scalar fnmsub with rounding: mask bit 0 clear -> lane 0 zeroed;
        // set -> lane 0 = -(1 * 2) - 3 = -5. Upper lanes from `a`.
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
21682
21683    #[simd_test(enable = "avx512fp16,avx512vl")]
21684    const fn test_mm_fmaddsub_ph() {
21685        let a = _mm_set1_ph(1.0);
21686        let b = _mm_set1_ph(2.0);
21687        let c = _mm_set1_ph(3.0);
21688        let r = _mm_fmaddsub_ph(a, b, c);
21689        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
21690        assert_eq_m128h(r, e);
21691    }
21692
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmaddsub_ph() {
        // fmaddsub: odd lanes a*b + c = 5, even lanes a*b - c = -1. Mask bit i
        // selects lane i (bit 0 = lowest lane = LAST _mm_set_ph argument);
        // unselected lanes keep `a` (1.0).
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }
21702
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmaddsub_ph() {
        // mask3 fmaddsub: selected lanes get 5 (odd) / -1 (even); unselected
        // lanes keep `c` (3.0). Bit 0 of the mask is the lowest lane.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }
21712
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmaddsub_ph() {
        // maskz fmaddsub: selected lanes get 5 (odd) / -1 (even); unselected
        // lanes are zeroed. Bit 0 of the mask is the lowest lane.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
        assert_eq_m128h(r, e);
    }
21722
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fmaddsub_ph() {
        // 256-bit fmaddsub: odd lanes a*b + c = 5, even lanes a*b - c = -1.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmaddsub_ph(a, b, c);
        let e = _mm256_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21734
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmaddsub_ph() {
        // Masked 256-bit fmaddsub: selected lanes 5 (odd) / -1 (even);
        // unselected lanes keep `a` (1.0). Bit 0 = lowest lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21746
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmaddsub_ph() {
        // mask3 256-bit fmaddsub: selected lanes 5 (odd) / -1 (even);
        // unselected lanes keep `c` (3.0). Bit 0 = lowest lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
        let e = _mm256_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21758
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmaddsub_ph() {
        // maskz 256-bit fmaddsub: selected lanes 5 (odd) / -1 (even);
        // unselected lanes zeroed. Bit 0 = lowest lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
        let e = _mm256_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
21770
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fmaddsub_ph() {
        // 512-bit fmaddsub: odd lanes a*b + c = 5, even lanes a*b - c = -1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmaddsub_ph(a, b, c);
        let e = _mm512_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21783
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmaddsub_ph() {
        // Masked 512-bit fmaddsub: selected lanes 5 (odd) / -1 (even);
        // unselected lanes keep `a` (1.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21796
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmaddsub_ph() {
        // mask3 512-bit fmaddsub: selected lanes 5 (odd) / -1 (even);
        // unselected lanes keep `c` (3.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
        let e = _mm512_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21809
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fmaddsub_ph() {
        // maskz 512-bit fmaddsub: selected lanes 5 (odd) / -1 (even);
        // unselected lanes zeroed. Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
        let e = _mm512_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21822
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmaddsub_round_ph() {
        // fmaddsub with explicit rounding (nearest, no exceptions):
        // odd lanes a*b + c = 5, even lanes a*b - c = -1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set_ph(
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21836
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmaddsub_round_ph() {
        // Masked fmaddsub with rounding: selected lanes 5 (odd) / -1 (even);
        // unselected lanes keep `a` (1.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b00110011001100110011001100110011,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21854
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmaddsub_round_ph() {
        // mask3 fmaddsub with rounding: selected lanes 5 (odd) / -1 (even);
        // unselected lanes keep `c` (3.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b00110011001100110011001100110011,
        );
        let e = _mm512_set_ph(
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21872
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmaddsub_round_ph() {
        // maskz fmaddsub with rounding: selected lanes 5 (odd) / -1 (even);
        // unselected lanes zeroed. Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b00110011001100110011001100110011,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }
21890
21891    #[simd_test(enable = "avx512fp16,avx512vl")]
21892    const fn test_mm_fmsubadd_ph() {
21893        let a = _mm_set1_ph(1.0);
21894        let b = _mm_set1_ph(2.0);
21895        let c = _mm_set1_ph(3.0);
21896        let r = _mm_fmsubadd_ph(a, b, c);
21897        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
21898        assert_eq_m128h(r, e);
21899    }
21900
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_fmsubadd_ph() {
        // fmsubadd: even lanes a*b + c = 5, odd lanes a*b - c = -1. Mask bit i
        // selects lane i (bit 0 = lowest lane = LAST _mm_set_ph argument);
        // unselected lanes keep `a` (1.0).
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }
21910
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask3_fmsubadd_ph() {
        // mask3 fmsubadd: selected lanes get 5 (even) / -1 (odd); unselected
        // lanes keep `c` (3.0). Bit 0 of the mask is the lowest lane.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }
21920
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_maskz_fmsubadd_ph() {
        // maskz fmsubadd: selected lanes get 5 (even) / -1 (odd); unselected
        // lanes are zeroed. Bit 0 of the mask is the lowest lane.
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
        assert_eq_m128h(r, e);
    }
21930
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_fmsubadd_ph() {
        // 256-bit fmsubadd: even lanes a*b + c = 5, odd lanes a*b - c = -1.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmsubadd_ph(a, b, c);
        let e = _mm256_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21942
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_fmsubadd_ph() {
        // Masked 256-bit fmsubadd: selected lanes 5 (even) / -1 (odd);
        // unselected lanes keep `a` (1.0). Bit 0 = lowest lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
        let e = _mm256_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21954
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask3_fmsubadd_ph() {
        // mask3 256-bit fmsubadd: selected lanes 5 (even) / -1 (odd);
        // unselected lanes keep `c` (3.0). Bit 0 = lowest lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
        let e = _mm256_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21966
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_maskz_fmsubadd_ph() {
        // maskz 256-bit fmsubadd: selected lanes 5 (even) / -1 (odd);
        // unselected lanes zeroed. Bit 0 = lowest lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
        let e = _mm256_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
21978
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_fmsubadd_ph() {
        // 512-bit fmsubadd: even lanes a*b + c = 5, odd lanes a*b - c = -1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmsubadd_ph(a, b, c);
        let e = _mm512_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
21991
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_fmsubadd_ph() {
        // Masked 512-bit fmsubadd: selected lanes 5 (even) / -1 (odd);
        // unselected lanes keep `a` (1.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
        let e = _mm512_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22004
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask3_fmsubadd_ph() {
        // mask3 512-bit fmsubadd: selected lanes 5 (even) / -1 (odd);
        // unselected lanes keep `c` (3.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
        let e = _mm512_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22017
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_maskz_fmsubadd_ph() {
        // maskz 512-bit fmsubadd: selected lanes 5 (even) / -1 (odd);
        // unselected lanes zeroed. Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
        let e = _mm512_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22030
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fmsubadd_round_ph() {
        // fmsubadd with explicit rounding (nearest, no exceptions):
        // even lanes a*b + c = 5, odd lanes a*b - c = -1.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r =
            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set_ph(
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22044
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fmsubadd_round_ph() {
        // Masked fmsubadd with rounding: selected lanes 5 (even) / -1 (odd);
        // unselected lanes keep `a` (1.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b00110011001100110011001100110011,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22062
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask3_fmsubadd_round_ph() {
        // mask3 fmsubadd with rounding: selected lanes 5 (even) / -1 (odd);
        // unselected lanes keep `c` (3.0). Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b00110011001100110011001100110011,
        );
        let e = _mm512_set_ph(
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22080
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_fmsubadd_round_ph() {
        // maskz fmsubadd with rounding: selected lanes 5 (even) / -1 (odd);
        // unselected lanes zeroed. Bit 0 = lowest lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b00110011001100110011001100110011,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
22098
22099    #[simd_test(enable = "avx512fp16,avx512vl")]
22100    fn test_mm_rcp_ph() {
22101        let a = _mm_set1_ph(2.0);
22102        let r = _mm_rcp_ph(a);
22103        let e = _mm_set1_ph(0.5);
22104        assert_eq_m128h(r, e);
22105    }
22106
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rcp_ph() {
        // Masked reciprocal: selected lanes get 1/2 = 0.5 (exact); unselected
        // lanes keep `src` (1.0). Bit 0 = lowest lane = LAST _mm_set_ph arg.
        let a = _mm_set1_ph(2.0);
        let src = _mm_set1_ph(1.0);
        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
        assert_eq_m128h(r, e);
    }
22115
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_rcp_ph() {
        // maskz reciprocal: selected lanes get 0.5; unselected lanes zeroed.
        let a = _mm_set1_ph(2.0);
        let r = _mm_maskz_rcp_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }
22123
22124    #[simd_test(enable = "avx512fp16,avx512vl")]
22125    fn test_mm256_rcp_ph() {
22126        let a = _mm256_set1_ph(2.0);
22127        let r = _mm256_rcp_ph(a);
22128        let e = _mm256_set1_ph(0.5);
22129        assert_eq_m256h(r, e);
22130    }
22131
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_rcp_ph() {
        // Masked 256-bit reciprocal: selected lanes 0.5; unselected keep `src`
        // (1.0). Bit 0 = lowest lane.
        let a = _mm256_set1_ph(2.0);
        let src = _mm256_set1_ph(1.0);
        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22142
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_rcp_ph() {
        // maskz 256-bit reciprocal: selected lanes 0.5; unselected zeroed.
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22152
22153    #[simd_test(enable = "avx512fp16")]
22154    fn test_mm512_rcp_ph() {
22155        let a = _mm512_set1_ph(2.0);
22156        let r = _mm512_rcp_ph(a);
22157        let e = _mm512_set1_ph(0.5);
22158        assert_eq_m512h(r, e);
22159    }
22160
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_rcp_ph() {
        // Masked 512-bit reciprocal: selected lanes 0.5; unselected keep `src`
        // (1.0). Bit 0 = lowest lane = LAST _mm512_set_ph argument.
        let a = _mm512_set1_ph(2.0);
        let src = _mm512_set1_ph(1.0);
        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22172
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_rcp_ph() {
        // maskz 512-bit reciprocal: selected lanes 0.5; unselected zeroed.
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22183
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_rcp_sh() {
        // Scalar reciprocal: lane 0 = 1/b0 = 1/2 = 0.5 (exact);
        // upper 7 lanes are copied from `a`.
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_rcp_sh(a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22192
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rcp_sh() {
        // Masked scalar reciprocal: mask bit 0 clear -> lane 0 keeps src0 (3.0);
        // set -> lane 0 = 1/2 = 0.5. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_rcp_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_rcp_sh(src, 1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22205
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_rcp_sh() {
        // maskz scalar reciprocal: mask bit 0 clear -> lane 0 zeroed;
        // set -> lane 0 = 1/2 = 0.5. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_maskz_rcp_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_rcp_sh(1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22217
22218    #[simd_test(enable = "avx512fp16,avx512vl")]
22219    fn test_mm_rsqrt_ph() {
22220        let a = _mm_set1_ph(4.0);
22221        let r = _mm_rsqrt_ph(a);
22222        let e = _mm_set1_ph(0.5);
22223        assert_eq_m128h(r, e);
22224    }
22225
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rsqrt_ph() {
        // Masked rsqrt: selected lanes get 1/sqrt(4) = 0.5 (exact); unselected
        // lanes keep `src` (1.0). Bit 0 = lowest lane = LAST _mm_set_ph arg.
        let a = _mm_set1_ph(4.0);
        let src = _mm_set1_ph(1.0);
        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
        assert_eq_m128h(r, e);
    }
22234
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_rsqrt_ph() {
        // maskz rsqrt: selected lanes get 0.5; unselected lanes zeroed.
        let a = _mm_set1_ph(4.0);
        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }
22242
22243    #[simd_test(enable = "avx512fp16,avx512vl")]
22244    fn test_mm256_rsqrt_ph() {
22245        let a = _mm256_set1_ph(4.0);
22246        let r = _mm256_rsqrt_ph(a);
22247        let e = _mm256_set1_ph(0.5);
22248        assert_eq_m256h(r, e);
22249    }
22250
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_rsqrt_ph() {
        // Masked 256-bit rsqrt: selected lanes 0.5; unselected keep `src` (1.0).
        // Bit 0 = lowest lane.
        let a = _mm256_set1_ph(4.0);
        let src = _mm256_set1_ph(1.0);
        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22261
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_rsqrt_ph() {
        // maskz 256-bit rsqrt: selected lanes 0.5; unselected zeroed.
        let a = _mm256_set1_ph(4.0);
        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
22271
22272    #[simd_test(enable = "avx512fp16")]
22273    fn test_mm512_rsqrt_ph() {
22274        let a = _mm512_set1_ph(4.0);
22275        let r = _mm512_rsqrt_ph(a);
22276        let e = _mm512_set1_ph(0.5);
22277        assert_eq_m512h(r, e);
22278    }
22279
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_rsqrt_ph() {
        // Masked 512-bit rsqrt: selected lanes 0.5; unselected keep `src` (1.0).
        // Bit 0 = lowest lane = LAST _mm512_set_ph argument.
        let a = _mm512_set1_ph(4.0);
        let src = _mm512_set1_ph(1.0);
        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22291
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_rsqrt_ph() {
        // maskz 512-bit rsqrt: selected lanes 0.5; unselected zeroed.
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22302
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_rsqrt_sh() {
        // Scalar rsqrt: lane 0 = 1/sqrt(b0) = 1/sqrt(4) = 0.5 (exact);
        // upper 7 lanes are copied from `a`.
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_rsqrt_sh(a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22311
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_rsqrt_sh() {
        // Masked scalar rsqrt: mask bit 0 clear -> lane 0 keeps src0 (3.0);
        // set -> lane 0 = 1/sqrt(4) = 0.5. Upper lanes always come from `a`.
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22324
22325    #[simd_test(enable = "avx512fp16,avx512vl")]
22326    fn test_mm_maskz_rsqrt_sh() {
22327        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22328        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22329        let r = _mm_maskz_rsqrt_sh(0, a, b);
22330        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22331        assert_eq_m128h(r, e);
22332        let r = _mm_maskz_rsqrt_sh(1, a, b);
22333        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22334        assert_eq_m128h(r, e);
22335    }
22336
22337    #[simd_test(enable = "avx512fp16,avx512vl")]
22338    fn test_mm_sqrt_ph() {
22339        let a = _mm_set1_ph(4.0);
22340        let r = _mm_sqrt_ph(a);
22341        let e = _mm_set1_ph(2.0);
22342        assert_eq_m128h(r, e);
22343    }
22344
22345    #[simd_test(enable = "avx512fp16,avx512vl")]
22346    fn test_mm_mask_sqrt_ph() {
22347        let a = _mm_set1_ph(4.0);
22348        let src = _mm_set1_ph(1.0);
22349        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
22350        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
22351        assert_eq_m128h(r, e);
22352    }
22353
22354    #[simd_test(enable = "avx512fp16,avx512vl")]
22355    fn test_mm_maskz_sqrt_ph() {
22356        let a = _mm_set1_ph(4.0);
22357        let r = _mm_maskz_sqrt_ph(0b01010101, a);
22358        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22359        assert_eq_m128h(r, e);
22360    }
22361
22362    #[simd_test(enable = "avx512fp16,avx512vl")]
22363    fn test_mm256_sqrt_ph() {
22364        let a = _mm256_set1_ph(4.0);
22365        let r = _mm256_sqrt_ph(a);
22366        let e = _mm256_set1_ph(2.0);
22367        assert_eq_m256h(r, e);
22368    }
22369
22370    #[simd_test(enable = "avx512fp16,avx512vl")]
22371    fn test_mm256_mask_sqrt_ph() {
22372        let a = _mm256_set1_ph(4.0);
22373        let src = _mm256_set1_ph(1.0);
22374        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
22375        let e = _mm256_set_ph(
22376            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22377        );
22378        assert_eq_m256h(r, e);
22379    }
22380
22381    #[simd_test(enable = "avx512fp16,avx512vl")]
22382    fn test_mm256_maskz_sqrt_ph() {
22383        let a = _mm256_set1_ph(4.0);
22384        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
22385        let e = _mm256_set_ph(
22386            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22387        );
22388        assert_eq_m256h(r, e);
22389    }
22390
22391    #[simd_test(enable = "avx512fp16")]
22392    fn test_mm512_sqrt_ph() {
22393        let a = _mm512_set1_ph(4.0);
22394        let r = _mm512_sqrt_ph(a);
22395        let e = _mm512_set1_ph(2.0);
22396        assert_eq_m512h(r, e);
22397    }
22398
22399    #[simd_test(enable = "avx512fp16")]
22400    fn test_mm512_mask_sqrt_ph() {
22401        let a = _mm512_set1_ph(4.0);
22402        let src = _mm512_set1_ph(1.0);
22403        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
22404        let e = _mm512_set_ph(
22405            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22406            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22407        );
22408        assert_eq_m512h(r, e);
22409    }
22410
22411    #[simd_test(enable = "avx512fp16")]
22412    fn test_mm512_maskz_sqrt_ph() {
22413        let a = _mm512_set1_ph(4.0);
22414        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
22415        let e = _mm512_set_ph(
22416            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22417            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22418        );
22419        assert_eq_m512h(r, e);
22420    }
22421
22422    #[simd_test(enable = "avx512fp16")]
22423    fn test_mm512_sqrt_round_ph() {
22424        let a = _mm512_set1_ph(4.0);
22425        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
22426        let e = _mm512_set1_ph(2.0);
22427        assert_eq_m512h(r, e);
22428    }
22429
22430    #[simd_test(enable = "avx512fp16")]
22431    fn test_mm512_mask_sqrt_round_ph() {
22432        let a = _mm512_set1_ph(4.0);
22433        let src = _mm512_set1_ph(1.0);
22434        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22435            src,
22436            0b01010101010101010101010101010101,
22437            a,
22438        );
22439        let e = _mm512_set_ph(
22440            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22441            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22442        );
22443        assert_eq_m512h(r, e);
22444    }
22445
22446    #[simd_test(enable = "avx512fp16")]
22447    fn test_mm512_maskz_sqrt_round_ph() {
22448        let a = _mm512_set1_ph(4.0);
22449        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22450            0b01010101010101010101010101010101,
22451            a,
22452        );
22453        let e = _mm512_set_ph(
22454            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22455            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22456        );
22457        assert_eq_m512h(r, e);
22458    }
22459
22460    #[simd_test(enable = "avx512fp16,avx512vl")]
22461    fn test_mm_sqrt_sh() {
22462        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22463        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22464        let r = _mm_sqrt_sh(a, b);
22465        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22466        assert_eq_m128h(r, e);
22467    }
22468
22469    #[simd_test(enable = "avx512fp16,avx512vl")]
22470    fn test_mm_mask_sqrt_sh() {
22471        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22472        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22473        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22474        let r = _mm_mask_sqrt_sh(src, 0, a, b);
22475        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22476        assert_eq_m128h(r, e);
22477        let r = _mm_mask_sqrt_sh(src, 1, a, b);
22478        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22479        assert_eq_m128h(r, e);
22480    }
22481
22482    #[simd_test(enable = "avx512fp16,avx512vl")]
22483    fn test_mm_maskz_sqrt_sh() {
22484        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22485        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22486        let r = _mm_maskz_sqrt_sh(0, a, b);
22487        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22488        assert_eq_m128h(r, e);
22489        let r = _mm_maskz_sqrt_sh(1, a, b);
22490        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22491        assert_eq_m128h(r, e);
22492    }
22493
22494    #[simd_test(enable = "avx512fp16,avx512vl")]
22495    fn test_mm_sqrt_round_sh() {
22496        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22497        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22498        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22499        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22500        assert_eq_m128h(r, e);
22501    }
22502
22503    #[simd_test(enable = "avx512fp16,avx512vl")]
22504    fn test_mm_mask_sqrt_round_sh() {
22505        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22506        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22507        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22508        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22509            src, 0, a, b,
22510        );
22511        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22512        assert_eq_m128h(r, e);
22513        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22514            src, 1, a, b,
22515        );
22516        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22517        assert_eq_m128h(r, e);
22518    }
22519
22520    #[simd_test(enable = "avx512fp16,avx512vl")]
22521    fn test_mm_maskz_sqrt_round_sh() {
22522        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22523        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22524        let r =
22525            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22526        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22527        assert_eq_m128h(r, e);
22528        let r =
22529            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22530        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22531        assert_eq_m128h(r, e);
22532    }
22533
22534    #[simd_test(enable = "avx512fp16,avx512vl")]
22535    fn test_mm_max_ph() {
22536        let a = _mm_set1_ph(2.0);
22537        let b = _mm_set1_ph(1.0);
22538        let r = _mm_max_ph(a, b);
22539        let e = _mm_set1_ph(2.0);
22540        assert_eq_m128h(r, e);
22541    }
22542
22543    #[simd_test(enable = "avx512fp16,avx512vl")]
22544    fn test_mm_mask_max_ph() {
22545        let a = _mm_set1_ph(2.0);
22546        let b = _mm_set1_ph(1.0);
22547        let src = _mm_set1_ph(3.0);
22548        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
22549        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
22550        assert_eq_m128h(r, e);
22551    }
22552
22553    #[simd_test(enable = "avx512fp16,avx512vl")]
22554    fn test_mm_maskz_max_ph() {
22555        let a = _mm_set1_ph(2.0);
22556        let b = _mm_set1_ph(1.0);
22557        let r = _mm_maskz_max_ph(0b01010101, a, b);
22558        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22559        assert_eq_m128h(r, e);
22560    }
22561
22562    #[simd_test(enable = "avx512fp16,avx512vl")]
22563    fn test_mm256_max_ph() {
22564        let a = _mm256_set1_ph(2.0);
22565        let b = _mm256_set1_ph(1.0);
22566        let r = _mm256_max_ph(a, b);
22567        let e = _mm256_set1_ph(2.0);
22568        assert_eq_m256h(r, e);
22569    }
22570
22571    #[simd_test(enable = "avx512fp16,avx512vl")]
22572    fn test_mm256_mask_max_ph() {
22573        let a = _mm256_set1_ph(2.0);
22574        let b = _mm256_set1_ph(1.0);
22575        let src = _mm256_set1_ph(3.0);
22576        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
22577        let e = _mm256_set_ph(
22578            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22579        );
22580        assert_eq_m256h(r, e);
22581    }
22582
22583    #[simd_test(enable = "avx512fp16,avx512vl")]
22584    fn test_mm256_maskz_max_ph() {
22585        let a = _mm256_set1_ph(2.0);
22586        let b = _mm256_set1_ph(1.0);
22587        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
22588        let e = _mm256_set_ph(
22589            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22590        );
22591        assert_eq_m256h(r, e);
22592    }
22593
22594    #[simd_test(enable = "avx512fp16")]
22595    fn test_mm512_max_ph() {
22596        let a = _mm512_set1_ph(2.0);
22597        let b = _mm512_set1_ph(1.0);
22598        let r = _mm512_max_ph(a, b);
22599        let e = _mm512_set1_ph(2.0);
22600        assert_eq_m512h(r, e);
22601    }
22602
22603    #[simd_test(enable = "avx512fp16")]
22604    fn test_mm512_mask_max_ph() {
22605        let a = _mm512_set1_ph(2.0);
22606        let b = _mm512_set1_ph(1.0);
22607        let src = _mm512_set1_ph(3.0);
22608        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
22609        let e = _mm512_set_ph(
22610            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22611            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22612        );
22613        assert_eq_m512h(r, e);
22614    }
22615
22616    #[simd_test(enable = "avx512fp16")]
22617    fn test_mm512_maskz_max_ph() {
22618        let a = _mm512_set1_ph(2.0);
22619        let b = _mm512_set1_ph(1.0);
22620        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
22621        let e = _mm512_set_ph(
22622            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22623            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22624        );
22625        assert_eq_m512h(r, e);
22626    }
22627
22628    #[simd_test(enable = "avx512fp16")]
22629    fn test_mm512_max_round_ph() {
22630        let a = _mm512_set1_ph(2.0);
22631        let b = _mm512_set1_ph(1.0);
22632        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22633        let e = _mm512_set1_ph(2.0);
22634        assert_eq_m512h(r, e);
22635    }
22636
22637    #[simd_test(enable = "avx512fp16")]
22638    fn test_mm512_mask_max_round_ph() {
22639        let a = _mm512_set1_ph(2.0);
22640        let b = _mm512_set1_ph(1.0);
22641        let src = _mm512_set1_ph(3.0);
22642        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22643            src,
22644            0b01010101010101010101010101010101,
22645            a,
22646            b,
22647        );
22648        let e = _mm512_set_ph(
22649            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22650            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22651        );
22652        assert_eq_m512h(r, e);
22653    }
22654
22655    #[simd_test(enable = "avx512fp16")]
22656    fn test_mm512_maskz_max_round_ph() {
22657        let a = _mm512_set1_ph(2.0);
22658        let b = _mm512_set1_ph(1.0);
22659        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22660            0b01010101010101010101010101010101,
22661            a,
22662            b,
22663        );
22664        let e = _mm512_set_ph(
22665            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22666            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22667        );
22668        assert_eq_m512h(r, e);
22669    }
22670
22671    #[simd_test(enable = "avx512fp16,avx512vl")]
22672    fn test_mm_max_sh() {
22673        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22674        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22675        let r = _mm_max_sh(a, b);
22676        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22677        assert_eq_m128h(r, e);
22678    }
22679
22680    #[simd_test(enable = "avx512fp16,avx512vl")]
22681    fn test_mm_mask_max_sh() {
22682        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22683        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22684        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22685        let r = _mm_mask_max_sh(src, 0, a, b);
22686        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22687        assert_eq_m128h(r, e);
22688        let r = _mm_mask_max_sh(src, 1, a, b);
22689        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22690        assert_eq_m128h(r, e);
22691    }
22692
22693    #[simd_test(enable = "avx512fp16,avx512vl")]
22694    fn test_mm_maskz_max_sh() {
22695        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22696        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22697        let r = _mm_maskz_max_sh(0, a, b);
22698        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22699        assert_eq_m128h(r, e);
22700        let r = _mm_maskz_max_sh(1, a, b);
22701        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22702        assert_eq_m128h(r, e);
22703    }
22704
22705    #[simd_test(enable = "avx512fp16,avx512vl")]
22706    fn test_mm_max_round_sh() {
22707        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22708        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22709        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22710        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22711        assert_eq_m128h(r, e);
22712    }
22713
22714    #[simd_test(enable = "avx512fp16,avx512vl")]
22715    fn test_mm_mask_max_round_sh() {
22716        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22717        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22718        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22719        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22720            src, 0, a, b,
22721        );
22722        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22723        assert_eq_m128h(r, e);
22724        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22725            src, 1, a, b,
22726        );
22727        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22728        assert_eq_m128h(r, e);
22729    }
22730
22731    #[simd_test(enable = "avx512fp16,avx512vl")]
22732    fn test_mm_maskz_max_round_sh() {
22733        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22734        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22735        let r =
22736            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22737        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22738        assert_eq_m128h(r, e);
22739        let r =
22740            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22741        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22742        assert_eq_m128h(r, e);
22743    }
22744
22745    #[simd_test(enable = "avx512fp16,avx512vl")]
22746    fn test_mm_min_ph() {
22747        let a = _mm_set1_ph(2.0);
22748        let b = _mm_set1_ph(1.0);
22749        let r = _mm_min_ph(a, b);
22750        let e = _mm_set1_ph(1.0);
22751        assert_eq_m128h(r, e);
22752    }
22753
22754    #[simd_test(enable = "avx512fp16,avx512vl")]
22755    fn test_mm_mask_min_ph() {
22756        let a = _mm_set1_ph(2.0);
22757        let b = _mm_set1_ph(1.0);
22758        let src = _mm_set1_ph(3.0);
22759        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
22760        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
22761        assert_eq_m128h(r, e);
22762    }
22763
22764    #[simd_test(enable = "avx512fp16,avx512vl")]
22765    fn test_mm_maskz_min_ph() {
22766        let a = _mm_set1_ph(2.0);
22767        let b = _mm_set1_ph(1.0);
22768        let r = _mm_maskz_min_ph(0b01010101, a, b);
22769        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22770        assert_eq_m128h(r, e);
22771    }
22772
22773    #[simd_test(enable = "avx512fp16,avx512vl")]
22774    fn test_mm256_min_ph() {
22775        let a = _mm256_set1_ph(2.0);
22776        let b = _mm256_set1_ph(1.0);
22777        let r = _mm256_min_ph(a, b);
22778        let e = _mm256_set1_ph(1.0);
22779        assert_eq_m256h(r, e);
22780    }
22781
22782    #[simd_test(enable = "avx512fp16,avx512vl")]
22783    fn test_mm256_mask_min_ph() {
22784        let a = _mm256_set1_ph(2.0);
22785        let b = _mm256_set1_ph(1.0);
22786        let src = _mm256_set1_ph(3.0);
22787        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
22788        let e = _mm256_set_ph(
22789            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22790        );
22791        assert_eq_m256h(r, e);
22792    }
22793
22794    #[simd_test(enable = "avx512fp16,avx512vl")]
22795    fn test_mm256_maskz_min_ph() {
22796        let a = _mm256_set1_ph(2.0);
22797        let b = _mm256_set1_ph(1.0);
22798        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
22799        let e = _mm256_set_ph(
22800            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22801        );
22802        assert_eq_m256h(r, e);
22803    }
22804
22805    #[simd_test(enable = "avx512fp16")]
22806    fn test_mm512_min_ph() {
22807        let a = _mm512_set1_ph(2.0);
22808        let b = _mm512_set1_ph(1.0);
22809        let r = _mm512_min_ph(a, b);
22810        let e = _mm512_set1_ph(1.0);
22811        assert_eq_m512h(r, e);
22812    }
22813
22814    #[simd_test(enable = "avx512fp16")]
22815    fn test_mm512_mask_min_ph() {
22816        let a = _mm512_set1_ph(2.0);
22817        let b = _mm512_set1_ph(1.0);
22818        let src = _mm512_set1_ph(3.0);
22819        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
22820        let e = _mm512_set_ph(
22821            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22822            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22823        );
22824        assert_eq_m512h(r, e);
22825    }
22826
22827    #[simd_test(enable = "avx512fp16")]
22828    fn test_mm512_maskz_min_ph() {
22829        let a = _mm512_set1_ph(2.0);
22830        let b = _mm512_set1_ph(1.0);
22831        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
22832        let e = _mm512_set_ph(
22833            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22834            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22835        );
22836        assert_eq_m512h(r, e);
22837    }
22838
22839    #[simd_test(enable = "avx512fp16")]
22840    fn test_mm512_min_round_ph() {
22841        let a = _mm512_set1_ph(2.0);
22842        let b = _mm512_set1_ph(1.0);
22843        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22844        let e = _mm512_set1_ph(1.0);
22845        assert_eq_m512h(r, e);
22846    }
22847
22848    #[simd_test(enable = "avx512fp16")]
22849    fn test_mm512_mask_min_round_ph() {
22850        let a = _mm512_set1_ph(2.0);
22851        let b = _mm512_set1_ph(1.0);
22852        let src = _mm512_set1_ph(3.0);
22853        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22854            src,
22855            0b01010101010101010101010101010101,
22856            a,
22857            b,
22858        );
22859        let e = _mm512_set_ph(
22860            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22861            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22862        );
22863        assert_eq_m512h(r, e);
22864    }
22865
22866    #[simd_test(enable = "avx512fp16")]
22867    fn test_mm512_maskz_min_round_ph() {
22868        let a = _mm512_set1_ph(2.0);
22869        let b = _mm512_set1_ph(1.0);
22870        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22871            0b01010101010101010101010101010101,
22872            a,
22873            b,
22874        );
22875        let e = _mm512_set_ph(
22876            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22877            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22878        );
22879        assert_eq_m512h(r, e);
22880    }
22881
22882    #[simd_test(enable = "avx512fp16,avx512vl")]
22883    fn test_mm_min_sh() {
22884        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22885        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22886        let r = _mm_min_sh(a, b);
22887        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22888        assert_eq_m128h(r, e);
22889    }
22890
22891    #[simd_test(enable = "avx512fp16,avx512vl")]
22892    fn test_mm_mask_min_sh() {
22893        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22894        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22895        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22896        let r = _mm_mask_min_sh(src, 0, a, b);
22897        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22898        assert_eq_m128h(r, e);
22899        let r = _mm_mask_min_sh(src, 1, a, b);
22900        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22901        assert_eq_m128h(r, e);
22902    }
22903
22904    #[simd_test(enable = "avx512fp16,avx512vl")]
22905    fn test_mm_maskz_min_sh() {
22906        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22907        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22908        let r = _mm_maskz_min_sh(0, a, b);
22909        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22910        assert_eq_m128h(r, e);
22911        let r = _mm_maskz_min_sh(1, a, b);
22912        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22913        assert_eq_m128h(r, e);
22914    }
22915
22916    #[simd_test(enable = "avx512fp16,avx512vl")]
22917    fn test_mm_min_round_sh() {
22918        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22919        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22920        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22921        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22922        assert_eq_m128h(r, e);
22923    }
22924
22925    #[simd_test(enable = "avx512fp16,avx512vl")]
22926    fn test_mm_mask_min_round_sh() {
22927        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22928        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22929        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22930        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22931            src, 0, a, b,
22932        );
22933        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22934        assert_eq_m128h(r, e);
22935        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22936            src, 1, a, b,
22937        );
22938        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22939        assert_eq_m128h(r, e);
22940    }
22941
22942    #[simd_test(enable = "avx512fp16,avx512vl")]
22943    fn test_mm_maskz_min_round_sh() {
22944        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22945        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22946        let r =
22947            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22948        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22949        assert_eq_m128h(r, e);
22950        let r =
22951            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22952        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22953        assert_eq_m128h(r, e);
22954    }
22955
22956    #[simd_test(enable = "avx512fp16,avx512vl")]
22957    fn test_mm_getexp_ph() {
22958        let a = _mm_set1_ph(3.0);
22959        let r = _mm_getexp_ph(a);
22960        let e = _mm_set1_ph(1.0);
22961        assert_eq_m128h(r, e);
22962    }
22963
22964    #[simd_test(enable = "avx512fp16,avx512vl")]
22965    fn test_mm_mask_getexp_ph() {
22966        let a = _mm_set1_ph(3.0);
22967        let src = _mm_set1_ph(4.0);
22968        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
22969        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
22970        assert_eq_m128h(r, e);
22971    }
22972
22973    #[simd_test(enable = "avx512fp16,avx512vl")]
22974    fn test_mm_maskz_getexp_ph() {
22975        let a = _mm_set1_ph(3.0);
22976        let r = _mm_maskz_getexp_ph(0b01010101, a);
22977        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22978        assert_eq_m128h(r, e);
22979    }
22980
22981    #[simd_test(enable = "avx512fp16,avx512vl")]
22982    fn test_mm256_getexp_ph() {
22983        let a = _mm256_set1_ph(3.0);
22984        let r = _mm256_getexp_ph(a);
22985        let e = _mm256_set1_ph(1.0);
22986        assert_eq_m256h(r, e);
22987    }
22988
22989    #[simd_test(enable = "avx512fp16,avx512vl")]
22990    fn test_mm256_mask_getexp_ph() {
22991        let a = _mm256_set1_ph(3.0);
22992        let src = _mm256_set1_ph(4.0);
22993        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
22994        let e = _mm256_set_ph(
22995            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22996        );
22997        assert_eq_m256h(r, e);
22998    }
22999
23000    #[simd_test(enable = "avx512fp16,avx512vl")]
23001    fn test_mm256_maskz_getexp_ph() {
23002        let a = _mm256_set1_ph(3.0);
23003        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
23004        let e = _mm256_set_ph(
23005            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23006        );
23007        assert_eq_m256h(r, e);
23008    }
23009
23010    #[simd_test(enable = "avx512fp16")]
23011    fn test_mm512_getexp_ph() {
23012        let a = _mm512_set1_ph(3.0);
23013        let r = _mm512_getexp_ph(a);
23014        let e = _mm512_set1_ph(1.0);
23015        assert_eq_m512h(r, e);
23016    }
23017
23018    #[simd_test(enable = "avx512fp16")]
23019    fn test_mm512_mask_getexp_ph() {
23020        let a = _mm512_set1_ph(3.0);
23021        let src = _mm512_set1_ph(4.0);
23022        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
23023        let e = _mm512_set_ph(
23024            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
23025            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
23026        );
23027        assert_eq_m512h(r, e);
23028    }
23029
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_getexp_ph() {
        let a = _mm512_set1_ph(3.0);
        // Zero-masking: clear mask bits zero the lane; set bits get
        // getexp(3.0) = 1.0. Lane 0 is the last argument of `_mm512_set_ph`.
        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23040
23041    #[simd_test(enable = "avx512fp16")]
23042    fn test_mm512_getexp_round_ph() {
23043        let a = _mm512_set1_ph(3.0);
23044        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
23045        let e = _mm512_set1_ph(1.0);
23046        assert_eq_m512h(r, e);
23047    }
23048
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_getexp_round_ph() {
        let a = _mm512_set1_ph(3.0);
        let src = _mm512_set1_ph(4.0);
        // Merge-masking with suppressed exceptions: set bits take
        // getexp(3.0) = 1.0, clear bits keep `src` (4.0).
        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23064
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_getexp_round_ph() {
        let a = _mm512_set1_ph(3.0);
        // Zero-masking with suppressed exceptions: clear bits zero the lane,
        // set bits get getexp(3.0) = 1.0.
        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23078
23079    #[simd_test(enable = "avx512fp16,avx512vl")]
23080    fn test_mm_getexp_sh() {
23081        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
23082        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23083        let r = _mm_getexp_sh(a, b);
23084        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23085        assert_eq_m128h(r, e);
23086    }
23087
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: the low lane comes from `src`; lanes 1..7 always
        // come from `a`.
        let r = _mm_mask_getexp_sh(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: the low lane is getexp(b[0]) = getexp(3.0) = 1.0.
        let r = _mm_mask_getexp_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23100
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        // Mask bit 0 clear: the low lane is zeroed; lanes 1..7 come from `a`.
        let r = _mm_maskz_getexp_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: the low lane is getexp(b[0]) = getexp(3.0) = 1.0.
        let r = _mm_maskz_getexp_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23112
23113    #[simd_test(enable = "avx512fp16,avx512vl")]
23114    fn test_mm_getexp_round_sh() {
23115        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
23116        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23117        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
23118        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23119        assert_eq_m128h(r, e);
23120    }
23121
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane comes from `src`; lanes 1..7 come from `a`.
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is getexp(b[0]) = getexp(3.0) = 1.0.
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23134
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        // Mask bit 0 clear: the low lane is zeroed; lanes 1..7 come from `a`.
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is getexp(b[0]) = getexp(3.0) = 1.0.
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23146
23147    #[simd_test(enable = "avx512fp16,avx512vl")]
23148    fn test_mm_getmant_ph() {
23149        let a = _mm_set1_ph(10.0);
23150        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
23151        let e = _mm_set1_ph(1.25);
23152        assert_eq_m128h(r, e);
23153    }
23154
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let src = _mm_set1_ph(20.0);
        // Merge-masking: set bits get the normalized mantissa of 10.0 (1.25);
        // clear bits keep `src`. Lane 0 is the last `_mm_set_ph` argument.
        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
        assert_eq_m128h(r, e);
    }
23163
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        // Zero-masking: set bits get the normalized mantissa of 10.0 (1.25);
        // clear bits become 0.0.
        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
        assert_eq_m128h(r, e);
    }
23171
23172    #[simd_test(enable = "avx512fp16,avx512vl")]
23173    fn test_mm256_getmant_ph() {
23174        let a = _mm256_set1_ph(10.0);
23175        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
23176        let e = _mm256_set1_ph(1.25);
23177        assert_eq_m256h(r, e);
23178    }
23179
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let src = _mm256_set1_ph(20.0);
        // Merge-masking: set bits get the normalized mantissa of 10.0 (1.25);
        // clear bits keep `src`. Lane 0 is the last `_mm256_set_ph` argument.
        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }
23195
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        // Zero-masking: set bits get the normalized mantissa of 10.0 (1.25);
        // clear bits become 0.0.
        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }
23208
23209    #[simd_test(enable = "avx512fp16")]
23210    fn test_mm512_getmant_ph() {
23211        let a = _mm512_set1_ph(10.0);
23212        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
23213        let e = _mm512_set1_ph(1.25);
23214        assert_eq_m512h(r, e);
23215    }
23216
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        // Merge-masking: set bits get the normalized mantissa of 10.0 (1.25);
        // clear bits keep `src`. Lane 0 is the last `_mm512_set_ph` argument.
        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23233
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        // Zero-masking: set bits get the normalized mantissa of 10.0 (1.25);
        // clear bits become 0.0.
        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23247
23248    #[simd_test(enable = "avx512fp16")]
23249    fn test_mm512_getmant_round_ph() {
23250        let a = _mm512_set1_ph(10.0);
23251        let r =
23252            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23253                a,
23254            );
23255        let e = _mm512_set1_ph(1.25);
23256        assert_eq_m512h(r, e);
23257    }
23258
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        // Merge-masking with suppressed exceptions: set bits get the normalized
        // mantissa of 10.0 (1.25); clear bits keep `src`.
        let r = _mm512_mask_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23275
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        // Zero-masking with suppressed exceptions: set bits get the normalized
        // mantissa of 10.0 (1.25); clear bits become 0.0.
        let r = _mm512_maskz_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23290
23291    #[simd_test(enable = "avx512fp16,avx512vl")]
23292    fn test_mm_getmant_sh() {
23293        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23294        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23295        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
23296        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23297        assert_eq_m128h(r, e);
23298    }
23299
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane comes from `src`; lanes 1..7 come from `a`.
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is the normalized mantissa of b[0] = 10.0 (1.25).
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23312
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        // Mask bit 0 clear: the low lane is zeroed; lanes 1..7 come from `a`.
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is the normalized mantissa of b[0] = 10.0 (1.25).
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23324
23325    #[simd_test(enable = "avx512fp16,avx512vl")]
23326    fn test_mm_getmant_round_sh() {
23327        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23328        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23329        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23330            a, b,
23331        );
23332        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23333        assert_eq_m128h(r, e);
23334    }
23335
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane comes from `src`; lanes 1..7 come from `a`.
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is the normalized mantissa of b[0] = 10.0 (1.25).
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23356
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        // Mask bit 0 clear: the low lane is zeroed; lanes 1..7 come from `a`.
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is the normalized mantissa of b[0] = 10.0 (1.25).
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23376
23377    #[simd_test(enable = "avx512fp16,avx512vl")]
23378    fn test_mm_roundscale_ph() {
23379        let a = _mm_set1_ph(1.1);
23380        let r = _mm_roundscale_ph::<0>(a);
23381        let e = _mm_set1_ph(1.0);
23382        assert_eq_m128h(r, e);
23383    }
23384
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let src = _mm_set1_ph(2.0);
        // Merge-masking: set bits get 1.1 rounded (imm8 = 0) to 1.0; clear bits
        // keep `src`. Lane 0 is the last `_mm_set_ph` argument.
        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
        assert_eq_m128h(r, e);
    }
23393
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        // Zero-masking: set bits get 1.1 rounded to 1.0; clear bits become 0.0.
        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }
23401
23402    #[simd_test(enable = "avx512fp16,avx512vl")]
23403    fn test_mm256_roundscale_ph() {
23404        let a = _mm256_set1_ph(1.1);
23405        let r = _mm256_roundscale_ph::<0>(a);
23406        let e = _mm256_set1_ph(1.0);
23407        assert_eq_m256h(r, e);
23408    }
23409
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let src = _mm256_set1_ph(2.0);
        // Merge-masking: set bits get 1.1 rounded (imm8 = 0) to 1.0; clear bits
        // keep `src`. Lane 0 is the last `_mm256_set_ph` argument.
        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
23420
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        // Zero-masking: set bits get 1.1 rounded to 1.0; clear bits become 0.0.
        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
23430
23431    #[simd_test(enable = "avx512fp16")]
23432    fn test_mm512_roundscale_ph() {
23433        let a = _mm512_set1_ph(1.1);
23434        let r = _mm512_roundscale_ph::<0>(a);
23435        let e = _mm512_set1_ph(1.0);
23436        assert_eq_m512h(r, e);
23437    }
23438
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        // Merge-masking: set bits get 1.1 rounded (imm8 = 0) to 1.0; clear bits
        // keep `src`. Lane 0 is the last `_mm512_set_ph` argument.
        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23450
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        // Zero-masking: set bits get 1.1 rounded to 1.0; clear bits become 0.0.
        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23461
23462    #[simd_test(enable = "avx512fp16")]
23463    fn test_mm512_roundscale_round_ph() {
23464        let a = _mm512_set1_ph(1.1);
23465        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
23466        let e = _mm512_set1_ph(1.0);
23467        assert_eq_m512h(r, e);
23468    }
23469
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        // Merge-masking with suppressed exceptions: set bits get 1.1 rounded to
        // 1.0; clear bits keep `src`.
        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23485
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        // Zero-masking with suppressed exceptions: set bits get 1.1 rounded to
        // 1.0; clear bits become 0.0.
        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23499
23500    #[simd_test(enable = "avx512fp16,avx512vl")]
23501    fn test_mm_roundscale_sh() {
23502        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23503        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23504        let r = _mm_roundscale_sh::<0>(a, b);
23505        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23506        assert_eq_m128h(r, e);
23507    }
23508
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane comes from `src`; lanes 1..7 come from `a`.
        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is b[0] = 1.1 rounded (imm8 = 0) to 1.0.
        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23521
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        // Mask bit 0 clear: the low lane is zeroed; lanes 1..7 come from `a`.
        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is b[0] = 1.1 rounded (imm8 = 0) to 1.0.
        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23533
23534    #[simd_test(enable = "avx512fp16,avx512vl")]
23535    fn test_mm_roundscale_round_sh() {
23536        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23537        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23538        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
23539        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23540        assert_eq_m128h(r, e);
23541    }
23542
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane comes from `src`; lanes 1..7 come from `a`.
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is b[0] = 1.1 rounded (imm8 = 0) to 1.0.
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23555
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        // Mask bit 0 clear: the low lane is zeroed; lanes 1..7 come from `a`.
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is b[0] = 1.1 rounded (imm8 = 0) to 1.0.
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23567
23568    #[simd_test(enable = "avx512fp16,avx512vl")]
23569    fn test_mm_scalef_ph() {
23570        let a = _mm_set1_ph(1.);
23571        let b = _mm_set1_ph(3.);
23572        let r = _mm_scalef_ph(a, b);
23573        let e = _mm_set1_ph(8.0);
23574        assert_eq_m128h(r, e);
23575    }
23576
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let src = _mm_set1_ph(2.);
        // Merge-masking: set bits get 1 * 2^3 = 8; clear bits keep `src`.
        // Lane 0 is the last `_mm_set_ph` argument.
        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
        assert_eq_m128h(r, e);
    }
23586
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        // Zero-masking: set bits get 1 * 2^3 = 8; clear bits become 0.0.
        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }
23595
23596    #[simd_test(enable = "avx512fp16,avx512vl")]
23597    fn test_mm256_scalef_ph() {
23598        let a = _mm256_set1_ph(1.);
23599        let b = _mm256_set1_ph(3.);
23600        let r = _mm256_scalef_ph(a, b);
23601        let e = _mm256_set1_ph(8.0);
23602        assert_eq_m256h(r, e);
23603    }
23604
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let src = _mm256_set1_ph(2.);
        // Merge-masking: set bits get 1 * 2^3 = 8; clear bits keep `src`.
        // Lane 0 is the last `_mm256_set_ph` argument.
        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }
23616
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        // Zero-masking: set bits get 1 * 2^3 = 8; clear bits become 0.0.
        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }
23627
23628    #[simd_test(enable = "avx512fp16")]
23629    fn test_mm512_scalef_ph() {
23630        let a = _mm512_set1_ph(1.);
23631        let b = _mm512_set1_ph(3.);
23632        let r = _mm512_scalef_ph(a, b);
23633        let e = _mm512_set1_ph(8.0);
23634        assert_eq_m512h(r, e);
23635    }
23636
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let src = _mm512_set1_ph(2.);
        // Merge-masking: set bits get 1 * 2^3 = 8; clear bits keep `src`.
        // Lane 0 is the last `_mm512_set_ph` argument.
        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }
23649
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        // Zero-masking: set bits get 1 * 2^3 = 8; clear bits become 0.0.
        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }
23661
23662    #[simd_test(enable = "avx512fp16")]
23663    fn test_mm512_scalef_round_ph() {
23664        let a = _mm512_set1_ph(1.);
23665        let b = _mm512_set1_ph(3.);
23666        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23667        let e = _mm512_set1_ph(8.0);
23668        assert_eq_m512h(r, e);
23669    }
23670
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let src = _mm512_set1_ph(2.);
        // Merge-masking with explicit rounding: set bits get 1 * 2^3 = 8;
        // clear bits keep `src`.
        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }
23688
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        // Zero-masking with explicit rounding: set bits get 1 * 2^3 = 8;
        // clear bits become 0.0.
        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }
23704
23705    #[simd_test(enable = "avx512fp16,avx512vl")]
23706    fn test_mm_scalef_sh() {
23707        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23708        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23709        let r = _mm_scalef_sh(a, b);
23710        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23711        assert_eq_m128h(r, e);
23712    }
23713
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane comes from `src`; lanes 1..7 come from `a`.
        let r = _mm_mask_scalef_sh(src, 0, a, b);
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is a[0] * 2^b[0] = 1.0 * 2^3 = 8.0.
        let r = _mm_mask_scalef_sh(src, 1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23726
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        // Mask bit 0 clear: the low lane is zeroed; lanes 1..7 come from `a`.
        let r = _mm_maskz_scalef_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is a[0] * 2^b[0] = 1.0 * 2^3 = 8.0.
        let r = _mm_maskz_scalef_sh(1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23738
23739    #[simd_test(enable = "avx512fp16,avx512vl")]
23740    fn test_mm_scalef_round_sh() {
23741        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23742        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23743        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23744        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23745        assert_eq_m128h(r, e);
23746    }
23747
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: low lane comes from `src`; lanes 1..7 come from `a`.
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: low lane is a[0] * 2^b[0] = 1.0 * 2^3 = 8.0.
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23764
23765    #[simd_test(enable = "avx512fp16,avx512vl")]
23766    fn test_mm_maskz_scalef_round_sh() {
23767        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23768        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23769        let r =
23770            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
23771        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23772        assert_eq_m128h(r, e);
23773        let r =
23774            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
23775        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23776        assert_eq_m128h(r, e);
23777    }
23778
    // vreduceph with imm8 = 16 | _MM_FROUND_TO_ZERO: bits [7:4] = 1 select a
    // granularity of 2^-1, so each element becomes x minus x rounded (toward
    // zero) to a multiple of 0.5, i.e. 1.25 -> 1.25 - 1.0 = 0.25.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm_set1_ph(0.25);
        assert_eq_m128h(r, e);
    }

    // Merge-masked reduce: lanes with a set mask bit (the even lanes of
    // 0b01010101) get the reduced 0.25, the rest keep `src` (2.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let src = _mm_set1_ph(2.0);
        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
        assert_eq_m128h(r, e);
    }

    // Zero-masked reduce: unselected lanes are 0.0 instead of `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
        assert_eq_m128h(r, e);
    }

    // 256-bit reduce: same 1.25 -> 0.25 computation across 16 lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm256_set1_ph(0.25);
        assert_eq_m256h(r, e);
    }

    // 256-bit merge-masked reduce with an alternating 16-bit mask.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let src = _mm256_set1_ph(2.0);
        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit zero-masked reduce with an alternating 16-bit mask.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit reduce: same 1.25 -> 0.25 computation across all 32 lanes.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm512_set1_ph(0.25);
        assert_eq_m512h(r, e);
    }

    // 512-bit merge-masked reduce with an alternating 32-bit mask.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit zero-masked reduce with an alternating 32-bit mask.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }

    // Rounding variant: SAE (_MM_FROUND_NO_EXC) suppresses exceptions; the
    // reduce result itself is unchanged since the computation is exact.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
        let e = _mm512_set1_ph(0.25);
        assert_eq_m512h(r, e);
    }

    // Merge-masked rounding variant — same lane pattern as the non-round test.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }

    // Zero-masked rounding variant — same lane pattern as the non-round test.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }
23908
    // Scalar reduce: lane 0 is b[0] reduced at 2^-1 granularity toward zero
    // (1.25 -> 0.25); lanes 1..7 are copied from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // Merge-masked scalar reduce: a clear mask bit keeps lane 0 from `src`
    // (2.0); a set bit produces the reduced 0.25.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // Zero-masked scalar reduce: lane 0 is 0.0 for a clear mask bit and the
    // reduced 0.25 for a set one.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // SAE variant of the scalar reduce — same 1.25 -> 0.25 result.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // SAE + merge-mask variant of the scalar reduce.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // SAE + zero-mask variant of the scalar reduce.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r =
            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23982
    // Horizontal sum of 8 lanes of 2.0 = 16.0. Declared `const fn` so the
    // reduction is also exercised in const evaluation.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_reduce_add_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_reduce_add_ph(a);
        assert_eq!(r, 16.0);
    }

    // Horizontal sum of 16 lanes of 2.0 = 32.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_reduce_add_ph() {
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_reduce_add_ph(a);
        assert_eq!(r, 32.0);
    }

    // Horizontal sum of 32 lanes of 2.0 = 64.0.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_reduce_add_ph() {
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_reduce_add_ph(a);
        assert_eq!(r, 64.0);
    }

    // Horizontal product of 8 lanes of 2.0 = 2^8 = 256.0 (exact in f16).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_reduce_mul_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_reduce_mul_ph(a);
        assert_eq!(r, 256.0);
    }

    // 1.2 is not exactly representable in f16; with f16 rounding applied at
    // each step of the reduction the 16-lane product lands on exactly 18.5
    // (the mathematically exact 1.2^16 would be ~18.49).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_reduce_mul_ph() {
        let a = _mm256_set1_ph(1.2);
        let r = _mm256_reduce_mul_ph(a);
        assert_eq!(r, 18.5);
    }

    // Same per-step f16 rounding effect over 32 lanes: the product of 32
    // copies of (f16)1.2 comes out as exactly 342.3 in f16.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_reduce_mul_ph() {
        let a = _mm512_set1_ph(1.2);
        let r = _mm512_reduce_mul_ph(a);
        assert_eq!(r, 342.3);
    }
24024
24025    #[simd_test(enable = "avx512fp16,avx512vl")]
24026    fn test_mm_reduce_max_ph() {
24027        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24028        let r = _mm_reduce_max_ph(a);
24029        assert_eq!(r, 8.0);
24030    }
24031
    // Horizontal max over lanes 1..=16 is 16.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_reduce_max_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_reduce_max_ph(a);
        assert_eq!(r, 16.0);
    }

    // Horizontal max over lanes 1..=32 is 32.0.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_reduce_max_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_reduce_max_ph(a);
        assert_eq!(r, 32.0);
    }
24051
24052    #[simd_test(enable = "avx512fp16,avx512vl")]
24053    fn test_mm_reduce_min_ph() {
24054        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24055        let r = _mm_reduce_min_ph(a);
24056        assert_eq!(r, 1.0);
24057    }
24058
    // Horizontal min over lanes 1..=16 is 1.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_reduce_min_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }

    // Horizontal min over lanes 1..=32 is 1.0.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_reduce_min_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }
24078
    // vfpclassph with imm8 = 0x18 tests for +Inf (bit 3) and -Inf (bit 4).
    // `_mm_set_ph` lists the high lane first, so INFINITY / NEG_INFINITY land
    // in lanes 6 and 5, giving result mask 0b01100000.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000);
    }

    // Masked variant: the write-mask 0b01010101 keeps only even lanes, so of
    // the two infinity lanes (5 and 6) only lane 6 survives -> 0b01000000.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
        assert_eq!(r, 0b01000000);
    }

    // 256-bit fpclass: the same 8-element pattern repeated twice, so the
    // 8-bit infinity mask repeats in both halves.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b0110000001100000);
    }

    // 256-bit masked fpclass: the alternating write-mask leaves only the
    // even-lane infinity (lane 6) in each half.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
        assert_eq!(r, 0b0100000001000000);
    }

    // 512-bit fpclass: the 8-element pattern repeated four times.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000011000000110000001100000);
    }

    // 512-bit masked fpclass: alternating mask keeps lane 6 in each group.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
        assert_eq!(r, 0b01000000010000000100000001000000);
    }
24238
24239    #[simd_test(enable = "avx512fp16")]
24240    fn test_mm_fpclass_sh_mask() {
24241        let a = _mm_set_sh(f16::INFINITY);
24242        let r = _mm_fpclass_sh_mask::<0x18>(a);
24243        assert_eq!(r, 1);
24244    }
24245
24246    #[simd_test(enable = "avx512fp16")]
24247    fn test_mm_mask_fpclass_sh_mask() {
24248        let a = _mm_set_sh(f16::INFINITY);
24249        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
24250        assert_eq!(r, 0);
24251        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
24252        assert_eq!(r, 1);
24253    }
24254
    // Blend: mask bit i selects b[i] when set, a[i] when clear. With the
    // alternating mask 0b01010101, even lanes come from `b` (negative) and
    // odd lanes from `a` (positive).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm_mask_blend_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
        let r = _mm_mask_blend_ph(0b01010101, a, b);
        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit blend with the same alternating even-from-b pattern.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const fn test_mm256_mask_blend_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0,
        );
        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit blend with the same alternating even-from-b pattern.
    #[simd_test(enable = "avx512fp16")]
    const fn test_mm512_mask_blend_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
            29.0, -30.0, 31.0, -32.0,
        );
        assert_eq_m512h(r, e);
    }
24301
    // permutex2var selects each output lane from the concatenation of a and
    // b: for the 128-bit form, index values 0..7 pick from `a` and 8..15 from
    // `b`. The even indices 0,2,..,14 therefore interleave both sources.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_permutex2var_ph() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
        let r = _mm_permutex2var_ph(a, idx, b);
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit permutex2var: indices 0..15 select from `a`, 16..31 from `b`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_permutex2var_ph() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_setr_ph(
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
        let r = _mm256_permutex2var_ph(a, idx, b);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit permutex2var: indices 0..31 select from `a`, 32..63 from `b`.
    // Note the index vector is built with `set` (high lane first), so the
    // descending literal list 62,60,..,0 is the ascending sequence 0,2,..,62
    // in lane order.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_permutex2var_ph() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_setr_ph(
            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
            61.0, 62.0, 63.0, 64.0,
        );
        let idx = _mm512_set_epi16(
            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        let r = _mm512_permutex2var_ph(a, idx, b);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
            59.0, 61.0, 63.0,
        );
        assert_eq_m512h(r, e);
    }

    // permutexvar shuffles `a` by the per-lane indices in `idx`. Both `a` and
    // `idx` are built with `set` (high lane first), while the expected value
    // uses `setr` (low lane first).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_permutexvar_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
        let r = _mm_permutexvar_ph(idx, a);
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit permutexvar: odd source lanes first, then even source lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_permutexvar_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        let r = _mm256_permutexvar_ph(idx, a);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit permutexvar: odd source lanes first, then even source lanes.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_permutexvar_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm512_set_epi16(
            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        let r = _mm512_permutexvar_ph(idx, a);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
            30.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
24396
    // Converts packed signed 16-bit integers to f16; small integers are
    // exactly representable, so the conversion is lossless here.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtepi16_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked convert: lanes with a clear mask bit keep `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_mask_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    // Zero-masked convert: lanes with a clear mask bit are 0.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm_maskz_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m128h(r, e);
    }

    // 256-bit int16 -> f16 convert.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_cvtepi16_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit merge-masked convert.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_mask_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit zero-masked convert.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    fn test_mm256_maskz_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit int16 -> f16 convert.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvtepi16_ph(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit merge-masked convert.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit zero-masked convert.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }

    // Rounding-mode variant of the 512-bit convert; the values are exactly
    // representable, so round-to-nearest with suppressed exceptions gives the
    // same results as the default conversion.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    // Merge-masked rounding variant.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_mask_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    // Zero-masked rounding variant.
    #[simd_test(enable = "avx512fp16")]
    fn test_mm512_maskz_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
24555
24556    #[simd_test(enable = "avx512fp16,avx512vl")]
24557    fn test_mm_cvtepu16_ph() {
24558        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24559        let r = _mm_cvtepu16_ph(a);
24560        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24561        assert_eq_m128h(r, e);
24562    }
24563
24564    #[simd_test(enable = "avx512fp16,avx512vl")]
24565    fn test_mm_mask_cvtepu16_ph() {
24566        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24567        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24568        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
24569        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24570        assert_eq_m128h(r, e);
24571    }
24572
24573    #[simd_test(enable = "avx512fp16,avx512vl")]
24574    fn test_mm_maskz_cvtepu16_ph() {
24575        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24576        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
24577        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24578        assert_eq_m128h(r, e);
24579    }
24580
24581    #[simd_test(enable = "avx512fp16,avx512vl")]
24582    fn test_mm256_cvtepu16_ph() {
24583        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24584        let r = _mm256_cvtepu16_ph(a);
24585        let e = _mm256_set_ph(
24586            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24587        );
24588        assert_eq_m256h(r, e);
24589    }
24590
24591    #[simd_test(enable = "avx512fp16,avx512vl")]
24592    fn test_mm256_mask_cvtepu16_ph() {
24593        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24594        let src = _mm256_set_ph(
24595            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24596        );
24597        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
24598        let e = _mm256_set_ph(
24599            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24600        );
24601        assert_eq_m256h(r, e);
24602    }
24603
24604    #[simd_test(enable = "avx512fp16,avx512vl")]
24605    fn test_mm256_maskz_cvtepu16_ph() {
24606        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24607        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
24608        let e = _mm256_set_ph(
24609            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24610        );
24611        assert_eq_m256h(r, e);
24612    }
24613
24614    #[simd_test(enable = "avx512fp16")]
24615    fn test_mm512_cvtepu16_ph() {
24616        let a = _mm512_set_epi16(
24617            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24618            25, 26, 27, 28, 29, 30, 31, 32,
24619        );
24620        let r = _mm512_cvtepu16_ph(a);
24621        let e = _mm512_set_ph(
24622            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24623            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24624            31.0, 32.0,
24625        );
24626        assert_eq_m512h(r, e);
24627    }
24628
24629    #[simd_test(enable = "avx512fp16")]
24630    fn test_mm512_mask_cvtepu16_ph() {
24631        let a = _mm512_set_epi16(
24632            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24633            25, 26, 27, 28, 29, 30, 31, 32,
24634        );
24635        let src = _mm512_set_ph(
24636            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24637            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24638        );
24639        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
24640        let e = _mm512_set_ph(
24641            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24642            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24643        );
24644        assert_eq_m512h(r, e);
24645    }
24646
24647    #[simd_test(enable = "avx512fp16")]
24648    fn test_mm512_maskz_cvtepu16_ph() {
24649        let a = _mm512_set_epi16(
24650            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24651            25, 26, 27, 28, 29, 30, 31, 32,
24652        );
24653        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
24654        let e = _mm512_set_ph(
24655            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24656            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24657        );
24658        assert_eq_m512h(r, e);
24659    }
24660
24661    #[simd_test(enable = "avx512fp16")]
24662    fn test_mm512_cvt_roundepu16_ph() {
24663        let a = _mm512_set_epi16(
24664            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24665            25, 26, 27, 28, 29, 30, 31, 32,
24666        );
24667        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24668        let e = _mm512_set_ph(
24669            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24670            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24671            31.0, 32.0,
24672        );
24673        assert_eq_m512h(r, e);
24674    }
24675
24676    #[simd_test(enable = "avx512fp16")]
24677    fn test_mm512_mask_cvt_roundepu16_ph() {
24678        let a = _mm512_set_epi16(
24679            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24680            25, 26, 27, 28, 29, 30, 31, 32,
24681        );
24682        let src = _mm512_set_ph(
24683            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24684            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24685        );
24686        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24687            src,
24688            0b01010101010101010101010101010101,
24689            a,
24690        );
24691        let e = _mm512_set_ph(
24692            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24693            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24694        );
24695        assert_eq_m512h(r, e);
24696    }
24697
24698    #[simd_test(enable = "avx512fp16")]
24699    fn test_mm512_maskz_cvt_roundepu16_ph() {
24700        let a = _mm512_set_epi16(
24701            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24702            25, 26, 27, 28, 29, 30, 31, 32,
24703        );
24704        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24705            0b01010101010101010101010101010101,
24706            a,
24707        );
24708        let e = _mm512_set_ph(
24709            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24710            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24711        );
24712        assert_eq_m512h(r, e);
24713    }
24714
24715    #[simd_test(enable = "avx512fp16,avx512vl")]
24716    fn test_mm_cvtepi32_ph() {
24717        let a = _mm_set_epi32(1, 2, 3, 4);
24718        let r = _mm_cvtepi32_ph(a);
24719        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24720        assert_eq_m128h(r, e);
24721    }
24722
24723    #[simd_test(enable = "avx512fp16,avx512vl")]
24724    fn test_mm_mask_cvtepi32_ph() {
24725        let a = _mm_set_epi32(1, 2, 3, 4);
24726        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24727        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
24728        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24729        assert_eq_m128h(r, e);
24730    }
24731
24732    #[simd_test(enable = "avx512fp16,avx512vl")]
24733    fn test_mm_maskz_cvtepi32_ph() {
24734        let a = _mm_set_epi32(1, 2, 3, 4);
24735        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
24736        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24737        assert_eq_m128h(r, e);
24738    }
24739
24740    #[simd_test(enable = "avx512fp16,avx512vl")]
24741    fn test_mm256_cvtepi32_ph() {
24742        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24743        let r = _mm256_cvtepi32_ph(a);
24744        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24745        assert_eq_m128h(r, e);
24746    }
24747
24748    #[simd_test(enable = "avx512fp16,avx512vl")]
24749    fn test_mm256_mask_cvtepi32_ph() {
24750        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24751        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24752        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
24753        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24754        assert_eq_m128h(r, e);
24755    }
24756
24757    #[simd_test(enable = "avx512fp16,avx512vl")]
24758    fn test_mm256_maskz_cvtepi32_ph() {
24759        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24760        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
24761        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24762        assert_eq_m128h(r, e);
24763    }
24764
24765    #[simd_test(enable = "avx512fp16")]
24766    fn test_mm512_cvtepi32_ph() {
24767        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24768        let r = _mm512_cvtepi32_ph(a);
24769        let e = _mm256_set_ph(
24770            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24771        );
24772        assert_eq_m256h(r, e);
24773    }
24774
24775    #[simd_test(enable = "avx512fp16,avx512vl")]
24776    fn test_mm512_mask_cvtepi32_ph() {
24777        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24778        let src = _mm256_set_ph(
24779            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24780        );
24781        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
24782        let e = _mm256_set_ph(
24783            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24784        );
24785        assert_eq_m256h(r, e);
24786    }
24787
24788    #[simd_test(enable = "avx512fp16,avx512vl")]
24789    fn test_mm512_maskz_cvtepi32_ph() {
24790        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24791        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
24792        let e = _mm256_set_ph(
24793            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24794        );
24795        assert_eq_m256h(r, e);
24796    }
24797
24798    #[simd_test(enable = "avx512fp16,avx512vl")]
24799    fn test_mm512_cvt_roundepi32_ph() {
24800        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24801        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24802        let e = _mm256_set_ph(
24803            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24804        );
24805        assert_eq_m256h(r, e);
24806    }
24807
24808    #[simd_test(enable = "avx512fp16,avx512vl")]
24809    fn test_mm512_mask_cvt_roundepi32_ph() {
24810        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24811        let src = _mm256_set_ph(
24812            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24813        );
24814        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24815            src,
24816            0b0101010101010101,
24817            a,
24818        );
24819        let e = _mm256_set_ph(
24820            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24821        );
24822        assert_eq_m256h(r, e);
24823    }
24824
24825    #[simd_test(enable = "avx512fp16,avx512vl")]
24826    fn test_mm512_maskz_cvt_roundepi32_ph() {
24827        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24828        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24829            0b0101010101010101,
24830            a,
24831        );
24832        let e = _mm256_set_ph(
24833            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24834        );
24835        assert_eq_m256h(r, e);
24836    }
24837
24838    #[simd_test(enable = "avx512fp16,avx512vl")]
24839    fn test_mm_cvti32_sh() {
24840        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24841        let r = _mm_cvti32_sh(a, 10);
24842        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24843        assert_eq_m128h(r, e);
24844    }
24845
24846    #[simd_test(enable = "avx512fp16,avx512vl")]
24847    fn test_mm_cvt_roundi32_sh() {
24848        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24849        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24850        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24851        assert_eq_m128h(r, e);
24852    }
24853
24854    #[simd_test(enable = "avx512fp16,avx512vl")]
24855    fn test_mm_cvtepu32_ph() {
24856        let a = _mm_set_epi32(1, 2, 3, 4);
24857        let r = _mm_cvtepu32_ph(a);
24858        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24859        assert_eq_m128h(r, e);
24860    }
24861
24862    #[simd_test(enable = "avx512fp16,avx512vl")]
24863    fn test_mm_mask_cvtepu32_ph() {
24864        let a = _mm_set_epi32(1, 2, 3, 4);
24865        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24866        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
24867        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24868        assert_eq_m128h(r, e);
24869    }
24870
24871    #[simd_test(enable = "avx512fp16,avx512vl")]
24872    fn test_mm_maskz_cvtepu32_ph() {
24873        let a = _mm_set_epi32(1, 2, 3, 4);
24874        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
24875        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24876        assert_eq_m128h(r, e);
24877    }
24878
24879    #[simd_test(enable = "avx512fp16,avx512vl")]
24880    fn test_mm256_cvtepu32_ph() {
24881        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24882        let r = _mm256_cvtepu32_ph(a);
24883        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24884        assert_eq_m128h(r, e);
24885    }
24886
24887    #[simd_test(enable = "avx512fp16,avx512vl")]
24888    fn test_mm256_mask_cvtepu32_ph() {
24889        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24890        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24891        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
24892        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24893        assert_eq_m128h(r, e);
24894    }
24895
24896    #[simd_test(enable = "avx512fp16,avx512vl")]
24897    fn test_mm256_maskz_cvtepu32_ph() {
24898        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24899        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
24900        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24901        assert_eq_m128h(r, e);
24902    }
24903
24904    #[simd_test(enable = "avx512fp16,avx512vl")]
24905    fn test_mm512_cvtepu32_ph() {
24906        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24907        let r = _mm512_cvtepu32_ph(a);
24908        let e = _mm256_set_ph(
24909            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24910        );
24911        assert_eq_m256h(r, e);
24912    }
24913
24914    #[simd_test(enable = "avx512fp16,avx512vl")]
24915    fn test_mm512_mask_cvtepu32_ph() {
24916        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24917        let src = _mm256_set_ph(
24918            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24919        );
24920        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
24921        let e = _mm256_set_ph(
24922            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24923        );
24924        assert_eq_m256h(r, e);
24925    }
24926
24927    #[simd_test(enable = "avx512fp16,avx512vl")]
24928    fn test_mm512_maskz_cvtepu32_ph() {
24929        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24930        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
24931        let e = _mm256_set_ph(
24932            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24933        );
24934        assert_eq_m256h(r, e);
24935    }
24936
24937    #[simd_test(enable = "avx512fp16,avx512vl")]
24938    fn test_mm512_cvt_roundepu32_ph() {
24939        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24940        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24941        let e = _mm256_set_ph(
24942            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24943        );
24944        assert_eq_m256h(r, e);
24945    }
24946
24947    #[simd_test(enable = "avx512fp16,avx512vl")]
24948    fn test_mm512_mask_cvt_roundepu32_ph() {
24949        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24950        let src = _mm256_set_ph(
24951            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24952        );
24953        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24954            src,
24955            0b0101010101010101,
24956            a,
24957        );
24958        let e = _mm256_set_ph(
24959            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24960            16.0,
24961        );
24962        assert_eq_m256h(r, e);
24963    }
24964
24965    #[simd_test(enable = "avx512fp16,avx512vl")]
24966    fn test_mm512_maskz_cvt_roundepu32_ph() {
24967        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24968        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24969            0b0101010101010101,
24970            a,
24971        );
24972        let e = _mm256_set_ph(
24973            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24974        );
24975        assert_eq_m256h(r, e);
24976    }
24977
24978    #[simd_test(enable = "avx512fp16,avx512vl")]
24979    fn test_mm_cvtu32_sh() {
24980        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24981        let r = _mm_cvtu32_sh(a, 10);
24982        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24983        assert_eq_m128h(r, e);
24984    }
24985
24986    #[simd_test(enable = "avx512fp16,avx512vl")]
24987    fn test_mm_cvt_roundu32_sh() {
24988        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24989        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24990        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24991        assert_eq_m128h(r, e);
24992    }
24993
24994    #[simd_test(enable = "avx512fp16,avx512vl")]
24995    fn test_mm_cvtepi64_ph() {
24996        let a = _mm_set_epi64x(1, 2);
24997        let r = _mm_cvtepi64_ph(a);
24998        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24999        assert_eq_m128h(r, e);
25000    }
25001
25002    #[simd_test(enable = "avx512fp16,avx512vl")]
25003    fn test_mm_mask_cvtepi64_ph() {
25004        let a = _mm_set_epi64x(1, 2);
25005        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25006        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
25007        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25008        assert_eq_m128h(r, e);
25009    }
25010
25011    #[simd_test(enable = "avx512fp16,avx512vl")]
25012    fn test_mm_maskz_cvtepi64_ph() {
25013        let a = _mm_set_epi64x(1, 2);
25014        let r = _mm_maskz_cvtepi64_ph(0b01, a);
25015        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
25016        assert_eq_m128h(r, e);
25017    }
25018
25019    #[simd_test(enable = "avx512fp16,avx512vl")]
25020    fn test_mm256_cvtepi64_ph() {
25021        let a = _mm256_set_epi64x(1, 2, 3, 4);
25022        let r = _mm256_cvtepi64_ph(a);
25023        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25024        assert_eq_m128h(r, e);
25025    }
25026
25027    #[simd_test(enable = "avx512fp16,avx512vl")]
25028    fn test_mm256_mask_cvtepi64_ph() {
25029        let a = _mm256_set_epi64x(1, 2, 3, 4);
25030        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25031        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
25032        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25033        assert_eq_m128h(r, e);
25034    }
25035
25036    #[simd_test(enable = "avx512fp16,avx512vl")]
25037    fn test_mm256_maskz_cvtepi64_ph() {
25038        let a = _mm256_set_epi64x(1, 2, 3, 4);
25039        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
25040        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25041        assert_eq_m128h(r, e);
25042    }
25043
25044    #[simd_test(enable = "avx512fp16,avx512vl")]
25045    fn test_mm512_cvtepi64_ph() {
25046        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25047        let r = _mm512_cvtepi64_ph(a);
25048        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25049        assert_eq_m128h(r, e);
25050    }
25051
25052    #[simd_test(enable = "avx512fp16,avx512vl")]
25053    fn test_mm512_mask_cvtepi64_ph() {
25054        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25055        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25056        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
25057        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25058        assert_eq_m128h(r, e);
25059    }
25060
25061    #[simd_test(enable = "avx512fp16,avx512vl")]
25062    fn test_mm512_maskz_cvtepi64_ph() {
25063        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25064        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
25065        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25066        assert_eq_m128h(r, e);
25067    }
25068
25069    #[simd_test(enable = "avx512fp16,avx512vl")]
25070    fn test_mm512_cvt_roundepi64_ph() {
25071        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25072        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25073        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25074        assert_eq_m128h(r, e);
25075    }
25076
25077    #[simd_test(enable = "avx512fp16")]
25078    fn test_mm512_mask_cvt_roundepi64_ph() {
25079        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25080        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25081        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25082            src, 0b01010101, a,
25083        );
25084        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25085        assert_eq_m128h(r, e);
25086    }
25087
25088    #[simd_test(enable = "avx512fp16,avx512vl")]
25089    fn test_mm512_maskz_cvt_roundepi64_ph() {
25090        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25091        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25092            0b01010101, a,
25093        );
25094        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25095        assert_eq_m128h(r, e);
25096    }
25097
25098    #[simd_test(enable = "avx512fp16,avx512vl")]
25099    fn test_mm_cvtepu64_ph() {
25100        let a = _mm_set_epi64x(1, 2);
25101        let r = _mm_cvtepu64_ph(a);
25102        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25103        assert_eq_m128h(r, e);
25104    }
25105
25106    #[simd_test(enable = "avx512fp16,avx512vl")]
25107    fn test_mm_mask_cvtepu64_ph() {
25108        let a = _mm_set_epi64x(1, 2);
25109        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25110        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
25111        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25112        assert_eq_m128h(r, e);
25113    }
25114
25115    #[simd_test(enable = "avx512fp16,avx512vl")]
25116    fn test_mm_maskz_cvtepu64_ph() {
25117        let a = _mm_set_epi64x(1, 2);
25118        let r = _mm_maskz_cvtepu64_ph(0b01, a);
25119        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25120        assert_eq_m128h(r, e);
25121    }
25122
25123    #[simd_test(enable = "avx512fp16,avx512vl")]
25124    fn test_mm256_cvtepu64_ph() {
25125        let a = _mm256_set_epi64x(1, 2, 3, 4);
25126        let r = _mm256_cvtepu64_ph(a);
25127        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25128        assert_eq_m128h(r, e);
25129    }
25130
25131    #[simd_test(enable = "avx512fp16,avx512vl")]
25132    fn test_mm256_mask_cvtepu64_ph() {
25133        let a = _mm256_set_epi64x(1, 2, 3, 4);
25134        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25135        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
25136        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25137        assert_eq_m128h(r, e);
25138    }
25139
25140    #[simd_test(enable = "avx512fp16,avx512vl")]
25141    fn test_mm256_maskz_cvtepu64_ph() {
25142        let a = _mm256_set_epi64x(1, 2, 3, 4);
25143        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
25144        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25145        assert_eq_m128h(r, e);
25146    }
25147
25148    #[simd_test(enable = "avx512fp16,avx512vl")]
25149    fn test_mm512_cvtepu64_ph() {
25150        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25151        let r = _mm512_cvtepu64_ph(a);
25152        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25153        assert_eq_m128h(r, e);
25154    }
25155
25156    #[simd_test(enable = "avx512fp16,avx512vl")]
25157    fn test_mm512_mask_cvtepu64_ph() {
25158        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25159        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25160        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
25161        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25162        assert_eq_m128h(r, e);
25163    }
25164
25165    #[simd_test(enable = "avx512fp16,avx512vl")]
25166    fn test_mm512_maskz_cvtepu64_ph() {
25167        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25168        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
25169        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25170        assert_eq_m128h(r, e);
25171    }
25172
25173    #[simd_test(enable = "avx512fp16,avx512vl")]
25174    fn test_mm512_cvt_roundepu64_ph() {
25175        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25176        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25177        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25178        assert_eq_m128h(r, e);
25179    }
25180
25181    #[simd_test(enable = "avx512fp16,avx512vl")]
25182    fn test_mm512_mask_cvt_roundepu64_ph() {
25183        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25184        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25185        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25186            src, 0b01010101, a,
25187        );
25188        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25189        assert_eq_m128h(r, e);
25190    }
25191
25192    #[simd_test(enable = "avx512fp16,avx512vl")]
25193    fn test_mm512_maskz_cvt_roundepu64_ph() {
25194        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25195        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25196            0b01010101, a,
25197        );
25198        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25199        assert_eq_m128h(r, e);
25200    }
25201
25202    #[simd_test(enable = "avx512fp16,avx512vl")]
25203    fn test_mm_cvtxps_ph() {
25204        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25205        let r = _mm_cvtxps_ph(a);
25206        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25207        assert_eq_m128h(r, e);
25208    }
25209
25210    #[simd_test(enable = "avx512fp16,avx512vl")]
25211    fn test_mm_mask_cvtxps_ph() {
25212        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25213        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25214        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
25215        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
25216        assert_eq_m128h(r, e);
25217    }
25218
25219    #[simd_test(enable = "avx512fp16,avx512vl")]
25220    fn test_mm_maskz_cvtxps_ph() {
25221        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25222        let r = _mm_maskz_cvtxps_ph(0b0101, a);
25223        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25224        assert_eq_m128h(r, e);
25225    }
25226
25227    #[simd_test(enable = "avx512fp16,avx512vl")]
25228    fn test_mm256_cvtxps_ph() {
25229        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25230        let r = _mm256_cvtxps_ph(a);
25231        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25232        assert_eq_m128h(r, e);
25233    }
25234
25235    #[simd_test(enable = "avx512fp16,avx512vl")]
25236    fn test_mm256_mask_cvtxps_ph() {
25237        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25238        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25239        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
25240        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25241        assert_eq_m128h(r, e);
25242    }
25243
25244    #[simd_test(enable = "avx512fp16,avx512vl")]
25245    fn test_mm256_maskz_cvtxps_ph() {
25246        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25247        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
25248        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
25249        assert_eq_m128h(r, e);
25250    }
25251
25252    #[simd_test(enable = "avx512fp16,avx512vl")]
25253    fn test_mm512_cvtxps_ph() {
25254        let a = _mm512_set_ps(
25255            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25256        );
25257        let r = _mm512_cvtxps_ph(a);
25258        let e = _mm256_set_ph(
25259            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25260        );
25261        assert_eq_m256h(r, e);
25262    }
25263
25264    #[simd_test(enable = "avx512fp16,avx512vl")]
25265    fn test_mm512_mask_cvtxps_ph() {
25266        let a = _mm512_set_ps(
25267            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25268        );
25269        let src = _mm256_set_ph(
25270            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
25271        );
25272        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
25273        let e = _mm256_set_ph(
25274            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
25275        );
25276        assert_eq_m256h(r, e);
25277    }
25278
25279    #[simd_test(enable = "avx512fp16,avx512vl")]
25280    fn test_mm512_maskz_cvtxps_ph() {
25281        let a = _mm512_set_ps(
25282            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25283        );
25284        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
25285        let e = _mm256_set_ph(
25286            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25287        );
25288        assert_eq_m256h(r, e);
25289    }
25290
25291    #[simd_test(enable = "avx512fp16,avx512vl")]
25292    fn test_mm512_cvtx_roundps_ph() {
25293        let a = _mm512_set_ps(
25294            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25295        );
25296        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25297        let e = _mm256_set_ph(
25298            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25299        );
25300        assert_eq_m256h(r, e);
25301    }
25302
25303    #[simd_test(enable = "avx512fp16,avx512vl")]
25304    fn test_mm512_mask_cvtx_roundps_ph() {
25305        let a = _mm512_set_ps(
25306            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25307        );
25308        let src = _mm256_set_ph(
25309            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
25310        );
25311        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25312            src,
25313            0b0101010101010101,
25314            a,
25315        );
25316        let e = _mm256_set_ph(
25317            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
25318            16.0,
25319        );
25320        assert_eq_m256h(r, e);
25321    }
25322
25323    #[simd_test(enable = "avx512fp16,avx512vl")]
25324    fn test_mm512_maskz_cvtx_roundps_ph() {
25325        let a = _mm512_set_ps(
25326            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25327        );
25328        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25329            0b0101010101010101,
25330            a,
25331        );
25332        let e = _mm256_set_ph(
25333            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25334        );
25335        assert_eq_m256h(r, e);
25336    }
25337
25338    #[simd_test(enable = "avx512fp16,avx512vl")]
25339    fn test_mm_cvtss_sh() {
25340        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25341        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25342        let r = _mm_cvtss_sh(a, b);
25343        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25344        assert_eq_m128h(r, e);
25345    }
25346
25347    #[simd_test(enable = "avx512fp16,avx512vl")]
25348    fn test_mm_mask_cvtss_sh() {
25349        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25350        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25351        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25352        let r = _mm_mask_cvtss_sh(src, 0, a, b);
25353        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25354        assert_eq_m128h(r, e);
25355        let r = _mm_mask_cvtss_sh(src, 1, a, b);
25356        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25357        assert_eq_m128h(r, e);
25358    }
25359
25360    #[simd_test(enable = "avx512fp16,avx512vl")]
25361    fn test_mm_maskz_cvtss_sh() {
25362        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25363        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25364        let r = _mm_maskz_cvtss_sh(0, a, b);
25365        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25366        assert_eq_m128h(r, e);
25367        let r = _mm_maskz_cvtss_sh(1, a, b);
25368        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25369        assert_eq_m128h(r, e);
25370    }
25371
25372    #[simd_test(enable = "avx512fp16,avx512vl")]
25373    fn test_mm_cvt_roundss_sh() {
25374        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25375        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25376        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25377        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25378        assert_eq_m128h(r, e);
25379    }
25380
25381    #[simd_test(enable = "avx512fp16,avx512vl")]
25382    fn test_mm_mask_cvt_roundss_sh() {
25383        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25384        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25385        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25386        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25387            src, 0, a, b,
25388        );
25389        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25390        assert_eq_m128h(r, e);
25391        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25392            src, 1, a, b,
25393        );
25394        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25395        assert_eq_m128h(r, e);
25396    }
25397
25398    #[simd_test(enable = "avx512fp16,avx512vl")]
25399    fn test_mm_maskz_cvt_roundss_sh() {
25400        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25401        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25402        let r =
25403            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25404        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25405        assert_eq_m128h(r, e);
25406        let r =
25407            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25408        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25409        assert_eq_m128h(r, e);
25410    }
25411
25412    #[simd_test(enable = "avx512fp16,avx512vl")]
25413    fn test_mm_cvtpd_ph() {
25414        let a = _mm_set_pd(1.0, 2.0);
25415        let r = _mm_cvtpd_ph(a);
25416        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25417        assert_eq_m128h(r, e);
25418    }
25419
25420    #[simd_test(enable = "avx512fp16,avx512vl")]
25421    fn test_mm_mask_cvtpd_ph() {
25422        let a = _mm_set_pd(1.0, 2.0);
25423        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25424        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
25425        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25426        assert_eq_m128h(r, e);
25427    }
25428
25429    #[simd_test(enable = "avx512fp16,avx512vl")]
25430    fn test_mm_maskz_cvtpd_ph() {
25431        let a = _mm_set_pd(1.0, 2.0);
25432        let r = _mm_maskz_cvtpd_ph(0b01, a);
25433        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25434        assert_eq_m128h(r, e);
25435    }
25436
25437    #[simd_test(enable = "avx512fp16,avx512vl")]
25438    fn test_mm256_cvtpd_ph() {
25439        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25440        let r = _mm256_cvtpd_ph(a);
25441        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25442        assert_eq_m128h(r, e);
25443    }
25444
25445    #[simd_test(enable = "avx512fp16,avx512vl")]
25446    fn test_mm256_mask_cvtpd_ph() {
25447        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25448        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25449        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
25450        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25451        assert_eq_m128h(r, e);
25452    }
25453
25454    #[simd_test(enable = "avx512fp16,avx512vl")]
25455    fn test_mm256_maskz_cvtpd_ph() {
25456        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25457        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
25458        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25459        assert_eq_m128h(r, e);
25460    }
25461
25462    #[simd_test(enable = "avx512fp16,avx512vl")]
25463    fn test_mm512_cvtpd_ph() {
25464        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25465        let r = _mm512_cvtpd_ph(a);
25466        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25467        assert_eq_m128h(r, e);
25468    }
25469
25470    #[simd_test(enable = "avx512fp16,avx512vl")]
25471    fn test_mm512_mask_cvtpd_ph() {
25472        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25473        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25474        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
25475        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25476        assert_eq_m128h(r, e);
25477    }
25478
25479    #[simd_test(enable = "avx512fp16,avx512vl")]
25480    fn test_mm512_maskz_cvtpd_ph() {
25481        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25482        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
25483        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25484        assert_eq_m128h(r, e);
25485    }
25486
25487    #[simd_test(enable = "avx512fp16,avx512vl")]
25488    fn test_mm512_cvt_roundpd_ph() {
25489        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25490        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25491        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25492        assert_eq_m128h(r, e);
25493    }
25494
25495    #[simd_test(enable = "avx512fp16,avx512vl")]
25496    fn test_mm512_mask_cvt_roundpd_ph() {
25497        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25498        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25499        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25500            src, 0b01010101, a,
25501        );
25502        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25503        assert_eq_m128h(r, e);
25504    }
25505
25506    #[simd_test(enable = "avx512fp16,avx512vl")]
25507    fn test_mm512_maskz_cvt_roundpd_ph() {
25508        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25509        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25510            0b01010101, a,
25511        );
25512        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25513        assert_eq_m128h(r, e);
25514    }
25515
25516    #[simd_test(enable = "avx512fp16,avx512vl")]
25517    fn test_mm_cvtsd_sh() {
25518        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25519        let b = _mm_setr_pd(1.0, 2.0);
25520        let r = _mm_cvtsd_sh(a, b);
25521        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25522        assert_eq_m128h(r, e);
25523    }
25524
25525    #[simd_test(enable = "avx512fp16,avx512vl")]
25526    fn test_mm_mask_cvtsd_sh() {
25527        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25528        let b = _mm_setr_pd(1.0, 2.0);
25529        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25530        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
25531        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25532        assert_eq_m128h(r, e);
25533        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
25534        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25535        assert_eq_m128h(r, e);
25536    }
25537
25538    #[simd_test(enable = "avx512fp16,avx512vl")]
25539    fn test_mm_maskz_cvtsd_sh() {
25540        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25541        let b = _mm_setr_pd(1.0, 2.0);
25542        let r = _mm_maskz_cvtsd_sh(0, a, b);
25543        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25544        assert_eq_m128h(r, e);
25545        let r = _mm_maskz_cvtsd_sh(1, a, b);
25546        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25547        assert_eq_m128h(r, e);
25548    }
25549
25550    #[simd_test(enable = "avx512fp16,avx512vl")]
25551    fn test_mm_cvt_roundsd_sh() {
25552        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25553        let b = _mm_setr_pd(1.0, 2.0);
25554        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25555        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25556        assert_eq_m128h(r, e);
25557    }
25558
25559    #[simd_test(enable = "avx512fp16,avx512vl")]
25560    fn test_mm_mask_cvt_roundsd_sh() {
25561        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25562        let b = _mm_setr_pd(1.0, 2.0);
25563        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25564        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25565            src, 0, a, b,
25566        );
25567        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25568        assert_eq_m128h(r, e);
25569        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25570            src, 1, a, b,
25571        );
25572        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25573        assert_eq_m128h(r, e);
25574    }
25575
25576    #[simd_test(enable = "avx512fp16,avx512vl")]
25577    fn test_mm_maskz_cvt_roundsd_sh() {
25578        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25579        let b = _mm_setr_pd(1.0, 2.0);
25580        let r =
25581            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25582        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25583        assert_eq_m128h(r, e);
25584        let r =
25585            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25586        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25587        assert_eq_m128h(r, e);
25588    }
25589
25590    #[simd_test(enable = "avx512fp16,avx512vl")]
25591    fn test_mm_cvtph_epi16() {
25592        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25593        let r = _mm_cvttph_epi16(a);
25594        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25595        assert_eq_m128i(r, e);
25596    }
25597
25598    #[simd_test(enable = "avx512fp16,avx512vl")]
25599    fn test_mm_mask_cvtph_epi16() {
25600        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25601        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25602        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25603        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25604        assert_eq_m128i(r, e);
25605    }
25606
25607    #[simd_test(enable = "avx512fp16,avx512vl")]
25608    fn test_mm_maskz_cvtph_epi16() {
25609        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25610        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25611        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25612        assert_eq_m128i(r, e);
25613    }
25614
25615    #[simd_test(enable = "avx512fp16,avx512vl")]
25616    fn test_mm256_cvtph_epi16() {
25617        let a = _mm256_set_ph(
25618            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25619        );
25620        let r = _mm256_cvttph_epi16(a);
25621        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25622        assert_eq_m256i(r, e);
25623    }
25624
25625    #[simd_test(enable = "avx512fp16,avx512vl")]
25626    fn test_mm256_mask_cvtph_epi16() {
25627        let a = _mm256_set_ph(
25628            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25629        );
25630        let src = _mm256_set_epi16(
25631            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25632        );
25633        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25634        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25635        assert_eq_m256i(r, e);
25636    }
25637
25638    #[simd_test(enable = "avx512fp16,avx512vl")]
25639    fn test_mm256_maskz_cvtph_epi16() {
25640        let a = _mm256_set_ph(
25641            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25642        );
25643        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25644        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25645        assert_eq_m256i(r, e);
25646    }
25647
25648    #[simd_test(enable = "avx512fp16")]
25649    fn test_mm512_cvtph_epi16() {
25650        let a = _mm512_set_ph(
25651            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25652            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25653            31.0, 32.0,
25654        );
25655        let r = _mm512_cvttph_epi16(a);
25656        let e = _mm512_set_epi16(
25657            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25658            25, 26, 27, 28, 29, 30, 31, 32,
25659        );
25660        assert_eq_m512i(r, e);
25661    }
25662
25663    #[simd_test(enable = "avx512fp16")]
25664    fn test_mm512_mask_cvtph_epi16() {
25665        let a = _mm512_set_ph(
25666            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25667            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25668            31.0, 32.0,
25669        );
25670        let src = _mm512_set_epi16(
25671            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25672            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25673        );
25674        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25675        let e = _mm512_set_epi16(
25676            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25677            24, 34, 26, 36, 28, 38, 30, 40, 32,
25678        );
25679        assert_eq_m512i(r, e);
25680    }
25681
25682    #[simd_test(enable = "avx512fp16")]
25683    fn test_mm512_maskz_cvtph_epi16() {
25684        let a = _mm512_set_ph(
25685            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25686            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25687            31.0, 32.0,
25688        );
25689        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25690        let e = _mm512_set_epi16(
25691            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25692            0, 28, 0, 30, 0, 32,
25693        );
25694        assert_eq_m512i(r, e);
25695    }
25696
25697    #[simd_test(enable = "avx512fp16")]
25698    fn test_mm512_cvt_roundph_epi16() {
25699        let a = _mm512_set_ph(
25700            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25701            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25702            31.0, 32.0,
25703        );
25704        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25705        let e = _mm512_set_epi16(
25706            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25707            25, 26, 27, 28, 29, 30, 31, 32,
25708        );
25709        assert_eq_m512i(r, e);
25710    }
25711
25712    #[simd_test(enable = "avx512fp16")]
25713    fn test_mm512_mask_cvt_roundph_epi16() {
25714        let a = _mm512_set_ph(
25715            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25716            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25717            31.0, 32.0,
25718        );
25719        let src = _mm512_set_epi16(
25720            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25721            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25722        );
25723        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25724            src,
25725            0b01010101010101010101010101010101,
25726            a,
25727        );
25728        let e = _mm512_set_epi16(
25729            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25730            24, 34, 26, 36, 28, 38, 30, 40, 32,
25731        );
25732        assert_eq_m512i(r, e);
25733    }
25734
25735    #[simd_test(enable = "avx512fp16")]
25736    fn test_mm512_maskz_cvt_roundph_epi16() {
25737        let a = _mm512_set_ph(
25738            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25739            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25740            31.0, 32.0,
25741        );
25742        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25743            0b01010101010101010101010101010101,
25744            a,
25745        );
25746        let e = _mm512_set_epi16(
25747            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25748            0, 28, 0, 30, 0, 32,
25749        );
25750        assert_eq_m512i(r, e);
25751    }
25752
25753    #[simd_test(enable = "avx512fp16,avx512vl")]
25754    fn test_mm_cvtph_epu16() {
25755        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25756        let r = _mm_cvttph_epu16(a);
25757        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25758        assert_eq_m128i(r, e);
25759    }
25760
25761    #[simd_test(enable = "avx512fp16,avx512vl")]
25762    fn test_mm_mask_cvtph_epu16() {
25763        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25764        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25765        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25766        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25767        assert_eq_m128i(r, e);
25768    }
25769
25770    #[simd_test(enable = "avx512fp16,avx512vl")]
25771    fn test_mm_maskz_cvtph_epu16() {
25772        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25773        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25774        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25775        assert_eq_m128i(r, e);
25776    }
25777
25778    #[simd_test(enable = "avx512fp16,avx512vl")]
25779    fn test_mm256_cvtph_epu16() {
25780        let a = _mm256_set_ph(
25781            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25782        );
25783        let r = _mm256_cvttph_epu16(a);
25784        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25785        assert_eq_m256i(r, e);
25786    }
25787
25788    #[simd_test(enable = "avx512fp16,avx512vl")]
25789    fn test_mm256_mask_cvtph_epu16() {
25790        let a = _mm256_set_ph(
25791            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25792        );
25793        let src = _mm256_set_epi16(
25794            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25795        );
25796        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25797        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25798        assert_eq_m256i(r, e);
25799    }
25800
25801    #[simd_test(enable = "avx512fp16,avx512vl")]
25802    fn test_mm256_maskz_cvtph_epu16() {
25803        let a = _mm256_set_ph(
25804            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25805        );
25806        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25807        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25808        assert_eq_m256i(r, e);
25809    }
25810
25811    #[simd_test(enable = "avx512fp16")]
25812    fn test_mm512_cvtph_epu16() {
25813        let a = _mm512_set_ph(
25814            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25815            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25816            31.0, 32.0,
25817        );
25818        let r = _mm512_cvttph_epu16(a);
25819        let e = _mm512_set_epi16(
25820            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25821            25, 26, 27, 28, 29, 30, 31, 32,
25822        );
25823        assert_eq_m512i(r, e);
25824    }
25825
25826    #[simd_test(enable = "avx512fp16")]
25827    fn test_mm512_mask_cvtph_epu16() {
25828        let a = _mm512_set_ph(
25829            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25830            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25831            31.0, 32.0,
25832        );
25833        let src = _mm512_set_epi16(
25834            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25835            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25836        );
25837        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25838        let e = _mm512_set_epi16(
25839            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25840            24, 34, 26, 36, 28, 38, 30, 40, 32,
25841        );
25842        assert_eq_m512i(r, e);
25843    }
25844
25845    #[simd_test(enable = "avx512fp16")]
25846    fn test_mm512_maskz_cvtph_epu16() {
25847        let a = _mm512_set_ph(
25848            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25849            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25850            31.0, 32.0,
25851        );
25852        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25853        let e = _mm512_set_epi16(
25854            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25855            0, 28, 0, 30, 0, 32,
25856        );
25857        assert_eq_m512i(r, e);
25858    }
25859
25860    #[simd_test(enable = "avx512fp16")]
25861    fn test_mm512_cvt_roundph_epu16() {
25862        let a = _mm512_set_ph(
25863            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25864            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25865            31.0, 32.0,
25866        );
25867        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25868        let e = _mm512_set_epi16(
25869            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25870            25, 26, 27, 28, 29, 30, 31, 32,
25871        );
25872        assert_eq_m512i(r, e);
25873    }
25874
25875    #[simd_test(enable = "avx512fp16")]
25876    fn test_mm512_mask_cvt_roundph_epu16() {
25877        let a = _mm512_set_ph(
25878            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25879            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25880            31.0, 32.0,
25881        );
25882        let src = _mm512_set_epi16(
25883            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25884            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25885        );
25886        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25887            src,
25888            0b01010101010101010101010101010101,
25889            a,
25890        );
25891        let e = _mm512_set_epi16(
25892            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25893            24, 34, 26, 36, 28, 38, 30, 40, 32,
25894        );
25895        assert_eq_m512i(r, e);
25896    }
25897
25898    #[simd_test(enable = "avx512fp16")]
25899    fn test_mm512_maskz_cvt_roundph_epu16() {
25900        let a = _mm512_set_ph(
25901            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25902            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25903            31.0, 32.0,
25904        );
25905        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25906            0b01010101010101010101010101010101,
25907            a,
25908        );
25909        let e = _mm512_set_epi16(
25910            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25911            0, 28, 0, 30, 0, 32,
25912        );
25913        assert_eq_m512i(r, e);
25914    }
25915
25916    #[simd_test(enable = "avx512fp16,avx512vl")]
25917    fn test_mm_cvttph_epi16() {
25918        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25919        let r = _mm_cvttph_epi16(a);
25920        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25921        assert_eq_m128i(r, e);
25922    }
25923
25924    #[simd_test(enable = "avx512fp16,avx512vl")]
25925    fn test_mm_mask_cvttph_epi16() {
25926        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25927        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25928        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25929        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25930        assert_eq_m128i(r, e);
25931    }
25932
25933    #[simd_test(enable = "avx512fp16,avx512vl")]
25934    fn test_mm_maskz_cvttph_epi16() {
25935        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25936        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25937        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25938        assert_eq_m128i(r, e);
25939    }
25940
25941    #[simd_test(enable = "avx512fp16,avx512vl")]
25942    fn test_mm256_cvttph_epi16() {
25943        let a = _mm256_set_ph(
25944            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25945        );
25946        let r = _mm256_cvttph_epi16(a);
25947        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25948        assert_eq_m256i(r, e);
25949    }
25950
25951    #[simd_test(enable = "avx512fp16,avx512vl")]
25952    fn test_mm256_mask_cvttph_epi16() {
25953        let a = _mm256_set_ph(
25954            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25955        );
25956        let src = _mm256_set_epi16(
25957            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25958        );
25959        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25960        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25961        assert_eq_m256i(r, e);
25962    }
25963
25964    #[simd_test(enable = "avx512fp16,avx512vl")]
25965    fn test_mm256_maskz_cvttph_epi16() {
25966        let a = _mm256_set_ph(
25967            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25968        );
25969        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25970        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25971        assert_eq_m256i(r, e);
25972    }
25973
25974    #[simd_test(enable = "avx512fp16")]
25975    fn test_mm512_cvttph_epi16() {
25976        let a = _mm512_set_ph(
25977            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25978            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25979            31.0, 32.0,
25980        );
25981        let r = _mm512_cvttph_epi16(a);
25982        let e = _mm512_set_epi16(
25983            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25984            25, 26, 27, 28, 29, 30, 31, 32,
25985        );
25986        assert_eq_m512i(r, e);
25987    }
25988
25989    #[simd_test(enable = "avx512fp16")]
25990    fn test_mm512_mask_cvttph_epi16() {
25991        let a = _mm512_set_ph(
25992            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25993            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25994            31.0, 32.0,
25995        );
25996        let src = _mm512_set_epi16(
25997            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25998            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25999        );
26000        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
26001        let e = _mm512_set_epi16(
26002            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
26003            24, 34, 26, 36, 28, 38, 30, 40, 32,
26004        );
26005        assert_eq_m512i(r, e);
26006    }
26007
26008    #[simd_test(enable = "avx512fp16")]
26009    fn test_mm512_maskz_cvttph_epi16() {
26010        let a = _mm512_set_ph(
26011            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26012            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26013            31.0, 32.0,
26014        );
26015        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
26016        let e = _mm512_set_epi16(
26017            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
26018            0, 28, 0, 30, 0, 32,
26019        );
26020        assert_eq_m512i(r, e);
26021    }
26022
26023    #[simd_test(enable = "avx512fp16")]
26024    fn test_mm512_cvtt_roundph_epi16() {
26025        let a = _mm512_set_ph(
26026            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26027            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26028            31.0, 32.0,
26029        );
26030        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
26031        let e = _mm512_set_epi16(
26032            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
26033            25, 26, 27, 28, 29, 30, 31, 32,
26034        );
26035        assert_eq_m512i(r, e);
26036    }
26037
26038    #[simd_test(enable = "avx512fp16")]
26039    fn test_mm512_mask_cvtt_roundph_epi16() {
26040        let a = _mm512_set_ph(
26041            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26042            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26043            31.0, 32.0,
26044        );
26045        let src = _mm512_set_epi16(
26046            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
26047            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
26048        );
26049        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
26050            src,
26051            0b01010101010101010101010101010101,
26052            a,
26053        );
26054        let e = _mm512_set_epi16(
26055            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
26056            24, 34, 26, 36, 28, 38, 30, 40, 32,
26057        );
26058        assert_eq_m512i(r, e);
26059    }
26060
26061    #[simd_test(enable = "avx512fp16")]
26062    fn test_mm512_maskz_cvtt_roundph_epi16() {
26063        let a = _mm512_set_ph(
26064            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26065            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26066            31.0, 32.0,
26067        );
26068        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
26069            0b01010101010101010101010101010101,
26070            a,
26071        );
26072        let e = _mm512_set_epi16(
26073            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
26074            0, 28, 0, 30, 0, 32,
26075        );
26076        assert_eq_m512i(r, e);
26077    }
26078
26079    #[simd_test(enable = "avx512fp16,avx512vl")]
26080    fn test_mm_cvttph_epu16() {
26081        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26082        let r = _mm_cvttph_epu16(a);
26083        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
26084        assert_eq_m128i(r, e);
26085    }
26086
26087    #[simd_test(enable = "avx512fp16,avx512vl")]
26088    fn test_mm_mask_cvttph_epu16() {
26089        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26090        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
26091        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
26092        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
26093        assert_eq_m128i(r, e);
26094    }
26095
26096    #[simd_test(enable = "avx512fp16,avx512vl")]
26097    fn test_mm_maskz_cvttph_epu16() {
26098        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26099        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
26100        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
26101        assert_eq_m128i(r, e);
26102    }
26103
26104    #[simd_test(enable = "avx512fp16,avx512vl")]
26105    fn test_mm256_cvttph_epu16() {
26106        let a = _mm256_set_ph(
26107            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26108        );
26109        let r = _mm256_cvttph_epu16(a);
26110        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26111        assert_eq_m256i(r, e);
26112    }
26113
26114    #[simd_test(enable = "avx512fp16,avx512vl")]
26115    fn test_mm256_mask_cvttph_epu16() {
26116        let a = _mm256_set_ph(
26117            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26118        );
26119        let src = _mm256_set_epi16(
26120            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26121        );
26122        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
26123        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26124        assert_eq_m256i(r, e);
26125    }
26126
26127    #[simd_test(enable = "avx512fp16,avx512vl")]
26128    fn test_mm256_maskz_cvttph_epu16() {
26129        let a = _mm256_set_ph(
26130            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26131        );
26132        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
26133        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26134        assert_eq_m256i(r, e);
26135    }
26136
26137    #[simd_test(enable = "avx512fp16")]
26138    fn test_mm512_cvttph_epu16() {
26139        let a = _mm512_set_ph(
26140            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26141            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26142            31.0, 32.0,
26143        );
26144        let r = _mm512_cvttph_epu16(a);
26145        let e = _mm512_set_epi16(
26146            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
26147            25, 26, 27, 28, 29, 30, 31, 32,
26148        );
26149        assert_eq_m512i(r, e);
26150    }
26151
26152    #[simd_test(enable = "avx512fp16")]
26153    fn test_mm512_mask_cvttph_epu16() {
26154        let a = _mm512_set_ph(
26155            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26156            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26157            31.0, 32.0,
26158        );
26159        let src = _mm512_set_epi16(
26160            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
26161            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
26162        );
26163        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
26164        let e = _mm512_set_epi16(
26165            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
26166            24, 34, 26, 36, 28, 38, 30, 40, 32,
26167        );
26168        assert_eq_m512i(r, e);
26169    }
26170
26171    #[simd_test(enable = "avx512fp16")]
26172    fn test_mm512_maskz_cvttph_epu16() {
26173        let a = _mm512_set_ph(
26174            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26175            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26176            31.0, 32.0,
26177        );
26178        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
26179        let e = _mm512_set_epi16(
26180            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
26181            0, 28, 0, 30, 0, 32,
26182        );
26183        assert_eq_m512i(r, e);
26184    }
26185
26186    #[simd_test(enable = "avx512fp16")]
26187    fn test_mm512_cvtt_roundph_epu16() {
26188        let a = _mm512_set_ph(
26189            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26190            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26191            31.0, 32.0,
26192        );
26193        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
26194        let e = _mm512_set_epi16(
26195            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
26196            25, 26, 27, 28, 29, 30, 31, 32,
26197        );
26198        assert_eq_m512i(r, e);
26199    }
26200
26201    #[simd_test(enable = "avx512fp16")]
26202    fn test_mm512_mask_cvtt_roundph_epu16() {
26203        let a = _mm512_set_ph(
26204            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26205            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26206            31.0, 32.0,
26207        );
26208        let src = _mm512_set_epi16(
26209            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
26210            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
26211        );
26212        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
26213            src,
26214            0b01010101010101010101010101010101,
26215            a,
26216        );
26217        let e = _mm512_set_epi16(
26218            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
26219            24, 34, 26, 36, 28, 38, 30, 40, 32,
26220        );
26221        assert_eq_m512i(r, e);
26222    }
26223
26224    #[simd_test(enable = "avx512fp16")]
26225    fn test_mm512_maskz_cvtt_roundph_epu16() {
26226        let a = _mm512_set_ph(
26227            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26228            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
26229            31.0, 32.0,
26230        );
26231        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
26232            0b01010101010101010101010101010101,
26233            a,
26234        );
26235        let e = _mm512_set_epi16(
26236            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
26237            0, 28, 0, 30, 0, 32,
26238        );
26239        assert_eq_m512i(r, e);
26240    }
26241
26242    #[simd_test(enable = "avx512fp16,avx512vl")]
26243    fn test_mm_cvtph_epi32() {
26244        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26245        let r = _mm_cvtph_epi32(a);
26246        let e = _mm_set_epi32(1, 2, 3, 4);
26247        assert_eq_m128i(r, e);
26248    }
26249
26250    #[simd_test(enable = "avx512fp16,avx512vl")]
26251    fn test_mm_mask_cvtph_epi32() {
26252        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26253        let src = _mm_set_epi32(10, 11, 12, 13);
26254        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
26255        let e = _mm_set_epi32(10, 2, 12, 4);
26256        assert_eq_m128i(r, e);
26257    }
26258
26259    #[simd_test(enable = "avx512fp16,avx512vl")]
26260    fn test_mm_maskz_cvtph_epi32() {
26261        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26262        let r = _mm_maskz_cvtph_epi32(0b0101, a);
26263        let e = _mm_set_epi32(0, 2, 0, 4);
26264        assert_eq_m128i(r, e);
26265    }
26266
26267    #[simd_test(enable = "avx512fp16,avx512vl")]
26268    fn test_mm256_cvtph_epi32() {
26269        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26270        let r = _mm256_cvtph_epi32(a);
26271        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26272        assert_eq_m256i(r, e);
26273    }
26274
26275    #[simd_test(enable = "avx512fp16,avx512vl")]
26276    fn test_mm256_mask_cvtph_epi32() {
26277        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26278        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26279        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
26280        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26281        assert_eq_m256i(r, e);
26282    }
26283
26284    #[simd_test(enable = "avx512fp16,avx512vl")]
26285    fn test_mm256_maskz_cvtph_epi32() {
26286        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26287        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
26288        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26289        assert_eq_m256i(r, e);
26290    }
26291
26292    #[simd_test(enable = "avx512fp16")]
26293    fn test_mm512_cvtph_epi32() {
26294        let a = _mm256_set_ph(
26295            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26296        );
26297        let r = _mm512_cvtph_epi32(a);
26298        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26299        assert_eq_m512i(r, e);
26300    }
26301
26302    #[simd_test(enable = "avx512fp16")]
26303    fn test_mm512_mask_cvtph_epi32() {
26304        let a = _mm256_set_ph(
26305            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26306        );
26307        let src = _mm512_set_epi32(
26308            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26309        );
26310        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
26311        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26312        assert_eq_m512i(r, e);
26313    }
26314
26315    #[simd_test(enable = "avx512fp16")]
26316    fn test_mm512_maskz_cvtph_epi32() {
26317        let a = _mm256_set_ph(
26318            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26319        );
26320        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
26321        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26322        assert_eq_m512i(r, e);
26323    }
26324
26325    #[simd_test(enable = "avx512fp16")]
26326    fn test_mm512_cvt_roundph_epi32() {
26327        let a = _mm256_set_ph(
26328            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26329        );
26330        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26331        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26332        assert_eq_m512i(r, e);
26333    }
26334
26335    #[simd_test(enable = "avx512fp16")]
26336    fn test_mm512_mask_cvt_roundph_epi32() {
26337        let a = _mm256_set_ph(
26338            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26339        );
26340        let src = _mm512_set_epi32(
26341            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26342        );
26343        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26344            src,
26345            0b0101010101010101,
26346            a,
26347        );
26348        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26349        assert_eq_m512i(r, e);
26350    }
26351
26352    #[simd_test(enable = "avx512fp16")]
26353    fn test_mm512_maskz_cvt_roundph_epi32() {
26354        let a = _mm256_set_ph(
26355            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26356        );
26357        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26358            0b0101010101010101,
26359            a,
26360        );
26361        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26362        assert_eq_m512i(r, e);
26363    }
26364
26365    #[simd_test(enable = "avx512fp16")]
26366    fn test_mm_cvtsh_i32() {
26367        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26368        let r = _mm_cvtsh_i32(a);
26369        assert_eq!(r, 1);
26370    }
26371
26372    #[simd_test(enable = "avx512fp16")]
26373    fn test_mm_cvt_roundsh_i32() {
26374        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26375        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26376        assert_eq!(r, 1);
26377    }
26378
26379    #[simd_test(enable = "avx512fp16,avx512vl")]
26380    fn test_mm_cvtph_epu32() {
26381        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26382        let r = _mm_cvtph_epu32(a);
26383        let e = _mm_set_epi32(1, 2, 3, 4);
26384        assert_eq_m128i(r, e);
26385    }
26386
26387    #[simd_test(enable = "avx512fp16,avx512vl")]
26388    fn test_mm_mask_cvtph_epu32() {
26389        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26390        let src = _mm_set_epi32(10, 11, 12, 13);
26391        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
26392        let e = _mm_set_epi32(10, 2, 12, 4);
26393        assert_eq_m128i(r, e);
26394    }
26395
26396    #[simd_test(enable = "avx512fp16,avx512vl")]
26397    fn test_mm_maskz_cvtph_epu32() {
26398        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26399        let r = _mm_maskz_cvtph_epu32(0b0101, a);
26400        let e = _mm_set_epi32(0, 2, 0, 4);
26401        assert_eq_m128i(r, e);
26402    }
26403
26404    #[simd_test(enable = "avx512fp16,avx512vl")]
26405    fn test_mm256_cvtph_epu32() {
26406        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26407        let r = _mm256_cvtph_epu32(a);
26408        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26409        assert_eq_m256i(r, e);
26410    }
26411
26412    #[simd_test(enable = "avx512fp16,avx512vl")]
26413    fn test_mm256_mask_cvtph_epu32() {
26414        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26415        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26416        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
26417        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26418        assert_eq_m256i(r, e);
26419    }
26420
26421    #[simd_test(enable = "avx512fp16,avx512vl")]
26422    fn test_mm256_maskz_cvtph_epu32() {
26423        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26424        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
26425        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26426        assert_eq_m256i(r, e);
26427    }
26428
26429    #[simd_test(enable = "avx512fp16")]
26430    fn test_mm512_cvtph_epu32() {
26431        let a = _mm256_set_ph(
26432            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26433        );
26434        let r = _mm512_cvtph_epu32(a);
26435        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26436        assert_eq_m512i(r, e);
26437    }
26438
26439    #[simd_test(enable = "avx512fp16")]
26440    fn test_mm512_mask_cvtph_epu32() {
26441        let a = _mm256_set_ph(
26442            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26443        );
26444        let src = _mm512_set_epi32(
26445            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26446        );
26447        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
26448        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26449        assert_eq_m512i(r, e);
26450    }
26451
26452    #[simd_test(enable = "avx512fp16")]
26453    fn test_mm512_maskz_cvtph_epu32() {
26454        let a = _mm256_set_ph(
26455            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26456        );
26457        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
26458        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26459        assert_eq_m512i(r, e);
26460    }
26461
26462    #[simd_test(enable = "avx512fp16")]
26463    fn test_mm512_cvt_roundph_epu32() {
26464        let a = _mm256_set_ph(
26465            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26466        );
26467        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26468        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26469        assert_eq_m512i(r, e);
26470    }
26471
26472    #[simd_test(enable = "avx512fp16")]
26473    fn test_mm512_mask_cvt_roundph_epu32() {
26474        let a = _mm256_set_ph(
26475            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26476        );
26477        let src = _mm512_set_epi32(
26478            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26479        );
26480        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26481            src,
26482            0b0101010101010101,
26483            a,
26484        );
26485        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26486        assert_eq_m512i(r, e);
26487    }
26488
26489    #[simd_test(enable = "avx512fp16")]
26490    fn test_mm512_maskz_cvt_roundph_epu32() {
26491        let a = _mm256_set_ph(
26492            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26493        );
26494        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26495            0b0101010101010101,
26496            a,
26497        );
26498        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26499        assert_eq_m512i(r, e);
26500    }
26501
26502    #[simd_test(enable = "avx512fp16")]
26503    fn test_mm_cvtsh_u32() {
26504        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26505        let r = _mm_cvtsh_u32(a);
26506        assert_eq!(r, 1);
26507    }
26508
26509    #[simd_test(enable = "avx512fp16")]
26510    fn test_mm_cvt_roundsh_u32() {
26511        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26512        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26513        assert_eq!(r, 1);
26514    }
26515
26516    #[simd_test(enable = "avx512fp16,avx512vl")]
26517    fn test_mm_cvttph_epi32() {
26518        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26519        let r = _mm_cvttph_epi32(a);
26520        let e = _mm_set_epi32(1, 2, 3, 4);
26521        assert_eq_m128i(r, e);
26522    }
26523
26524    #[simd_test(enable = "avx512fp16,avx512vl")]
26525    fn test_mm_mask_cvttph_epi32() {
26526        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26527        let src = _mm_set_epi32(10, 11, 12, 13);
26528        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
26529        let e = _mm_set_epi32(10, 2, 12, 4);
26530        assert_eq_m128i(r, e);
26531    }
26532
26533    #[simd_test(enable = "avx512fp16,avx512vl")]
26534    fn test_mm_maskz_cvttph_epi32() {
26535        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26536        let r = _mm_maskz_cvttph_epi32(0b0101, a);
26537        let e = _mm_set_epi32(0, 2, 0, 4);
26538        assert_eq_m128i(r, e);
26539    }
26540
26541    #[simd_test(enable = "avx512fp16,avx512vl")]
26542    fn test_mm256_cvttph_epi32() {
26543        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26544        let r = _mm256_cvttph_epi32(a);
26545        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26546        assert_eq_m256i(r, e);
26547    }
26548
26549    #[simd_test(enable = "avx512fp16,avx512vl")]
26550    fn test_mm256_mask_cvttph_epi32() {
26551        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26552        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26553        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
26554        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26555        assert_eq_m256i(r, e);
26556    }
26557
26558    #[simd_test(enable = "avx512fp16,avx512vl")]
26559    fn test_mm256_maskz_cvttph_epi32() {
26560        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26561        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
26562        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26563        assert_eq_m256i(r, e);
26564    }
26565
26566    #[simd_test(enable = "avx512fp16")]
26567    fn test_mm512_cvttph_epi32() {
26568        let a = _mm256_set_ph(
26569            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26570        );
26571        let r = _mm512_cvttph_epi32(a);
26572        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26573        assert_eq_m512i(r, e);
26574    }
26575
26576    #[simd_test(enable = "avx512fp16")]
26577    fn test_mm512_mask_cvttph_epi32() {
26578        let a = _mm256_set_ph(
26579            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26580        );
26581        let src = _mm512_set_epi32(
26582            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26583        );
26584        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
26585        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26586        assert_eq_m512i(r, e);
26587    }
26588
26589    #[simd_test(enable = "avx512fp16")]
26590    fn test_mm512_maskz_cvttph_epi32() {
26591        let a = _mm256_set_ph(
26592            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26593        );
26594        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
26595        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26596        assert_eq_m512i(r, e);
26597    }
26598
26599    #[simd_test(enable = "avx512fp16")]
26600    fn test_mm512_cvtt_roundph_epi32() {
26601        let a = _mm256_set_ph(
26602            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26603        );
26604        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
26605        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26606        assert_eq_m512i(r, e);
26607    }
26608
26609    #[simd_test(enable = "avx512fp16")]
26610    fn test_mm512_mask_cvtt_roundph_epi32() {
26611        let a = _mm256_set_ph(
26612            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26613        );
26614        let src = _mm512_set_epi32(
26615            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26616        );
26617        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26618        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26619        assert_eq_m512i(r, e);
26620    }
26621
26622    #[simd_test(enable = "avx512fp16")]
26623    fn test_mm512_maskz_cvtt_roundph_epi32() {
26624        let a = _mm256_set_ph(
26625            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26626        );
26627        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26628        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26629        assert_eq_m512i(r, e);
26630    }
26631
26632    #[simd_test(enable = "avx512fp16")]
26633    fn test_mm_cvttsh_i32() {
26634        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26635        let r = _mm_cvttsh_i32(a);
26636        assert_eq!(r, 1);
26637    }
26638
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of the scalar truncating conversion; only the lowest lane matters.
    fn test_mm_cvtt_roundsh_i32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
        assert_eq!(r, 1);
    }
26645
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Only the low 4 f16 lanes are converted to u32; the upper 4 inputs are don't-cares (set to 0.0).
    fn test_mm_cvttph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvttph_epu32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }
26653
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking: mask 0b0101 converts lanes 0 and 2; lanes 1 and 3 keep `src` (10, 12).
    fn test_mm_mask_cvttph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }
26662
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking: unselected lanes (mask bits 1 and 3 clear) come back as 0.
    fn test_mm_maskz_cvttph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvttph_epu32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }
26670
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // 128-bit f16 input widens to a 256-bit u32 result: all 8 lanes convert exactly.
    fn test_mm256_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvttph_epu32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }
26678
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking over 8 lanes: zero mask bits keep the matching `src` entries (10, 12, 14, 16).
    fn test_mm256_mask_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }
26687
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking over 8 lanes: unselected lanes are zeroed.
    fn test_mm256_maskz_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }
26695
    #[simd_test(enable = "avx512fp16")]
    // 256-bit f16 input widens to a 512-bit u32 result: all 16 lanes convert exactly.
    fn test_mm512_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvttph_epu32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }
26705
    #[simd_test(enable = "avx512fp16")]
    // Merge-masking over 16 lanes: zero mask bits keep the matching `src` entries.
    fn test_mm512_mask_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }
26718
    #[simd_test(enable = "avx512fp16")]
    // Zero-masking over 16 lanes: unselected lanes are zeroed.
    fn test_mm512_maskz_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26728
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of the 16-lane truncating f16 -> u32 conversion.
    fn test_mm512_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }
26738
    #[simd_test(enable = "avx512fp16")]
    // SAE + merge-masking: zero mask bits keep the matching `src` entries.
    fn test_mm512_mask_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }
26751
    #[simd_test(enable = "avx512fp16")]
    // SAE + zero-masking: unselected lanes are zeroed.
    fn test_mm512_maskz_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26761
    #[simd_test(enable = "avx512fp16")]
    // Scalar truncating f16 -> u32: only the lowest lane (1.0) is converted.
    fn test_mm_cvttsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttsh_u32(a);
        assert_eq!(r, 1);
    }
26768
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of the scalar truncating f16 -> u32 conversion.
    fn test_mm_cvtt_roundsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
        assert_eq!(r, 1);
    }
26775
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Only the low 2 f16 lanes are converted to i64; upper inputs are don't-cares.
    fn test_mm_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
26783
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b01: lane 0 converts (2.0 -> 2), lane 1 keeps src (3).
    fn test_mm_mask_cvtph_epi64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
26792
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b01: lane 1 is zeroed.
    fn test_mm_maskz_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
26800
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Low 4 f16 lanes widen to 4 i64 lanes; upper inputs are don't-cares.
    fn test_mm256_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
26808
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b0101: lanes 1 and 3 keep src (5, 7).
    fn test_mm256_mask_cvtph_epi64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
26817
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b0101: lanes 1 and 3 are zeroed.
    fn test_mm256_maskz_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
26825
    #[simd_test(enable = "avx512fp16")]
    // All 8 f16 lanes of the 128-bit input widen to 8 i64 lanes.
    fn test_mm512_cvtph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26833
    #[simd_test(enable = "avx512fp16")]
    // Merge-masking over 8 i64 lanes: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvtph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26842
    #[simd_test(enable = "avx512fp16")]
    // Zero-masking over 8 i64 lanes: unselected lanes are zeroed.
    fn test_mm512_maskz_cvtph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26850
    #[simd_test(enable = "avx512fp16")]
    // Explicit rounding (nearest + SAE) variant; integral inputs convert exactly.
    fn test_mm512_cvt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26858
    #[simd_test(enable = "avx512fp16")]
    // Rounding + merge-masking: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvt_roundph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26869
    #[simd_test(enable = "avx512fp16")]
    // Rounding + zero-masking: unselected lanes are zeroed.
    fn test_mm512_maskz_cvt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26879
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Only the low 2 f16 lanes convert to u64; upper inputs are don't-cares.
    fn test_mm_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
26887
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b01: lane 1 keeps src (3).
    fn test_mm_mask_cvtph_epu64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
26896
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b01: lane 1 is zeroed.
    fn test_mm_maskz_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
26904
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Low 4 f16 lanes widen to 4 u64 lanes; upper inputs are don't-cares.
    fn test_mm256_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
26912
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b0101: lanes 1 and 3 keep src (5, 7).
    fn test_mm256_mask_cvtph_epu64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
26921
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b0101: lanes 1 and 3 are zeroed.
    fn test_mm256_maskz_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
26929
    #[simd_test(enable = "avx512fp16")]
    // All 8 f16 lanes of the 128-bit input widen to 8 u64 lanes.
    fn test_mm512_cvtph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26937
    #[simd_test(enable = "avx512fp16")]
    // Merge-masking over 8 u64 lanes: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvtph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26946
    #[simd_test(enable = "avx512fp16")]
    // Zero-masking over 8 u64 lanes: unselected lanes are zeroed.
    fn test_mm512_maskz_cvtph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26954
    #[simd_test(enable = "avx512fp16")]
    // Explicit rounding (nearest + SAE) f16 -> u64; integral inputs convert exactly.
    fn test_mm512_cvt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26962
    #[simd_test(enable = "avx512fp16")]
    // Rounding + merge-masking: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvt_roundph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26973
    #[simd_test(enable = "avx512fp16")]
    // Rounding + zero-masking: unselected lanes are zeroed.
    fn test_mm512_maskz_cvt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26983
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Truncating f16 -> i64: only the low 2 lanes are converted.
    fn test_mm_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
26991
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b01: lane 1 keeps src (3).
    fn test_mm_mask_cvttph_epi64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
27000
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b01: lane 1 is zeroed.
    fn test_mm_maskz_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
27008
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Truncating f16 -> i64 on 4 lanes; upper inputs are don't-cares.
    fn test_mm256_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
27016
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b0101: lanes 1 and 3 keep src (5, 7).
    fn test_mm256_mask_cvttph_epi64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
27025
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b0101: lanes 1 and 3 are zeroed.
    fn test_mm256_maskz_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
27033
    #[simd_test(enable = "avx512fp16")]
    // Truncating f16 -> i64 on all 8 lanes of the 128-bit input.
    fn test_mm512_cvttph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27041
    #[simd_test(enable = "avx512fp16")]
    // Merge-masking over 8 i64 lanes: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvttph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27050
    #[simd_test(enable = "avx512fp16")]
    // Zero-masking over 8 i64 lanes: unselected lanes are zeroed.
    fn test_mm512_maskz_cvttph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27058
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of the 8-lane truncating f16 -> i64 conversion.
    fn test_mm512_cvtt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27066
    #[simd_test(enable = "avx512fp16")]
    // SAE + merge-masking: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvtt_roundph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27075
    #[simd_test(enable = "avx512fp16")]
    // SAE + zero-masking: unselected lanes are zeroed.
    fn test_mm512_maskz_cvtt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27083
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Truncating f16 -> u64: only the low 2 lanes are converted.
    fn test_mm_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
27091
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b01: lane 1 keeps src (3).
    fn test_mm_mask_cvttph_epu64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
27100
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b01: lane 1 is zeroed.
    fn test_mm_maskz_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
27108
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Truncating f16 -> u64 on 4 lanes; upper inputs are don't-cares.
    fn test_mm256_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
27116
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b0101: lanes 1 and 3 keep src (5, 7).
    fn test_mm256_mask_cvttph_epu64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
27125
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b0101: lanes 1 and 3 are zeroed.
    fn test_mm256_maskz_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
27133
    #[simd_test(enable = "avx512fp16")]
    // Truncating f16 -> u64 on all 8 lanes of the 128-bit input.
    fn test_mm512_cvttph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27141
    #[simd_test(enable = "avx512fp16")]
    // Merge-masking over 8 u64 lanes: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvttph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27150
    #[simd_test(enable = "avx512fp16")]
    // Zero-masking over 8 u64 lanes: unselected lanes are zeroed.
    fn test_mm512_maskz_cvttph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27158
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of the 8-lane truncating f16 -> u64 conversion.
    fn test_mm512_cvtt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27166
    #[simd_test(enable = "avx512fp16")]
    // SAE + merge-masking: zero mask bits keep src (9, 11, 13, 15).
    fn test_mm512_mask_cvtt_roundph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27175
    #[simd_test(enable = "avx512fp16")]
    // SAE + zero-masking: unselected lanes are zeroed.
    fn test_mm512_maskz_cvtt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27183
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // f16 -> f32 widening: the low 4 half lanes become 4 single lanes; values are exactly representable.
    fn test_mm_cvtxph_ps() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtxph_ps(a);
        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        assert_eq_m128(r, e);
    }
27191
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking with mask 0b0101: lanes 1 and 3 keep src (10.0, 12.0).
    fn test_mm_mask_cvtxph_ps() {
        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
        assert_eq_m128(r, e);
    }
27200
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking with mask 0b0101: lanes 1 and 3 are zeroed.
    fn test_mm_maskz_cvtxph_ps() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtxph_ps(0b0101, a);
        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
        assert_eq_m128(r, e);
    }
27208
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // f16 -> f32 widening of all 8 half lanes into a 256-bit result.
    fn test_mm256_cvtxph_ps() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtxph_ps(a);
        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m256(r, e);
    }
27216
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Merge-masking over 8 f32 lanes: zero mask bits keep src (10.0, 12.0, 14.0, 16.0).
    fn test_mm256_mask_cvtxph_ps() {
        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m256(r, e);
    }
27225
    #[simd_test(enable = "avx512fp16,avx512vl")]
    // Zero-masking over 8 f32 lanes: unselected lanes are zeroed.
    fn test_mm256_maskz_cvtxph_ps() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m256(r, e);
    }
27233
    #[simd_test(enable = "avx512fp16")]
    // f16 -> f32 widening of all 16 half lanes into a 512-bit result.
    fn test_mm512_cvtxph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtxph_ps(a);
        let e = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(r, e);
    }
27245
    #[simd_test(enable = "avx512fp16")]
    // Merge-masking over 16 f32 lanes: zero mask bits keep the matching `src` entries.
    fn test_mm512_mask_cvtxph_ps() {
        let src = _mm512_set_ps(
            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
            24.0, 25.0,
        );
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
        let e = _mm512_set_ps(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m512(r, e);
    }
27262
    #[simd_test(enable = "avx512fp16")]
    // Zero-masking over 16 f32 lanes: unselected lanes are zeroed.
    fn test_mm512_maskz_cvtxph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
        let e = _mm512_set_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m512(r, e);
    }
27274
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of the 16-lane f16 -> f32 widening conversion.
    fn test_mm512_cvtx_roundph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(r, e);
    }
27286
    #[simd_test(enable = "avx512fp16")]
    // SAE + merge-masking: zero mask bits keep the matching `src` entries.
    fn test_mm512_mask_cvtx_roundph_ps() {
        let src = _mm512_set_ps(
            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
            24.0, 25.0,
        );
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_ps(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m512(r, e);
    }
27303
    #[simd_test(enable = "avx512fp16")]
    // SAE + zero-masking: unselected lanes are zeroed.
    fn test_mm512_maskz_cvtx_roundph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m512(r, e);
    }
27315
    #[simd_test(enable = "avx512fp16")]
    // Scalar f16 -> f32: lane 0 of the result is b's lowest half (1.0);
    // the upper 3 lanes are copied from `a` (20.0, 21.0, 22.0).
    fn test_mm_cvtsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_ss(a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }
27324
    #[simd_test(enable = "avx512fp16")]
    // Masked scalar conversion, both mask states: mask 0 takes lane 0 from `src` (3.0),
    // mask 1 performs the conversion (1.0). Upper lanes always come from `a`.
    fn test_mm_mask_cvtsh_ss() {
        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }
27337
    #[simd_test(enable = "avx512fp16")]
    // Zero-masked scalar conversion, both mask states: mask 0 zeroes lane 0,
    // mask 1 converts it. Upper lanes always come from `a`.
    fn test_mm_maskz_cvtsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvtsh_ss(0, a, b);
        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_maskz_cvtsh_ss(1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }
27349
    #[simd_test(enable = "avx512fp16")]
    // SAE variant of the scalar f16 -> f32 conversion; upper lanes pass through from `a`.
    fn test_mm_cvt_roundsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }
27358
    #[simd_test(enable = "avx512fp16")]
    // SAE + merge-masking, both mask states: mask 0 keeps lane 0 from `src`, mask 1 converts.
    fn test_mm_mask_cvt_roundsh_ss() {
        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }
27371
    #[simd_test(enable = "avx512fp16")]
    // SAE + zero-masking, both mask states: mask 0 zeroes lane 0, mask 1 converts.
    fn test_mm_maskz_cvt_roundsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }
27383
27384    #[simd_test(enable = "avx512fp16,avx512vl")]
27385    fn test_mm_cvtph_pd() {
27386        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27387        let r = _mm_cvtph_pd(a);
27388        let e = _mm_set_pd(1.0, 2.0);
27389        assert_eq_m128d(r, e);
27390    }
27391
27392    #[simd_test(enable = "avx512fp16,avx512vl")]
27393    fn test_mm_mask_cvtph_pd() {
27394        let src = _mm_set_pd(10.0, 11.0);
27395        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27396        let r = _mm_mask_cvtph_pd(src, 0b01, a);
27397        let e = _mm_set_pd(10.0, 2.0);
27398        assert_eq_m128d(r, e);
27399    }
27400
27401    #[simd_test(enable = "avx512fp16,avx512vl")]
27402    fn test_mm_maskz_cvtph_pd() {
27403        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27404        let r = _mm_maskz_cvtph_pd(0b01, a);
27405        let e = _mm_set_pd(0.0, 2.0);
27406        assert_eq_m128d(r, e);
27407    }
27408
27409    #[simd_test(enable = "avx512fp16,avx512vl")]
27410    fn test_mm256_cvtph_pd() {
27411        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27412        let r = _mm256_cvtph_pd(a);
27413        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
27414        assert_eq_m256d(r, e);
27415    }
27416
27417    #[simd_test(enable = "avx512fp16,avx512vl")]
27418    fn test_mm256_mask_cvtph_pd() {
27419        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
27420        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27421        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
27422        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
27423        assert_eq_m256d(r, e);
27424    }
27425
27426    #[simd_test(enable = "avx512fp16,avx512vl")]
27427    fn test_mm256_maskz_cvtph_pd() {
27428        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27429        let r = _mm256_maskz_cvtph_pd(0b0101, a);
27430        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
27431        assert_eq_m256d(r, e);
27432    }
27433
27434    #[simd_test(enable = "avx512fp16")]
27435    fn test_mm512_cvtph_pd() {
27436        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27437        let r = _mm512_cvtph_pd(a);
27438        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27439        assert_eq_m512d(r, e);
27440    }
27441
27442    #[simd_test(enable = "avx512fp16")]
27443    fn test_mm512_mask_cvtph_pd() {
27444        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27445        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27446        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
27447        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27448        assert_eq_m512d(r, e);
27449    }
27450
27451    #[simd_test(enable = "avx512fp16")]
27452    fn test_mm512_maskz_cvtph_pd() {
27453        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27454        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
27455        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27456        assert_eq_m512d(r, e);
27457    }
27458
27459    #[simd_test(enable = "avx512fp16")]
27460    fn test_mm512_cvt_roundph_pd() {
27461        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27462        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
27463        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27464        assert_eq_m512d(r, e);
27465    }
27466
27467    #[simd_test(enable = "avx512fp16")]
27468    fn test_mm512_mask_cvt_roundph_pd() {
27469        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27470        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27471        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
27472        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27473        assert_eq_m512d(r, e);
27474    }
27475
27476    #[simd_test(enable = "avx512fp16")]
27477    fn test_mm512_maskz_cvt_roundph_pd() {
27478        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27479        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
27480        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27481        assert_eq_m512d(r, e);
27482    }
27483
27484    #[simd_test(enable = "avx512fp16")]
27485    fn test_mm_cvtsh_sd() {
27486        let a = _mm_setr_pd(2.0, 20.0);
27487        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27488        let r = _mm_cvtsh_sd(a, b);
27489        let e = _mm_setr_pd(1.0, 20.0);
27490        assert_eq_m128d(r, e);
27491    }
27492
27493    #[simd_test(enable = "avx512fp16")]
27494    fn test_mm_mask_cvtsh_sd() {
27495        let src = _mm_setr_pd(3.0, 11.0);
27496        let a = _mm_setr_pd(2.0, 20.0);
27497        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27498        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
27499        let e = _mm_setr_pd(3.0, 20.0);
27500        assert_eq_m128d(r, e);
27501        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
27502        let e = _mm_setr_pd(1.0, 20.0);
27503        assert_eq_m128d(r, e);
27504    }
27505
27506    #[simd_test(enable = "avx512fp16")]
27507    fn test_mm_maskz_cvtsh_sd() {
27508        let a = _mm_setr_pd(2.0, 20.0);
27509        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27510        let r = _mm_maskz_cvtsh_sd(0, a, b);
27511        let e = _mm_setr_pd(0.0, 20.0);
27512        assert_eq_m128d(r, e);
27513        let r = _mm_maskz_cvtsh_sd(1, a, b);
27514        let e = _mm_setr_pd(1.0, 20.0);
27515        assert_eq_m128d(r, e);
27516    }
27517
27518    #[simd_test(enable = "avx512fp16")]
27519    fn test_mm_cvt_roundsh_sd() {
27520        let a = _mm_setr_pd(2.0, 20.0);
27521        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27522        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
27523        let e = _mm_setr_pd(1.0, 20.0);
27524        assert_eq_m128d(r, e);
27525    }
27526
27527    #[simd_test(enable = "avx512fp16")]
27528    fn test_mm_mask_cvt_roundsh_sd() {
27529        let src = _mm_setr_pd(3.0, 11.0);
27530        let a = _mm_setr_pd(2.0, 20.0);
27531        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27532        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27533        let e = _mm_setr_pd(3.0, 20.0);
27534        assert_eq_m128d(r, e);
27535        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27536        let e = _mm_setr_pd(1.0, 20.0);
27537        assert_eq_m128d(r, e);
27538    }
27539
27540    #[simd_test(enable = "avx512fp16")]
27541    fn test_mm_maskz_cvt_roundsh_sd() {
27542        let a = _mm_setr_pd(2.0, 20.0);
27543        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27544        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
27545        let e = _mm_setr_pd(0.0, 20.0);
27546        assert_eq_m128d(r, e);
27547        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
27548        let e = _mm_setr_pd(1.0, 20.0);
27549        assert_eq_m128d(r, e);
27550    }
27551
27552    #[simd_test(enable = "avx512fp16")]
27553    const fn test_mm_cvtsh_h() {
27554        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
27555        let r = _mm_cvtsh_h(a);
27556        assert_eq!(r, 1.0);
27557    }
27558
27559    #[simd_test(enable = "avx512fp16")]
27560    const fn test_mm256_cvtsh_h() {
27561        let a = _mm256_setr_ph(
27562            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27563        );
27564        let r = _mm256_cvtsh_h(a);
27565        assert_eq!(r, 1.0);
27566    }
27567
27568    #[simd_test(enable = "avx512fp16")]
27569    const fn test_mm512_cvtsh_h() {
27570        let a = _mm512_setr_ph(
27571            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27572            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
27573            31.0, 32.0,
27574        );
27575        let r = _mm512_cvtsh_h(a);
27576        assert_eq!(r, 1.0);
27577    }
27578
27579    #[simd_test(enable = "avx512fp16")]
27580    const fn test_mm_cvtsi128_si16() {
27581        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
27582        let r = _mm_cvtsi128_si16(a);
27583        assert_eq!(r, 1);
27584    }
27585
27586    #[simd_test(enable = "avx512fp16")]
27587    const fn test_mm_cvtsi16_si128() {
27588        let a = 1;
27589        let r = _mm_cvtsi16_si128(a);
27590        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
27591        assert_eq_m128i(r, e);
27592    }
27593}