core/stdarch/crates/core_arch/src/x86/
avx512fp16.rs

1use crate::arch::asm;
2use crate::core_arch::{simd::*, x86::*};
3use crate::intrinsics::{fmaf16, simd::*};
4use crate::ptr;
5
/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_ph(
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m128h {
    // Per Intel convention, arguments are listed from the highest lane (e7) down to
    // the lowest (e0), while the backing array is lane 0 first — hence the reversal.
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}
25
/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set_ph(
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m256h {
    // Arguments are highest-lane-first (Intel convention); the array is lane 0 first.
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}
55
/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_set_ph(
    e31: f16,
    e30: f16,
    e29: f16,
    e28: f16,
    e27: f16,
    e26: f16,
    e25: f16,
    e24: f16,
    e23: f16,
    e22: f16,
    e21: f16,
    e20: f16,
    e19: f16,
    e18: f16,
    e17: f16,
    e16: f16,
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m512h {
    // Arguments are highest-lane-first (Intel convention); the array is lane 0 first.
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}
102
/// Copy half-precision (16-bit) floating-point element a to the lower element of dst and zero
/// the upper 7 elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set_sh(a: f16) -> __m128h {
    // Lane 0 holds the scalar; the remaining 7 lanes are explicitly zeroed.
    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
}
114
/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_set1_ph(a: f16) -> __m128h {
    // SAFETY: f16x8 and __m128h are both 128-bit vectors of 8 x f16 — same layout.
    unsafe { transmute(f16x8::splat(a)) }
}
125
/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_set1_ph(a: f16) -> __m256h {
    // SAFETY: f16x16 and __m256h are both 256-bit vectors of 16 x f16 — same layout.
    unsafe { transmute(f16x16::splat(a)) }
}
136
/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_set1_ph(a: f16) -> __m512h {
    // SAFETY: f16x32 and __m512h are both 512-bit vectors of 32 x f16 — same layout.
    unsafe { transmute(f16x32::splat(a)) }
}
147
/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
) -> __m128h {
    // "setr" takes arguments lowest-lane-first, matching the array layout directly.
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}
167
/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
) -> __m256h {
    // "setr" takes arguments lowest-lane-first, matching the array layout directly.
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}
197
/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
    e16: f16,
    e17: f16,
    e18: f16,
    e19: f16,
    e20: f16,
    e21: f16,
    e22: f16,
    e23: f16,
    e24: f16,
    e25: f16,
    e26: f16,
    e27: f16,
    e28: f16,
    e29: f16,
    e30: f16,
    e31: f16,
) -> __m512h {
    // "setr" takes arguments lowest-lane-first, matching the array layout directly.
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}
244
/// Return vector of type __m128h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_setzero_ph() -> __m128h {
    // SAFETY: f16x8 and __m128h have identical size and layout; all-zero bits are
    // a valid f16 vector (eight +0.0 values).
    unsafe { transmute(f16x8::ZERO) }
}
255
/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_setzero_ph() -> __m256h {
    // All 16 lanes are +0.0.
    f16x16::ZERO.as_m256h()
}
266
/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_setzero_ph() -> __m512h {
    // All 32 lanes are +0.0.
    f16x32::ZERO.as_m512h()
}
277
/// Return vector of type `__m128h` with indetermination elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_undefined_ph() -> __m128h {
    // Deliberately returns zero rather than uninitialized memory (see doc above).
    f16x8::ZERO.as_m128h()
}
291
/// Return vector of type `__m256h` with indetermination elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_undefined_ph() -> __m256h {
    // Deliberately returns zero rather than uninitialized memory (see doc above).
    f16x16::ZERO.as_m256h()
}
305
/// Return vector of type `__m512h` with indetermination elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_undefined_ph() -> __m512h {
    // Deliberately returns zero rather than uninitialized memory (see doc above).
    f16x32::ZERO.as_m512h()
}
319
/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castpd_ph(a: __m128d) -> __m128h {
    // SAFETY: both types are 128-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
331
/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    // SAFETY: both types are 256-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
343
/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    // SAFETY: both types are 512-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
355
/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castph_pd(a: __m128h) -> __m128d {
    // SAFETY: both types are 128-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
367
/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph_pd(a: __m256h) -> __m256d {
    // SAFETY: both types are 256-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
379
/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph_pd(a: __m512h) -> __m512d {
    // SAFETY: both types are 512-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
391
/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castps_ph(a: __m128) -> __m128h {
    // SAFETY: both types are 128-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
403
/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castps_ph(a: __m256) -> __m256h {
    // SAFETY: both types are 256-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
415
/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castps_ph(a: __m512) -> __m512h {
    // SAFETY: both types are 512-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
427
/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castph_ps(a: __m128h) -> __m128 {
    // SAFETY: both types are 128-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
439
/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph_ps(a: __m256h) -> __m256 {
    // SAFETY: both types are 256-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
451
/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph_ps(a: __m512h) -> __m512 {
    // SAFETY: both types are 512-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
463
/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    // SAFETY: both types are 128-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
475
/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    // SAFETY: both types are 256-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
487
/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    // SAFETY: both types are 512-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
499
/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_castph_si128(a: __m128h) -> __m128i {
    // SAFETY: both types are 128-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
511
/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph_si256(a: __m256h) -> __m256i {
    // SAFETY: both types are 256-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
523
/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph_si512(a: __m512h) -> __m512i {
    // SAFETY: both types are 512-bit SIMD vectors; this is a pure bit reinterpretation.
    unsafe { transmute(a) }
}
535
/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    // Keep only the low 8 lanes; the upper half of `a` is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
547
/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    // Keep only the low 8 lanes; the upper 24 lanes of `a` are discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}
559
/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    // Keep only the low 16 lanes; the upper half of `a` is discarded.
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
}
571
/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        // Indices 0-7 select the lanes of `a`; index 8 selects lane 0 of the second
        // operand (the "undefined" vector), filling the upper half of the result.
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}
590
/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        // Indices 0-7 select the lanes of `a`; index 8 selects lane 0 of the second
        // operand (the "undefined" vector), filling the upper 24 lanes of the result.
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}
612
/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        // Indices 0-15 select the lanes of `a`; index 16 selects lane 0 of the second
        // operand (the "undefined" vector), filling the upper half of the result.
        simd_shuffle!(
            a,
            _mm256_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}
634
/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        // Indices 0-7 select the lanes of `a`; index 8 selects lane 0 of the zero
        // vector, so the upper 8 lanes of the result are guaranteed to be 0.0.
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}
653
/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        // Indices 0-15 select the lanes of `a`; index 16 selects lane 0 of the zero
        // vector, so the upper 16 lanes of the result are guaranteed to be 0.0.
        simd_shuffle!(
            a,
            _mm256_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}
675
/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        // Indices 0-7 select the lanes of `a`; index 8 selects lane 0 of the zero
        // vector, so the upper 24 lanes of the result are guaranteed to be 0.0.
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}
697
// Expands to an inline-asm `vcmpph` comparison of two f16 vectors, producing a
// mask-register value of `$mask_type`. The comparison predicate is taken from a
// const generic `IMM5` that must be in scope at the expansion site.
// First arm: unmasked compare. Second arm: zero-masked compare using `$mask`.
macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            // pure + nomem: result depends only on register inputs, enabling CSE.
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            // `{k} {{ {mask} }}` applies the write-mask register to the compare.
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
}
725
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        // IMM5 is the comparison predicate (a `_CMP_*` constant, 0..=31).
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, xmm_reg, a, b)
    }
}
740
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        // IMM5 is the comparison predicate (a `_CMP_*` constant, 0..=31).
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
    }
}
756
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    unsafe {
        // IMM5 is the comparison predicate (one of the `_CMP_*` constants, 0..=31).
        static_assert_uimm_bits!(IMM5, 5);
        // 256-bit variant: 16 lanes -> 16-bit result mask.
        cmp_asm!(__mmask16, ymm_reg, a, b)
    }
}
771
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        // IMM5 is the comparison predicate (one of the `_CMP_*` constants, 0..=31).
        static_assert_uimm_bits!(IMM5, 5);
        // Masked 256-bit variant: `k1` is encoded as the instruction's zeromask.
        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
    }
}
791
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    unsafe {
        // IMM5 is the comparison predicate (one of the `_CMP_*` constants, 0..=31).
        static_assert_uimm_bits!(IMM5, 5);
        // 512-bit variant: 32 lanes -> 32-bit result mask. Needs no avx512vl.
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}
806
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        // IMM5 is the comparison predicate (one of the `_CMP_*` constants, 0..=31).
        static_assert_uimm_bits!(IMM5, 5);
        // Masked 512-bit variant: `k1` is encoded as the instruction's zeromask.
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}
826
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        // IMM5 is the comparison predicate; SAE must be a valid suppress-all-exceptions value.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            // Emit `vcmpph` with an explicit `{sae}` modifier to suppress FP exceptions.
            // (`{{sae}}` in the template escapes to a literal `{sae}` in the asm string.)
            let dst: __mmask32;
            asm!(
                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            // Any other SAE value: fall back to the plain (exception-reporting) encoding.
            cmp_asm!(__mmask32, zmm_reg, a, b)
        }
    }
}
860
/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        // IMM5 is the comparison predicate; SAE must be a valid suppress-all-exceptions value.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            // Masked `vcmpph` with `{sae}`: `k1` is encoded as the instruction's zeromask
            // and FP exceptions are suppressed.
            let dst: __mmask32;
            asm!(
                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                k1 = in(kreg) k1,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            // Any other SAE value: fall back to the plain (exception-reporting) encoding.
            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
        }
    }
}
897
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    // Delegate to the masked form with an all-ones mask (0xff = no masking).
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}
912
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        // IMM5 is the comparison predicate; SAE must be a valid suppress-all-exceptions value.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        // `vcmpsh` LLVM intrinsic (declared elsewhere in this file) does the scalar compare.
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}
933
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    // Non-rounding variant: use the current MXCSR rounding/exception behavior.
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
946
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    // Non-rounding variant: use the current MXCSR rounding/exception behavior.
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}
959
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        // IMM5 is the comparison predicate; SAE must be a valid suppress-all-exceptions value.
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        // `vcomish` LLVM intrinsic (declared elsewhere in this file) returns the 0/1 result.
        vcomish(a, b, IMM5, SAE)
    }
}
976
/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    // Non-rounding variant: use the current MXCSR rounding/exception behavior.
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}
989
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_EQ_OS: equal, ordered, signaling variant of the predicate.
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}
1000
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GE_OS: greater-than-or-equal, ordered, signaling variant of the predicate.
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}
1011
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GT_OS: greater-than, ordered, signaling variant of the predicate.
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}
1022
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LE_OS: less-than-or-equal, ordered, signaling variant of the predicate.
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}
1033
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LT_OS: less-than, ordered, signaling variant of the predicate.
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}
1044
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_NEQ_US: not-equal, unordered, signaling variant of the predicate.
    _mm_comi_sh::<_CMP_NEQ_US>(a, b)
}
1055
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_EQ_OQ: equal, ordered, quiet (non-signaling on QNaN) variant of the predicate.
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}
1066
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GE_OQ: greater-than-or-equal, ordered, quiet (non-signaling on QNaN) predicate.
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}
1077
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_GT_OQ: greater-than, ordered, quiet (non-signaling on QNaN) predicate.
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}
1088
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LE_OQ: less-than-or-equal, ordered, quiet (non-signaling on QNaN) predicate.
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}
1099
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_LT_OQ: less-than, ordered, quiet (non-signaling on QNaN) predicate.
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}
1110
/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    // _CMP_NEQ_UQ: not-equal, unordered, quiet (non-signaling on QNaN) predicate.
    _mm_comi_sh::<_CMP_NEQ_UQ>(a, b)
}
1121
1122/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
1123/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
1124///
1125/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
1126#[inline]
1127#[target_feature(enable = "avx512fp16,avx512vl")]
1128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1129#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1130pub const unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
1131    *mem_addr.cast()
1132}
1133
1134/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
1135/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
1136///
1137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
1138#[inline]
1139#[target_feature(enable = "avx512fp16,avx512vl")]
1140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1141#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1142pub const unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
1143    *mem_addr.cast()
1144}
1145
1146/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
1147/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
1148///
1149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
1150#[inline]
1151#[target_feature(enable = "avx512fp16")]
1152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1153#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1154pub const unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
1155    *mem_addr.cast()
1156}
1157
/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
/// and zero the upper elements
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    // `_mm_set_sh` (defined earlier in this file) places the scalar in lane 0 and
    // zeroes the remaining lanes.
    _mm_set_sh(*mem_addr)
}
1169
/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    // Seed `dst` with `src` so the masked `vmovsh` can merge: when mask bit 0 is
    // clear, lane 0 keeps the value from `src` (per the documented contract above).
    let mut dst = src;
    asm!(
        // NOTE(review): `vpl!` is a file-local macro that presumably appends the
        // `[{p}]`-style memory operand to the mnemonic — confirm against its definition.
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}
1188
/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    // Zero-masking form (`{z}` suffix): no merge source needed, so `dst` is a plain
    // output operand.
    let mut dst: __m128h;
    asm!(
        // NOTE(review): `vpl!` is a file-local macro that presumably appends the
        // `[{p}]`-style memory operand to the mnemonic — confirm against its definition.
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}
1207
1208/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
1209/// a new vector. The address does not need to be aligned to any particular boundary.
1210///
1211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
1212#[inline]
1213#[target_feature(enable = "avx512fp16,avx512vl")]
1214#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1215#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1216pub const unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
1217    ptr::read_unaligned(mem_addr.cast())
1218}
1219
1220/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
1221/// a new vector. The address does not need to be aligned to any particular boundary.
1222///
1223/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
1224#[inline]
1225#[target_feature(enable = "avx512fp16,avx512vl")]
1226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1227#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1228pub const unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
1229    ptr::read_unaligned(mem_addr.cast())
1230}
1231
1232/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
1233/// a new vector. The address does not need to be aligned to any particular boundary.
1234///
1235/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
1236#[inline]
1237#[target_feature(enable = "avx512fp16")]
1238#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1239#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1240pub const unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
1241    ptr::read_unaligned(mem_addr.cast())
1242}
1243
1244/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1245/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
1246/// 7 packed elements from a to the upper elements of dst.
1247///
1248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
1249#[inline]
1250#[target_feature(enable = "avx512fp16")]
1251#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1252#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1253pub const fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1254    unsafe {
1255        let mut mov: f16 = simd_extract!(src, 0);
1256        if (k & 1) != 0 {
1257            mov = simd_extract!(b, 0);
1258        }
1259        simd_insert!(a, 0, mov)
1260    }
1261}
1262
1263/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1264/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
1265/// elements from a to the upper elements of dst.
1266///
1267/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
1268#[inline]
1269#[target_feature(enable = "avx512fp16")]
1270#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1271#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1272pub const fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1273    unsafe {
1274        let mut mov: f16 = 0.;
1275        if (k & 1) != 0 {
1276            mov = simd_extract!(b, 0);
1277        }
1278        simd_insert!(a, 0, mov)
1279    }
1280}
1281
1282/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
1283/// and copy the upper 7 packed elements from a to the upper elements of dst.
1284///
1285/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
1286#[inline]
1287#[target_feature(enable = "avx512fp16")]
1288#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1289#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1290pub const fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
1291    unsafe {
1292        let mov: f16 = simd_extract!(b, 0);
1293        simd_insert!(a, 0, mov)
1294    }
1295}
1296
1297/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1298/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
1299///
1300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
1301#[inline]
1302#[target_feature(enable = "avx512fp16,avx512vl")]
1303#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1304#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1305pub const unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
1306    *mem_addr.cast() = a;
1307}
1308
1309/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1310/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
1311///
1312/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
1313#[inline]
1314#[target_feature(enable = "avx512fp16,avx512vl")]
1315#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1316#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1317pub const unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
1318    *mem_addr.cast() = a;
1319}
1320
1321/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1322/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
1323///
1324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
1325#[inline]
1326#[target_feature(enable = "avx512fp16")]
1327#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1328#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1329pub const unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
1330    *mem_addr.cast() = a;
1331}
1332
/// Store the lower half-precision (16-bit) floating-point element from a into memory.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
    // Write only lane 0; `mem_addr` must be valid for a 2-byte write.
    *mem_addr = simd_extract!(a, 0);
}
1343
/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
    // Masked store: memory is only written when mask bit 0 is set.
    asm!(
        // NOTE(review): `vps!` is a file-local macro that presumably builds the
        // store form `mnemonic [mem]{k}, src` — confirm against its definition.
        vps!("vmovdqu16", "{{{k}}}, {src}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        src = in(xmm_reg) a,
        options(nostack, preserves_flags)
    );
}
1359
1360/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1361/// The address does not need to be aligned to any particular boundary.
1362///
1363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
1364#[inline]
1365#[target_feature(enable = "avx512fp16,avx512vl")]
1366#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1367#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1368pub const unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
1369    ptr::write_unaligned(mem_addr.cast(), a);
1370}
1371
1372/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1373/// The address does not need to be aligned to any particular boundary.
1374///
1375/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
1376#[inline]
1377#[target_feature(enable = "avx512fp16,avx512vl")]
1378#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1379#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1380pub const unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
1381    ptr::write_unaligned(mem_addr.cast(), a);
1382}
1383
1384/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1385/// The address does not need to be aligned to any particular boundary.
1386///
1387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
1388#[inline]
1389#[target_feature(enable = "avx512fp16")]
1390#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1391#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1392pub const unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
1393    ptr::write_unaligned(mem_addr.cast(), a);
1394}
1395
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
    // Lane-wise addition; `assert_instr` checks this lowers to a single `vaddph`.
    unsafe { simd_add(a, b) }
}
1407
1408/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1409/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1410///
1411/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
1412#[inline]
1413#[target_feature(enable = "avx512fp16,avx512vl")]
1414#[cfg_attr(test, assert_instr(vaddph))]
1415#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1416#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1417pub const fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1418    unsafe {
1419        let r = _mm_add_ph(a, b);
1420        simd_select_bitmask(k, r, src)
1421    }
1422}
1423
1424/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1425/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1426///
1427/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
1428#[inline]
1429#[target_feature(enable = "avx512fp16,avx512vl")]
1430#[cfg_attr(test, assert_instr(vaddph))]
1431#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1432#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1433pub const fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1434    unsafe {
1435        let r = _mm_add_ph(a, b);
1436        simd_select_bitmask(k, r, _mm_setzero_ph())
1437    }
1438}
1439
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
    // Lane-wise add via the portable SIMD intrinsic; `assert_instr` above
    // checks this lowers to a single `vaddph`.
    unsafe { simd_add(a, b) }
}
1451
1452/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1453/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1454///
1455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1456#[inline]
1457#[target_feature(enable = "avx512fp16,avx512vl")]
1458#[cfg_attr(test, assert_instr(vaddph))]
1459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1460#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1461pub const fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1462    unsafe {
1463        let r = _mm256_add_ph(a, b);
1464        simd_select_bitmask(k, r, src)
1465    }
1466}
1467
1468/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1469/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1470///
1471/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1472#[inline]
1473#[target_feature(enable = "avx512fp16,avx512vl")]
1474#[cfg_attr(test, assert_instr(vaddph))]
1475#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1476#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1477pub const fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1478    unsafe {
1479        let r = _mm256_add_ph(a, b);
1480        simd_select_bitmask(k, r, _mm256_setzero_ph())
1481    }
1482}
1483
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
    // Lane-wise add via the portable SIMD intrinsic; `assert_instr` above
    // checks this lowers to a single `vaddph`.
    unsafe { simd_add(a, b) }
}
1495
1496/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1497/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1498///
1499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1500#[inline]
1501#[target_feature(enable = "avx512fp16")]
1502#[cfg_attr(test, assert_instr(vaddph))]
1503#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1504#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1505pub const fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1506    unsafe {
1507        let r = _mm512_add_ph(a, b);
1508        simd_select_bitmask(k, r, src)
1509    }
1510}
1511
1512/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1513/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1514///
1515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1516#[inline]
1517#[target_feature(enable = "avx512fp16")]
1518#[cfg_attr(test, assert_instr(vaddph))]
1519#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1520#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1521pub const fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1522    unsafe {
1523        let r = _mm512_add_ph(a, b);
1524        simd_select_bitmask(k, r, _mm512_setzero_ph())
1525    }
1526}
1527
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Delegates to the `vaddph` intrinsic taking an explicit rounding-mode immediate.
        vaddph(a, b, ROUNDING)
    }
}
1549
1550/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1551/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1552/// Rounding is done according to the rounding parameter, which can be one of:
1553///
1554/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1555/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1556/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1557/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1558/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1559///
1560/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1561#[inline]
1562#[target_feature(enable = "avx512fp16")]
1563#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1564#[rustc_legacy_const_generics(4)]
1565#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1566pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1567    src: __m512h,
1568    k: __mmask32,
1569    a: __m512h,
1570    b: __m512h,
1571) -> __m512h {
1572    unsafe {
1573        static_assert_rounding!(ROUNDING);
1574        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1575        simd_select_bitmask(k, r, src)
1576    }
1577}
1578
/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
        // Zero out lanes whose mask bit is clear.
        simd_select_bitmask(k, r, _mm512_setzero_ph())
    }
}
1605
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // All mask bits set: the merge source is never selected, so a zero
    // vector suffices as the `src` argument.
    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
1626
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // `vaddsh` implements the masked, round-controlled scalar add
        // described in the contract above.
        vaddsh(a, b, src, k, ROUNDING)
    }
}
1655
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // A zero merge source implements the zeromask semantics for lane 0.
    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
1677
/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
    // Scalar add of lane 0; `simd_insert!` keeps lanes 1..=7 of `a` intact.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
}
1690
1691/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1692/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1693/// writemask k (the element is copied from src when mask bit 0 is not set).
1694///
1695/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1696#[inline]
1697#[target_feature(enable = "avx512fp16")]
1698#[cfg_attr(test, assert_instr(vaddsh))]
1699#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1700#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1701pub const fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1702    unsafe {
1703        let extractsrc: f16 = simd_extract!(src, 0);
1704        let mut add: f16 = extractsrc;
1705        if (k & 0b00000001) != 0 {
1706            let extracta: f16 = simd_extract!(a, 0);
1707            let extractb: f16 = simd_extract!(b, 0);
1708            add = extracta + extractb;
1709        }
1710        simd_insert!(a, 0, add)
1711    }
1712}
1713
1714/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1715/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1716/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1717///
1718/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1719#[inline]
1720#[target_feature(enable = "avx512fp16")]
1721#[cfg_attr(test, assert_instr(vaddsh))]
1722#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1723#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1724pub const fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1725    unsafe {
1726        let mut add: f16 = 0.;
1727        if (k & 0b00000001) != 0 {
1728            let extracta: f16 = simd_extract!(a, 0);
1729            let extractb: f16 = simd_extract!(b, 0);
1730            add = extracta + extractb;
1731        }
1732        simd_insert!(a, 0, add)
1733    }
1734}
1735
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
    // Lane-wise subtract via the portable SIMD intrinsic; `assert_instr`
    // above checks this lowers to a single `vsubph`.
    unsafe { simd_sub(a, b) }
}
1747
1748/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1749/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1750///
1751/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1752#[inline]
1753#[target_feature(enable = "avx512fp16,avx512vl")]
1754#[cfg_attr(test, assert_instr(vsubph))]
1755#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1756#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1757pub const fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1758    unsafe {
1759        let r = _mm_sub_ph(a, b);
1760        simd_select_bitmask(k, r, src)
1761    }
1762}
1763
1764/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1765/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1766///
1767/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1768#[inline]
1769#[target_feature(enable = "avx512fp16,avx512vl")]
1770#[cfg_attr(test, assert_instr(vsubph))]
1771#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1772#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1773pub const fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1774    unsafe {
1775        let r = _mm_sub_ph(a, b);
1776        simd_select_bitmask(k, r, _mm_setzero_ph())
1777    }
1778}
1779
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
    // Lane-wise subtract via the portable SIMD intrinsic; `assert_instr`
    // above checks this lowers to a single `vsubph`.
    unsafe { simd_sub(a, b) }
}
1791
1792/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1793/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1794///
1795/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1796#[inline]
1797#[target_feature(enable = "avx512fp16,avx512vl")]
1798#[cfg_attr(test, assert_instr(vsubph))]
1799#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1800#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1801pub const fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1802    unsafe {
1803        let r = _mm256_sub_ph(a, b);
1804        simd_select_bitmask(k, r, src)
1805    }
1806}
1807
1808/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1809/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1810///
1811/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1812#[inline]
1813#[target_feature(enable = "avx512fp16,avx512vl")]
1814#[cfg_attr(test, assert_instr(vsubph))]
1815#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1816#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1817pub const fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1818    unsafe {
1819        let r = _mm256_sub_ph(a, b);
1820        simd_select_bitmask(k, r, _mm256_setzero_ph())
1821    }
1822}
1823
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
    // Lane-wise subtract via the portable SIMD intrinsic; `assert_instr`
    // above checks this lowers to a single `vsubph`.
    unsafe { simd_sub(a, b) }
}
1835
1836/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1837/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1838///
1839/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1840#[inline]
1841#[target_feature(enable = "avx512fp16")]
1842#[cfg_attr(test, assert_instr(vsubph))]
1843#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1844#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1845pub const fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1846    unsafe {
1847        let r = _mm512_sub_ph(a, b);
1848        simd_select_bitmask(k, r, src)
1849    }
1850}
1851
1852/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1853/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1854///
1855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1856#[inline]
1857#[target_feature(enable = "avx512fp16")]
1858#[cfg_attr(test, assert_instr(vsubph))]
1859#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1860#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1861pub const fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1862    unsafe {
1863        let r = _mm512_sub_ph(a, b);
1864        simd_select_bitmask(k, r, _mm512_setzero_ph())
1865    }
1866}
1867
/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Delegates to the `vsubph` intrinsic taking an explicit rounding-mode immediate.
        vsubph(a, b, ROUNDING)
    }
}
1889
1890/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1891/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1892/// Rounding is done according to the rounding parameter, which can be one of:
1893///
1894/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1895/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1896/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1897/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1898/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1899///
1900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1901#[inline]
1902#[target_feature(enable = "avx512fp16")]
1903#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1904#[rustc_legacy_const_generics(4)]
1905#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1906pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1907    src: __m512h,
1908    k: __mmask32,
1909    a: __m512h,
1910    b: __m512h,
1911) -> __m512h {
1912    unsafe {
1913        static_assert_rounding!(ROUNDING);
1914        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1915        simd_select_bitmask(k, r, src)
1916    }
1917}
1918
1919/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1920/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1921/// Rounding is done according to the rounding parameter, which can be one of:
1922///
1923/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1924/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1925/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1926/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1928///
1929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1930#[inline]
1931#[target_feature(enable = "avx512fp16")]
1932#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1933#[rustc_legacy_const_generics(3)]
1934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1935pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1936    k: __mmask32,
1937    a: __m512h,
1938    b: __m512h,
1939) -> __m512h {
1940    unsafe {
1941        static_assert_rounding!(ROUNDING);
1942        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1943        simd_select_bitmask(k, r, _mm512_setzero_ph())
1944    }
1945}
1946
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // All mask bits set: the merge source is never selected, so a zero
    // vector suffices as the `src` argument.
    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
1967
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // `vsubsh` implements the masked, round-controlled scalar subtract
        // described in the contract above.
        vsubsh(a, b, src, k, ROUNDING)
    }
}
1996
/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // A zero merge source implements the zeromask semantics for lane 0.
    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
2018
2019/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
2020/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2021///
2022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
    // Replace lane 0 of `a` with the scalar difference a[0] - b[0];
    // the upper 7 lanes of `a` are carried through unchanged.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
}
2031
2032/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
2033/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2034/// writemask k (the element is copied from src when mask bit 0 is not set).
2035///
2036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
2037#[inline]
2038#[target_feature(enable = "avx512fp16")]
2039#[cfg_attr(test, assert_instr(vsubsh))]
2040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2041#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2042pub const fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2043    unsafe {
2044        let extractsrc: f16 = simd_extract!(src, 0);
2045        let mut add: f16 = extractsrc;
2046        if (k & 0b00000001) != 0 {
2047            let extracta: f16 = simd_extract!(a, 0);
2048            let extractb: f16 = simd_extract!(b, 0);
2049            add = extracta - extractb;
2050        }
2051        simd_insert!(a, 0, add)
2052    }
2053}
2054
2055/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
2056/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2057/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2058///
2059/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
2060#[inline]
2061#[target_feature(enable = "avx512fp16")]
2062#[cfg_attr(test, assert_instr(vsubsh))]
2063#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2064#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2065pub const fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2066    unsafe {
2067        let mut add: f16 = 0.;
2068        if (k & 0b00000001) != 0 {
2069            let extracta: f16 = simd_extract!(a, 0);
2070            let extractb: f16 = simd_extract!(b, 0);
2071            add = extracta - extractb;
2072        }
2073        simd_insert!(a, 0, add)
2074    }
2075}
2076
2077/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2078///
2079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
    // Lane-wise product; codegen is asserted above to be a single `vmulph`.
    unsafe { simd_mul(a, b) }
}
2088
2089/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2090/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2091///
2092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
2093#[inline]
2094#[target_feature(enable = "avx512fp16,avx512vl")]
2095#[cfg_attr(test, assert_instr(vmulph))]
2096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2097#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2098pub const fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2099    unsafe {
2100        let r = _mm_mul_ph(a, b);
2101        simd_select_bitmask(k, r, src)
2102    }
2103}
2104
2105/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2106/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2107///
2108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
2109#[inline]
2110#[target_feature(enable = "avx512fp16,avx512vl")]
2111#[cfg_attr(test, assert_instr(vmulph))]
2112#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2113#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2114pub const fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2115    unsafe {
2116        let r = _mm_mul_ph(a, b);
2117        simd_select_bitmask(k, r, _mm_setzero_ph())
2118    }
2119}
2120
2121/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2122///
2123/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
    // Lane-wise product over all 16 half-precision lanes.
    unsafe { simd_mul(a, b) }
}
2132
2133/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2134/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2135///
2136/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2137#[inline]
2138#[target_feature(enable = "avx512fp16,avx512vl")]
2139#[cfg_attr(test, assert_instr(vmulph))]
2140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2141#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2142pub const fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2143    unsafe {
2144        let r = _mm256_mul_ph(a, b);
2145        simd_select_bitmask(k, r, src)
2146    }
2147}
2148
2149/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2150/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2151///
2152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2153#[inline]
2154#[target_feature(enable = "avx512fp16,avx512vl")]
2155#[cfg_attr(test, assert_instr(vmulph))]
2156#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2157#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2158pub const fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2159    unsafe {
2160        let r = _mm256_mul_ph(a, b);
2161        simd_select_bitmask(k, r, _mm256_setzero_ph())
2162    }
2163}
2164
2165/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2166///
2167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
    // Lane-wise product over all 32 half-precision lanes.
    unsafe { simd_mul(a, b) }
}
2176
2177/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2178/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2179///
2180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2181#[inline]
2182#[target_feature(enable = "avx512fp16")]
2183#[cfg_attr(test, assert_instr(vmulph))]
2184#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2185#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2186pub const fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2187    unsafe {
2188        let r = _mm512_mul_ph(a, b);
2189        simd_select_bitmask(k, r, src)
2190    }
2191}
2192
2193/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2194/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2195///
2196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2197#[inline]
2198#[target_feature(enable = "avx512fp16")]
2199#[cfg_attr(test, assert_instr(vmulph))]
2200#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2201#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2202pub const fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2203    unsafe {
2204        let r = _mm512_mul_ph(a, b);
2205        simd_select_bitmask(k, r, _mm512_setzero_ph())
2206    }
2207}
2208
2209/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2210/// Rounding is done according to the rounding parameter, which can be one of:
2211///
2212/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2213/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2214/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2215/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2216/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2217///
2218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2219#[inline]
2220#[target_feature(enable = "avx512fp16")]
2221#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2222#[rustc_legacy_const_generics(2)]
2223#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2224pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2225    unsafe {
2226        static_assert_rounding!(ROUNDING);
2227        vmulph(a, b, ROUNDING)
2228    }
2229}
2230
2231/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2232/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2233/// Rounding is done according to the rounding parameter, which can be one of:
2234///
2235/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2236/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2237/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2238/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2239/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2240///
2241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2242#[inline]
2243#[target_feature(enable = "avx512fp16")]
2244#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2245#[rustc_legacy_const_generics(4)]
2246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2247pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2248    src: __m512h,
2249    k: __mmask32,
2250    a: __m512h,
2251    b: __m512h,
2252) -> __m512h {
2253    unsafe {
2254        static_assert_rounding!(ROUNDING);
2255        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2256        simd_select_bitmask(k, r, src)
2257    }
2258}
2259
2260/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2261/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2262/// Rounding is done according to the rounding parameter, which can be one of:
2263///
2264/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2265/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2266/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2267/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2268/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2269///
2270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2271#[inline]
2272#[target_feature(enable = "avx512fp16")]
2273#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2274#[rustc_legacy_const_generics(3)]
2275#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2276pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2277    k: __mmask32,
2278    a: __m512h,
2279    b: __m512h,
2280) -> __m512h {
2281    unsafe {
2282        static_assert_rounding!(ROUNDING);
2283        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2284        simd_select_bitmask(k, r, _mm512_setzero_ph())
2285    }
2286}
2287
2288/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2289/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2290/// Rounding is done according to the rounding parameter, which can be one of:
2291///
2292/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2293/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2294/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2295/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2296/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2297///
2298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Compile-time validation of the rounding-mode constant.
    static_assert_rounding!(ROUNDING);
    // All-ones mask: lane 0 always receives the product, so the zero source
    // vector is never observed in the result.
    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
2308
2309/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2310/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2311/// writemask k (the element is copied from src when mask bit 0 is not set).
2312/// Rounding is done according to the rounding parameter, which can be one of:
2313///
2314/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2315/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2316/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2317/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2318/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2319///
2320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2321#[inline]
2322#[target_feature(enable = "avx512fp16")]
2323#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2324#[rustc_legacy_const_generics(4)]
2325#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2326pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2327    src: __m128h,
2328    k: __mmask8,
2329    a: __m128h,
2330    b: __m128h,
2331) -> __m128h {
2332    unsafe {
2333        static_assert_rounding!(ROUNDING);
2334        vmulsh(a, b, src, k, ROUNDING)
2335    }
2336}
2337
2338/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2339/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2340/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2341/// Rounding is done according to the rounding parameter, which can be one of:
2342///
2343/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2344/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2345/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2346/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2347/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2348///
2349/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Compile-time validation of the rounding-mode constant.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is the write-masked form with an all-zero source vector.
    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
2359
2360/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2361/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2362///
2363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
    // Replace lane 0 of `a` with the scalar product a[0] * b[0];
    // the upper 7 lanes of `a` are carried through unchanged.
    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
}
2372
2373/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2374/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2375/// writemask k (the element is copied from src when mask bit 0 is not set).
2376///
2377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2378#[inline]
2379#[target_feature(enable = "avx512fp16")]
2380#[cfg_attr(test, assert_instr(vmulsh))]
2381#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2382#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2383pub const fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2384    unsafe {
2385        let extractsrc: f16 = simd_extract!(src, 0);
2386        let mut add: f16 = extractsrc;
2387        if (k & 0b00000001) != 0 {
2388            let extracta: f16 = simd_extract!(a, 0);
2389            let extractb: f16 = simd_extract!(b, 0);
2390            add = extracta * extractb;
2391        }
2392        simd_insert!(a, 0, add)
2393    }
2394}
2395
2396/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2397/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2398/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2399///
2400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2401#[inline]
2402#[target_feature(enable = "avx512fp16")]
2403#[cfg_attr(test, assert_instr(vmulsh))]
2404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2405#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2406pub const fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2407    unsafe {
2408        let mut add: f16 = 0.;
2409        if (k & 0b00000001) != 0 {
2410            let extracta: f16 = simd_extract!(a, 0);
2411            let extractb: f16 = simd_extract!(b, 0);
2412            add = extracta * extractb;
2413        }
2414        simd_insert!(a, 0, add)
2415    }
2416}
2417
2418/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2419///
2420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vdivph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
    // Lane-wise quotient a / b; codegen is asserted to be a single `vdivph`.
    unsafe { simd_div(a, b) }
}
2429
2430/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2431/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2432///
2433/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2434#[inline]
2435#[target_feature(enable = "avx512fp16,avx512vl")]
2436#[cfg_attr(test, assert_instr(vdivph))]
2437#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2438#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2439pub const fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2440    unsafe {
2441        let r = _mm_div_ph(a, b);
2442        simd_select_bitmask(k, r, src)
2443    }
2444}
2445
2446/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2447/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2448///
2449/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2450#[inline]
2451#[target_feature(enable = "avx512fp16,avx512vl")]
2452#[cfg_attr(test, assert_instr(vdivph))]
2453#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2454#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2455pub const fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2456    unsafe {
2457        let r = _mm_div_ph(a, b);
2458        simd_select_bitmask(k, r, _mm_setzero_ph())
2459    }
2460}
2461
2462/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2463///
2464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vdivph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
    // Lane-wise quotient a / b over all 16 half-precision lanes.
    unsafe { simd_div(a, b) }
}
2473
2474/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2475/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2476///
2477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2478#[inline]
2479#[target_feature(enable = "avx512fp16,avx512vl")]
2480#[cfg_attr(test, assert_instr(vdivph))]
2481#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2482#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2483pub const fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2484    unsafe {
2485        let r = _mm256_div_ph(a, b);
2486        simd_select_bitmask(k, r, src)
2487    }
2488}
2489
2490/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2491/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2492///
2493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2494#[inline]
2495#[target_feature(enable = "avx512fp16,avx512vl")]
2496#[cfg_attr(test, assert_instr(vdivph))]
2497#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2498#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2499pub const fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2500    unsafe {
2501        let r = _mm256_div_ph(a, b);
2502        simd_select_bitmask(k, r, _mm256_setzero_ph())
2503    }
2504}
2505
2506/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2507///
2508/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
    // Lane-wise quotient a / b over all 32 half-precision lanes.
    unsafe { simd_div(a, b) }
}
2517
2518/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2519/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2520///
2521/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2522#[inline]
2523#[target_feature(enable = "avx512fp16")]
2524#[cfg_attr(test, assert_instr(vdivph))]
2525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2526#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2527pub const fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2528    unsafe {
2529        let r = _mm512_div_ph(a, b);
2530        simd_select_bitmask(k, r, src)
2531    }
2532}
2533
2534/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2535/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2536///
2537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2538#[inline]
2539#[target_feature(enable = "avx512fp16")]
2540#[cfg_attr(test, assert_instr(vdivph))]
2541#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2542#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2543pub const fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2544    unsafe {
2545        let r = _mm512_div_ph(a, b);
2546        simd_select_bitmask(k, r, _mm512_setzero_ph())
2547    }
2548}
2549
2550/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2551/// Rounding is done according to the rounding parameter, which can be one of:
2552///
2553/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2554/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2555/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2556/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2557/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2558///
2559/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
2560#[inline]
2561#[target_feature(enable = "avx512fp16")]
2562#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2563#[rustc_legacy_const_generics(2)]
2564#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2565pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2566    unsafe {
2567        static_assert_rounding!(ROUNDING);
2568        vdivph(a, b, ROUNDING)
2569    }
2570}
2571
2572/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2573/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2574/// Rounding is done according to the rounding parameter, which can be one of:
2575///
2576/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2577/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2578/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2579/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2580/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2581///
2582/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2583#[inline]
2584#[target_feature(enable = "avx512fp16")]
2585#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2586#[rustc_legacy_const_generics(4)]
2587#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2588pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2589    src: __m512h,
2590    k: __mmask32,
2591    a: __m512h,
2592    b: __m512h,
2593) -> __m512h {
2594    unsafe {
2595        static_assert_rounding!(ROUNDING);
2596        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2597        simd_select_bitmask(k, r, src)
2598    }
2599}
2600
/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Divide every lane first, then zero out the lanes whose mask bit is clear.
        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
        simd_select_bitmask(k, r, _mm512_setzero_ph())
    }
}
2628
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask 0xff has bit 0 set, so the zero src placeholder is never selected.
    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
2649
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // vdivsh (declared elsewhere in this module) performs the masked scalar
        // division with the requested rounding, merging from src on a clear mask bit.
        vdivsh(a, b, src, k, ROUNDING)
    }
}
2678
/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Reuse the mask-merge form with a zero src: a clear mask bit 0 then yields 0.
    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
2700
2701/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2702/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2703///
2704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
2705#[inline]
2706#[target_feature(enable = "avx512fp16")]
2707#[cfg_attr(test, assert_instr(vdivsh))]
2708#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2709#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2710pub const fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2711    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
2712}
2713
2714/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2715/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2716/// writemask k (the element is copied from src when mask bit 0 is not set).
2717///
2718/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2719#[inline]
2720#[target_feature(enable = "avx512fp16")]
2721#[cfg_attr(test, assert_instr(vdivsh))]
2722#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2723#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2724pub const fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2725    unsafe {
2726        let extractsrc: f16 = simd_extract!(src, 0);
2727        let mut add: f16 = extractsrc;
2728        if (k & 0b00000001) != 0 {
2729            let extracta: f16 = simd_extract!(a, 0);
2730            let extractb: f16 = simd_extract!(b, 0);
2731            add = extracta / extractb;
2732        }
2733        simd_insert!(a, 0, add)
2734    }
2735}
2736
2737/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
2738/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2739/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2740///
2741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2742#[inline]
2743#[target_feature(enable = "avx512fp16")]
2744#[cfg_attr(test, assert_instr(vdivsh))]
2745#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2746#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
2747pub const fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2748    unsafe {
2749        let mut add: f16 = 0.;
2750        if (k & 0b00000001) != 0 {
2751            let extracta: f16 = simd_extract!(a, 0);
2752            let extractb: f16 = simd_extract!(b, 0);
2753            add = extracta / extractb;
2754        }
2755        simd_insert!(a, 0, add)
2756    }
2757}
2758
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
    // With an all-ones mask every element is written, so the undefined src is never observed.
    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
}
2771
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // vfmulcph_128 (declared elsewhere in this module) does the masked complex multiply;
    // the transmutes only reinterpret between __m128h and the type the intrinsic is declared with.
    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
}
2784
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero src: elements with a clear mask bit come out as 0.
    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
}
2797
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
    // With an all-ones mask every element is written, so the undefined src is never observed.
    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
}
2810
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // vfmulcph_256 (declared elsewhere in this module) does the masked complex multiply;
    // the transmutes only reinterpret between __m256h and the type the intrinsic is declared with.
    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
}
2823
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // Zero src: elements with a clear mask bit come out as 0.
    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
}
2836
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
    // With an all-ones mask every element is written, so the undefined src is never observed.
    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
}
2849
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Reuse the rounding form with the current MXCSR rounding mode.
    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
2862
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Zero src: elements with a clear mask bit come out as 0.
    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
}
2875
/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // With an all-ones mask every element is written, so the undefined src is never observed.
    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
}
2898
/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // vfmulcph_512 (declared elsewhere in this module) does the masked, rounded complex
        // multiply; the transmutes only reinterpret between __m512h and its declared type.
        transmute(vfmulcph_512(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
2934
/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // Zero src: elements with a clear mask bit come out as 0.
    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}
2961
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
    // Mask 0xff has bit 0 set, so the zero src placeholder is never selected.
    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
2975
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Reuse the rounding form with the current MXCSR rounding mode.
    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
2989
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero src: a clear mask bit 0 yields 0 in the lower complex pair.
    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b)
}
3003
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Mask 0xff has bit 0 set, so the zero src placeholder is never selected.
    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
3027
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // vfmulcsh (declared elsewhere in this module) does the masked, rounded scalar complex
        // multiply; the transmutes only reinterpret between __m128h and its declared type.
        transmute(vfmulcsh(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
3064
/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero src: a clear mask bit 0 yields 0 in the lower complex pair.
    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
3092
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
    // Thin alias of _mm_mul_pch; Intel documents both names for the same operation.
    _mm_mul_pch(a, b)
}
3105
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Thin alias of _mm_mask_mul_pch; Intel documents both names for the same operation.
    _mm_mask_mul_pch(src, k, a, b)
}
3118
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Thin alias of _mm_maskz_mul_pch; Intel documents both names for the same operation.
    _mm_maskz_mul_pch(k, a, b)
}
3131
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
    // Thin alias of _mm256_mul_pch; Intel documents both names for the same operation.
    _mm256_mul_pch(a, b)
}
3144
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // Thin alias of _mm256_mask_mul_pch; Intel documents both names for the same operation.
    _mm256_mask_mul_pch(src, k, a, b)
}
3157
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // Thin alias of _mm256_maskz_mul_pch; Intel documents both names for the same operation.
    _mm256_maskz_mul_pch(k, a, b)
}
3170
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
    // Thin alias of _mm512_mul_pch; Intel documents both names for the same operation.
    _mm512_mul_pch(a, b)
}
3182
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fmul_pch` is a spec-mandated alias of `mul_pch`; forward to the canonical implementation.
    _mm512_mask_mul_pch(src, k, a, b)
}
3195
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fmul_pch` is a spec-mandated alias of `mul_pch`; forward to the canonical implementation.
    _mm512_maskz_mul_pch(k, a, b)
}
3208
/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    // Reject invalid rounding-control values at compile time, then forward to the
    // canonical `mul_round` implementation (`fmul` is a spec-mandated alias).
    static_assert_rounding!(ROUNDING);
    _mm512_mul_round_pch::<ROUNDING>(a, b)
}
3229
/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding-control values at compile time, then forward to the
    // canonical `mul_round` implementation (`fmul` is a spec-mandated alias).
    static_assert_rounding!(ROUNDING);
    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
}
3256
/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    // Reject invalid rounding-control values at compile time, then forward to the
    // canonical `mul_round` implementation (`fmul` is a spec-mandated alias).
    static_assert_rounding!(ROUNDING);
    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
}
3282
/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // `fmul_sch` is a spec-mandated alias of `mul_sch`; forward to the canonical implementation.
    _mm_mul_sch(a, b)
}
3295
/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul_sch` is a spec-mandated alias of `mul_sch`; forward to the canonical implementation.
    _mm_mask_mul_sch(src, k, a, b)
}
3308
/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fmul_sch` is a spec-mandated alias of `mul_sch`; forward to the canonical implementation.
    _mm_maskz_mul_sch(k, a, b)
}
3321
/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Reject invalid rounding-control values at compile time, then forward to the
    // canonical `mul_round` implementation (`fmul` is a spec-mandated alias).
    static_assert_rounding!(ROUNDING);
    _mm_mul_round_sch::<ROUNDING>(a, b)
}
3343
/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding-control values at compile time, then forward to the
    // canonical `mul_round` implementation (`fmul` is a spec-mandated alias).
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
}
3371
/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Reject invalid rounding-control values at compile time, then forward to the
    // canonical `mul_round` implementation (`fmul` is a spec-mandated alias).
    static_assert_rounding!(ROUNDING);
    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
}
3398
3399/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3400/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3401/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3402/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3403///
3404/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
3405#[inline]
3406#[target_feature(enable = "avx512fp16,avx512vl")]
3407#[cfg_attr(test, assert_instr(vfcmulcph))]
3408#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3409pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
3410    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
3411}
3412
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: the transmutes only bit-cast between `__m128h` and the 128-bit vector
    // type expected/returned by the `vfcmulcph_128` compiler intrinsic
    // (NOTE(review): layout compatibility assumed, matching the other vfcmulc wrappers here).
    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
}
3426
3427/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3428/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3429/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3430/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3431///
3432/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
3433#[inline]
3434#[target_feature(enable = "avx512fp16,avx512vl")]
3435#[cfg_attr(test, assert_instr(vfcmulcph))]
3436#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3437pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3438    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
3439}
3440
3441/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3442/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3443/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3444/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3445///
3446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
3447#[inline]
3448#[target_feature(enable = "avx512fp16,avx512vl")]
3449#[cfg_attr(test, assert_instr(vfcmulcph))]
3450#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3451pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
3452    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
3453}
3454
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: the transmutes only bit-cast between `__m256h` and the 256-bit vector
    // type expected/returned by the `vfcmulcph_256` compiler intrinsic
    // (NOTE(review): layout compatibility assumed, matching the other vfcmulc wrappers here).
    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
}
3468
3469/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3470/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3471/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3472/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3473///
3474/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
3475#[inline]
3476#[target_feature(enable = "avx512fp16,avx512vl")]
3477#[cfg_attr(test, assert_instr(vfcmulcph))]
3478#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3479pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3480    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
3481}
3482
3483/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3484/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3485/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3486/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3487///
3488/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
3489#[inline]
3490#[target_feature(enable = "avx512fp16")]
3491#[cfg_attr(test, assert_instr(vfcmulcph))]
3492#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3493pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
3494    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
3495}
3496
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // Delegate to the rounding variant, selecting the current MXCSR-directed rounding mode.
    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
3510
3511/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3512/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3513/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3514/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3515///
3516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
3517#[inline]
3518#[target_feature(enable = "avx512fp16")]
3519#[cfg_attr(test, assert_instr(vfcmulcph))]
3520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3521pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3522    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
3523}
3524
3525/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3526/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3527/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3528/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3529///
3530/// Rounding is done according to the rounding parameter, which can be one of:
3531///
3532/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3533/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3534/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3535/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3536/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3537///
3538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
3539#[inline]
3540#[target_feature(enable = "avx512fp16")]
3541#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3542#[rustc_legacy_const_generics(2)]
3543#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3544pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3545    static_assert_rounding!(ROUNDING);
3546    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
3547}
3548
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-control values at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: the transmutes only bit-cast between `__m512h` and the 512-bit vector
        // type expected/returned by the `vfcmulcph_512` compiler intrinsic
        // (NOTE(review): layout compatibility assumed, matching the other vfcmulc wrappers here).
        transmute(vfcmulcph_512(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
3585
3586/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3587/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3588/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3589/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3590///
3591/// Rounding is done according to the rounding parameter, which can be one of:
3592///
3593/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3594/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3595/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3596/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3597/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3598///
3599/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
3600#[inline]
3601#[target_feature(enable = "avx512fp16")]
3602#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3603#[rustc_legacy_const_generics(3)]
3604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3605pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
3606    k: __mmask16,
3607    a: __m512h,
3608    b: __m512h,
3609) -> __m512h {
3610    static_assert_rounding!(ROUNDING);
3611    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
3612}
3613
3614/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3615/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3616/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3617///
3618/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
3619#[inline]
3620#[target_feature(enable = "avx512fp16")]
3621#[cfg_attr(test, assert_instr(vfcmulcsh))]
3622#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3623pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
3624    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
3625}
3626
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the rounding variant, selecting the current MXCSR-directed rounding mode.
    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
3640
3641/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3642/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3643/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3644/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3645///
3646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
3647#[inline]
3648#[target_feature(enable = "avx512fp16")]
3649#[cfg_attr(test, assert_instr(vfcmulcsh))]
3650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3651pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3652    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b)
3653}
3654
3655/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3656/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3657/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3658///
3659/// Rounding is done according to the rounding parameter, which can be one of:
3660///
3661/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3662/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3663/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3664/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3665/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3666///
3667/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
3668#[inline]
3669#[target_feature(enable = "avx512fp16")]
3670#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3671#[rustc_legacy_const_generics(2)]
3672#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3673pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3674    static_assert_rounding!(ROUNDING);
3675    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
3676}
3677
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-control values at compile time.
        static_assert_rounding!(ROUNDING);
        // SAFETY: the transmutes only bit-cast between `__m128h` and the 128-bit vector
        // type expected/returned by the `vfcmulcsh` compiler intrinsic
        // (NOTE(review): layout compatibility assumed, matching the other vfcmulc wrappers here).
        transmute(vfcmulcsh(
            transmute(a),
            transmute(b),
            transmute(src),
            k,
            ROUNDING,
        ))
    }
}
3714
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zero-masking is implemented as merge-masking with an all-zero source vector.
    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
3742
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_cmul_pch(a, b)
}
3756
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_mask_cmul_pch(src, k, a, b)
}
3770
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_maskz_cmul_pch(k, a, b)
}
3784
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm256_cmul_pch(a, b)
}
3798
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm256_mask_cmul_pch(src, k, a, b)
}
3812
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm256_maskz_cmul_pch(k, a, b)
}
3826
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm512_cmul_pch(a, b)
}
3840
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm512_mask_cmul_pch(src, k, a, b)
}
3854
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm512_maskz_cmul_pch(k, a, b)
}
3868
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm512_cmul_round_pch::<ROUNDING>(a, b)
}
3891
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
}
3920
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
}
3948
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_cmul_sch(a, b)
}
3962
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_mask_cmul_sch(src, k, a, b)
}
3976
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_maskz_cmul_sch(k, a, b)
}
3990
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_cmul_round_sch::<ROUNDING>(a, b)
}
4013
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
}
4042
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // `fcmul` is Intel's alternative spelling of `cmul`; forward to the canonical form.
    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
}
4070
/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_abs_ph(v2: __m128h) -> __m128h {
    // ANDing each 16-bit lane with i16::MAX (0x7FFF) clears the IEEE sign bit,
    // which yields |x| for every f16 value without any floating-point operation.
    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
}
4082
/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_abs_ph(v2: __m256h) -> __m256h {
    // ANDing each 16-bit lane with i16::MAX (0x7FFF) clears the IEEE sign bit,
    // which yields |x| for every f16 value without any floating-point operation.
    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
}
4094
/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_abs_ph(v2: __m512h) -> __m512h {
    // ANDing each 16-bit lane with i16::MAX (0x7FFF) clears the IEEE sign bit,
    // which yields |x| for every f16 value without any floating-point operation.
    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
}
4106
/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_conj_pch(a: __m128h) -> __m128h {
    // Each complex number occupies one 32-bit lane (real f16 in the low half,
    // imaginary f16 in the high half). XORing the lane with i32::MIN (0x8000_0000)
    // flips only the sign bit of the imaginary part, i.e. negates it.
    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
}
4120
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    unsafe {
        // View the data as 32-bit (f32) lanes so each mask bit selects one whole
        // complex number (a real/imaginary f16 pair) between the result and `src`.
        let r: __m128 = transmute(_mm_conj_pch(a));
        transmute(simd_select_bitmask(k, r, transmute(src)))
    }
}
4137
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
    // Zero-masking is implemented as merge-masking with an all-zero source vector.
    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
}
4151
/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_conj_pch(a: __m256h) -> __m256h {
    // Each complex number occupies one 32-bit lane (real f16 in the low half,
    // imaginary f16 in the high half). XORing the lane with i32::MIN (0x8000_0000)
    // flips only the sign bit of the imaginary part, i.e. negates it.
    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
}
4164
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
    unsafe {
        // View the data as 32-bit (f32) lanes so each mask bit selects one whole
        // complex number (a real/imaginary f16 pair) between the result and `src`.
        let r: __m256 = transmute(_mm256_conj_pch(a));
        transmute(simd_select_bitmask(k, r, transmute(src)))
    }
}
4181
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
    // Zero-masking is implemented as merge-masking with an all-zero source vector.
    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
}
4195
/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_conj_pch(a: __m512h) -> __m512h {
    // Each complex number occupies one 32-bit lane (real f16 in the low half,
    // imaginary f16 in the high half). XORing the lane with i32::MIN (0x8000_0000)
    // flips only the sign bit of the imaginary part, i.e. negates it.
    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
}
4208
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
    unsafe {
        // View the data as 32-bit (f32) lanes so each mask bit selects one whole
        // complex number (a real/imaginary f16 pair) between the result and `src`.
        let r: __m512 = transmute(_mm512_conj_pch(a));
        transmute(simd_select_bitmask(k, r, transmute(src)))
    }
}
4225
/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
    // Zero-masking is implemented as merge-masking with an all-zero source vector.
    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
}
4239
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // An all-ones mask makes the merge-masked form behave as the plain,
    // unmasked operation.
    _mm_mask3_fmadd_pch(a, b, c, 0xff)
}
4252
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // View the result as f32 lanes so each mask bit selects one whole
        // complex number (a 32-bit f16 pair) from either the result or `a`.
        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4269
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // Forward directly to the LLVM intrinsic, which performs the
        // mask-merge from `c` itself.
        transmute(vfmaddcph_mask3_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4290
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Forward directly to the zero-masking LLVM intrinsic.
        transmute(vfmaddcph_maskz_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4311
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // All 8 mask bits set (one per complex number) makes the merge-masked
    // form behave as the plain, unmasked operation.
    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
}
4324
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // View the result as f32 lanes so each mask bit selects one whole
        // complex number (a 32-bit f16 pair) from either the result or `a`.
        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4341
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
    unsafe {
        // Forward directly to the LLVM intrinsic, which performs the
        // mask-merge from `c` itself.
        transmute(vfmaddcph_mask3_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4362
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // Forward directly to the zero-masking LLVM intrinsic.
        transmute(vfmaddcph_maskz_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4383
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4396
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4410
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4424
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4438
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // All 16 mask bits set (one per complex number) makes the merge-masked
    // form behave as the plain, unmasked operation.
    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}
4461
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask16,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // View the result as 16 x f32 so each mask bit selects one whole
        // complex number (a 32-bit f16 pair) from either the result or `a`.
        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4493
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask16,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Forward directly to the LLVM intrinsic, which performs the
        // mask-merge from `c` itself.
        transmute(vfmaddcph_mask3_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4530
/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Forward directly to the zero-masking LLVM intrinsic.
        transmute(vfmaddcph_maskz_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4567
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4581
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4596
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from c to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    // Note: the upper elements of dst come from `c` (the rounded form ends
    // with `_mm_move_ss(c, r)`), not from `a`.
    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4611
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Same as the rounded form, using the current MXCSR rounding mode.
    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4626
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // An all-ones mask makes the masked intrinsic behave as the plain,
        // unmasked operation.
        transmute(vfmaddcsh_mask(
            transmute(a),
            transmute(b),
            transmute(c),
            0xff,
            ROUNDING,
        ))
    }
}
4657
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        let a = transmute(a);
        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what CLang does
        // Merge: mask bit 0 chooses between the 32-bit complex result in `r`
        // and the lower element of `a`; the upper elements come from `a`.
        transmute(_mm_mask_move_ss(a, k, a, r))
    }
}
4691
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from c to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        let c = transmute(c);
        // The intrinsic merges the lower complex result from `c` when mask
        // bit 0 is clear.
        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
        // `_mm_move_ss(c, r)` keeps the 32-bit complex result from `r` in the
        // lowest lane and takes the upper 96 bits (6 f16 elements) from `c`.
        transmute(_mm_move_ss(c, r))
    }
}
4725
/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Forward directly to the zero-masking LLVM intrinsic.
        transmute(vfmaddcsh_maskz(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
4763
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // An all-ones mask makes the merge-masked form behave as the plain,
    // unmasked operation.
    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
}
4777
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // View the result as f32 lanes so each mask bit selects one whole
        // complex number (a 32-bit f16 pair) from either the result or `a`.
        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4795
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // Forward directly to the LLVM intrinsic, which performs the
        // mask-merge from `c` itself.
        transmute(vfcmaddcph_mask3_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4817
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // SAFETY: the transmutes reinterpret between 128-bit SIMD vector types
        // of identical size; zero-masking is done by the intrinsic itself.
        transmute(vfcmaddcph_maskz_128(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4839
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Unmasked form: delegate to the mask3 variant with an all-ones mask
    // (8 complex lanes in a 256-bit vector).
    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
}
4853
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // Compute via the mask3 form, then blend against `a` (merge-from-a
        // variant). Viewing the result as `__m256` makes the bitmask select
        // operate per 32-bit lane, i.e. per complex number.
        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
4871
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
    unsafe {
        // SAFETY: the transmutes reinterpret between 256-bit SIMD vector types
        // of identical size; merge-from-c masking is done by the intrinsic.
        transmute(vfcmaddcph_mask3_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4893
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    unsafe {
        // SAFETY: the transmutes reinterpret between 256-bit SIMD vector types
        // of identical size; zero-masking is done by the intrinsic itself.
        transmute(vfcmaddcph_maskz_256(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
        ))
    }
}
4915
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
4929
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
4944
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
4959
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
4974
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Reject invalid rounding-mode constants at compile time, then delegate to
    // the mask3 variant with an all-ones mask (16 complex lanes).
    static_assert_rounding!(ROUNDING);
    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}
4998
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask16,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Compute via the mask3 form, then blend against `a` (merge-from-a
        // variant). Viewing the result as `__m512` makes the bitmask select
        // operate per 32-bit lane, i.e. per complex number.
        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
        transmute(simd_select_bitmask(k, r, transmute(a)))
    }
}
5031
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask16,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: the transmutes reinterpret between 512-bit SIMD vector types
        // of identical size; merge-from-c masking is done by the intrinsic.
        transmute(vfcmaddcph_mask3_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
5069
/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: the transmutes reinterpret between 512-bit SIMD vector types
        // of identical size; zero-masking is done by the intrinsic itself.
        transmute(vfcmaddcph_maskz_512(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
5107
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}
5122
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}
5138
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}
5154
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Delegate to the rounding variant with the current MXCSR rounding mode.
    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
5170
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // Unmasked form: call the masked intrinsic with an all-ones mask.
        // SAFETY: the transmutes reinterpret between 128-bit SIMD vector types
        // of identical size.
        transmute(vfcmaddcsh_mask(
            transmute(a),
            transmute(b),
            transmute(c),
            0xff,
            ROUNDING,
        ))
    }
}
5203
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        let a = transmute(a);
        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
        // Merge the lower 32-bit element (one complex number) from the result
        // into `a` under mask bit 0; the upper elements are taken from `a`.
        transmute(_mm_mask_move_ss(a, k, a, r))
    }
}
5238
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        let c = transmute(c);
        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
        // Take the (already masked-against-c) lower 32-bit element from the
        // result while keeping the upper elements of `c`.
        transmute(_mm_move_ss(c, r))
    }
}
5273
/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        // SAFETY: the transmutes reinterpret between 128-bit SIMD vector types
        // of identical size; zero-masking is done by the intrinsic itself.
        transmute(vfcmaddcsh_maskz(
            transmute(a),
            transmute(b),
            transmute(c),
            k,
            ROUNDING,
        ))
    }
}
5312
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Elementwise fused multiply-add (a * b + c) with a single rounding step.
    unsafe { simd_fma(a, b, c) }
}
5325
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Per-element select: FMA result where the mask bit is set, `a` otherwise.
    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
}
5339
5340/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5341/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5342/// from c when the corresponding mask bit is not set).
5343///
5344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5345#[inline]
5346#[target_feature(enable = "avx512fp16,avx512vl")]
5347#[cfg_attr(test, assert_instr(vfmadd))]
5348#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5349#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5350pub const fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5351    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5352}
5353
5354/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5355/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5356/// out when the corresponding mask bit is not set).
5357///
5358/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
5359#[inline]
5360#[target_feature(enable = "avx512fp16,avx512vl")]
5361#[cfg_attr(test, assert_instr(vfmadd))]
5362#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5363#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5364pub const fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5365    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5366}
5367
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Lane-wise fused multiply-add: (a * b) + c with a single rounding step per lane.
    unsafe { simd_fma(a, b, c) }
}
5380
5381/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5382/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5383/// from a when the corresponding mask bit is not set).
5384///
5385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
5386#[inline]
5387#[target_feature(enable = "avx512fp16,avx512vl")]
5388#[cfg_attr(test, assert_instr(vfmadd))]
5389#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5390#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5391pub const fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5392    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
5393}
5394
5395/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5396/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5397/// from c when the corresponding mask bit is not set).
5398///
5399/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
5400#[inline]
5401#[target_feature(enable = "avx512fp16,avx512vl")]
5402#[cfg_attr(test, assert_instr(vfmadd))]
5403#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5404#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5405pub const fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5406    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
5407}
5408
5409/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5410/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5411/// out when the corresponding mask bit is not set).
5412///
5413/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
5414#[inline]
5415#[target_feature(enable = "avx512fp16,avx512vl")]
5416#[cfg_attr(test, assert_instr(vfmadd))]
5417#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5418#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5419pub const fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5420    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
5421}
5422
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Lane-wise fused multiply-add: (a * b) + c with a single rounding step per lane.
    unsafe { simd_fma(a, b, c) }
}
5435
5436/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5437/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5438/// from a when the corresponding mask bit is not set).
5439///
5440/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
5441#[inline]
5442#[target_feature(enable = "avx512fp16")]
5443#[cfg_attr(test, assert_instr(vfmadd))]
5444#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5445#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5446pub const fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5447    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
5448}
5449
5450/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5451/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5452/// from c when the corresponding mask bit is not set).
5453///
5454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
5455#[inline]
5456#[target_feature(enable = "avx512fp16")]
5457#[cfg_attr(test, assert_instr(vfmadd))]
5458#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5459#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5460pub const fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5461    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
5462}
5463
5464/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5465/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5466/// out when the corresponding mask bit is not set).
5467///
5468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
5469#[inline]
5470#[target_feature(enable = "avx512fp16")]
5471#[cfg_attr(test, assert_instr(vfmadd))]
5472#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5473#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5474pub const fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5475    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
5476}
5477
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode combinations at compile time.
        static_assert_rounding!(ROUNDING);
        vfmaddph_512(a, b, c, ROUNDING)
    }
}
5501
5502/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5503/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5504/// from a when the corresponding mask bit is not set).
5505///
5506/// Rounding is done according to the rounding parameter, which can be one of:
5507///
5508/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5509/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5510/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5511/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5512/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5513///
5514/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
5515#[inline]
5516#[target_feature(enable = "avx512fp16")]
5517#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5518#[rustc_legacy_const_generics(4)]
5519#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5520pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
5521    a: __m512h,
5522    k: __mmask32,
5523    b: __m512h,
5524    c: __m512h,
5525) -> __m512h {
5526    unsafe {
5527        static_assert_rounding!(ROUNDING);
5528        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
5529    }
5530}
5531
5532/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5533/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5534/// from c when the corresponding mask bit is not set).
5535///
5536/// Rounding is done according to the rounding parameter, which can be one of:
5537///
5538/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5539/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5540/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5541/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5542/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5543///
5544/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
5545#[inline]
5546#[target_feature(enable = "avx512fp16")]
5547#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5548#[rustc_legacy_const_generics(4)]
5549#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5550pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
5551    a: __m512h,
5552    b: __m512h,
5553    c: __m512h,
5554    k: __mmask32,
5555) -> __m512h {
5556    unsafe {
5557        static_assert_rounding!(ROUNDING);
5558        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
5559    }
5560}
5561
5562/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5563/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5564/// out when the corresponding mask bit is not set).
5565///
5566/// Rounding is done according to the rounding parameter, which can be one of:
5567///
5568/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5569/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5570/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5571/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5572/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5573///
5574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
5575#[inline]
5576#[target_feature(enable = "avx512fp16")]
5577#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5578#[rustc_legacy_const_generics(4)]
5579#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5580pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
5581    k: __mmask32,
5582    a: __m512h,
5583    b: __m512h,
5584    c: __m512h,
5585) -> __m512h {
5586    unsafe {
5587        static_assert_rounding!(ROUNDING);
5588        simd_select_bitmask(
5589            k,
5590            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
5591            _mm512_setzero_ph(),
5592        )
5593    }
5594}
5595
5596/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5597/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5598/// 7 packed elements from a to the upper elements of dst.
5599///
5600/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
5601#[inline]
5602#[target_feature(enable = "avx512fp16")]
5603#[cfg_attr(test, assert_instr(vfmadd))]
5604#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5605#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5606pub const fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5607    unsafe {
5608        let extracta: f16 = simd_extract!(a, 0);
5609        let extractb: f16 = simd_extract!(b, 0);
5610        let extractc: f16 = simd_extract!(c, 0);
5611        let r = fmaf16(extracta, extractb, extractc);
5612        simd_insert!(a, 0, r)
5613    }
5614}
5615
5616/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5617/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5618/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5619/// upper elements of dst.
5620///
5621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
5622#[inline]
5623#[target_feature(enable = "avx512fp16")]
5624#[cfg_attr(test, assert_instr(vfmadd))]
5625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5626#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5627pub const fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5628    unsafe {
5629        let mut fmadd: f16 = simd_extract!(a, 0);
5630        if k & 1 != 0 {
5631            let extractb: f16 = simd_extract!(b, 0);
5632            let extractc: f16 = simd_extract!(c, 0);
5633            fmadd = fmaf16(fmadd, extractb, extractc);
5634        }
5635        simd_insert!(a, 0, fmadd)
5636    }
5637}
5638
5639/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5640/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5641/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5642/// upper elements of dst.
5643///
5644/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
5645#[inline]
5646#[target_feature(enable = "avx512fp16")]
5647#[cfg_attr(test, assert_instr(vfmadd))]
5648#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5649#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5650pub const fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5651    unsafe {
5652        let mut fmadd: f16 = simd_extract!(c, 0);
5653        if k & 1 != 0 {
5654            let extracta: f16 = simd_extract!(a, 0);
5655            let extractb: f16 = simd_extract!(b, 0);
5656            fmadd = fmaf16(extracta, extractb, fmadd);
5657        }
5658        simd_insert!(c, 0, fmadd)
5659    }
5660}
5661
5662/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5663/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5664/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5665/// upper elements of dst.
5666///
5667/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
5668#[inline]
5669#[target_feature(enable = "avx512fp16")]
5670#[cfg_attr(test, assert_instr(vfmadd))]
5671#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5672#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5673pub const fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5674    unsafe {
5675        let mut fmadd: f16 = 0.0;
5676        if k & 1 != 0 {
5677            let extracta: f16 = simd_extract!(a, 0);
5678            let extractb: f16 = simd_extract!(b, 0);
5679            let extractc: f16 = simd_extract!(c, 0);
5680            fmadd = fmaf16(extracta, extractb, extractc);
5681        }
5682        simd_insert!(a, 0, fmadd)
5683    }
5684}
5685
5686/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5687/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5688/// 7 packed elements from a to the upper elements of dst.
5689///
5690/// Rounding is done according to the rounding parameter, which can be one of:
5691///
5692/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5693/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5694/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5695/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5696/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5697///
5698/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
5699#[inline]
5700#[target_feature(enable = "avx512fp16")]
5701#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5702#[rustc_legacy_const_generics(3)]
5703#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5704pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5705    unsafe {
5706        static_assert_rounding!(ROUNDING);
5707        let extracta: f16 = simd_extract!(a, 0);
5708        let extractb: f16 = simd_extract!(b, 0);
5709        let extractc: f16 = simd_extract!(c, 0);
5710        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5711        simd_insert!(a, 0, r)
5712    }
5713}
5714
5715/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5716/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5717/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5718/// upper elements of dst.
5719///
5720/// Rounding is done according to the rounding parameter, which can be one of:
5721///
5722/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5723/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5724/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5725/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5726/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5727///
5728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
5729#[inline]
5730#[target_feature(enable = "avx512fp16")]
5731#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5732#[rustc_legacy_const_generics(4)]
5733#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5734pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
5735    a: __m128h,
5736    k: __mmask8,
5737    b: __m128h,
5738    c: __m128h,
5739) -> __m128h {
5740    unsafe {
5741        static_assert_rounding!(ROUNDING);
5742        let mut fmadd: f16 = simd_extract!(a, 0);
5743        if k & 1 != 0 {
5744            let extractb: f16 = simd_extract!(b, 0);
5745            let extractc: f16 = simd_extract!(c, 0);
5746            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
5747        }
5748        simd_insert!(a, 0, fmadd)
5749    }
5750}
5751
5752/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5753/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5754/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5755/// upper elements of dst.
5756///
5757/// Rounding is done according to the rounding parameter, which can be one of:
5758///
5759/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5760/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5761/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5762/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5763/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5764///
5765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
5766#[inline]
5767#[target_feature(enable = "avx512fp16")]
5768#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5769#[rustc_legacy_const_generics(4)]
5770#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5771pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
5772    a: __m128h,
5773    b: __m128h,
5774    c: __m128h,
5775    k: __mmask8,
5776) -> __m128h {
5777    unsafe {
5778        static_assert_rounding!(ROUNDING);
5779        let mut fmadd: f16 = simd_extract!(c, 0);
5780        if k & 1 != 0 {
5781            let extracta: f16 = simd_extract!(a, 0);
5782            let extractb: f16 = simd_extract!(b, 0);
5783            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
5784        }
5785        simd_insert!(c, 0, fmadd)
5786    }
5787}
5788
5789/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5790/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5791/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5792/// upper elements of dst.
5793///
5794/// Rounding is done according to the rounding parameter, which can be one of:
5795///
5796/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5797/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5798/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5799/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5800/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5801///
5802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
5803#[inline]
5804#[target_feature(enable = "avx512fp16")]
5805#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5806#[rustc_legacy_const_generics(4)]
5807#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5808pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
5809    k: __mmask8,
5810    a: __m128h,
5811    b: __m128h,
5812    c: __m128h,
5813) -> __m128h {
5814    unsafe {
5815        static_assert_rounding!(ROUNDING);
5816        let mut fmadd: f16 = 0.0;
5817        if k & 1 != 0 {
5818            let extracta: f16 = simd_extract!(a, 0);
5819            let extractb: f16 = simd_extract!(b, 0);
5820            let extractc: f16 = simd_extract!(c, 0);
5821            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5822        }
5823        simd_insert!(a, 0, fmadd)
5824    }
5825}
5826
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // fmsub is expressed as fma with a negated addend: (a * b) + (-c).
    unsafe { simd_fma(a, b, simd_neg(c)) }
}
5840
5841/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5842/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5843/// from a when the corresponding mask bit is not set).
5844///
5845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
5846#[inline]
5847#[target_feature(enable = "avx512fp16,avx512vl")]
5848#[cfg_attr(test, assert_instr(vfmsub))]
5849#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5850#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5851pub const fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5852    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
5853}
5854
5855/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5856/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5857/// from c when the corresponding mask bit is not set).
5858///
5859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
5860#[inline]
5861#[target_feature(enable = "avx512fp16,avx512vl")]
5862#[cfg_attr(test, assert_instr(vfmsub))]
5863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5864#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5865pub const fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5866    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
5867}
5868
5869/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5870/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5871/// out when the corresponding mask bit is not set).
5872///
5873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
5874#[inline]
5875#[target_feature(enable = "avx512fp16,avx512vl")]
5876#[cfg_attr(test, assert_instr(vfmsub))]
5877#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5878#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5879pub const fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5880    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
5881}
5882
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // fmsub is expressed as fma with a negated addend: (a * b) + (-c).
    unsafe { simd_fma(a, b, simd_neg(c)) }
}
5895
5896/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5897/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5898/// from a when the corresponding mask bit is not set).
5899///
5900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
5901#[inline]
5902#[target_feature(enable = "avx512fp16,avx512vl")]
5903#[cfg_attr(test, assert_instr(vfmsub))]
5904#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5905#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5906pub const fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5907    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
5908}
5909
5910/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5911/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5912/// from c when the corresponding mask bit is not set).
5913///
5914/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
5915#[inline]
5916#[target_feature(enable = "avx512fp16,avx512vl")]
5917#[cfg_attr(test, assert_instr(vfmsub))]
5918#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5919#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
5920pub const fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5921    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
5922}
5923
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Lanes whose mask bit is clear are zeroed via the all-zero fallback vector.
    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
}
5937
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // fmsub(a, b, c) = a * b - c, expressed as fma(a, b, -c) to reuse the fused primitive.
    unsafe { simd_fma(a, b, simd_neg(c)) }
}
5950
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
}
5964
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
}
5978
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Lanes whose mask bit is clear are zeroed via the all-zero fallback vector.
    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
}
5992
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // fmsub = fma with negated c; the rounding-aware fma primitive does the rest.
        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
    }
}
6016
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lanes whose mask bit is clear keep the corresponding element of `a`.
        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
    }
}
6046
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lanes whose mask bit is clear keep the corresponding element of `c`.
        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
    }
}
6076
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Lanes whose mask bit is clear are zeroed via the all-zero fallback vector.
        simd_select_bitmask(
            k,
            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
6110
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Only lane 0 of each operand participates; the scalar fmsub is fma(a, b, -c).
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        let r = fmaf16(extracta, extractb, -extractc);
        // Lanes 1..8 of `a` pass through unchanged.
        simd_insert!(a, 0, r)
    }
}
6130
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from
/// a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Default to lane 0 of `a`; it doubles as the multiplicand when the mask bit is set.
        let mut fmsub: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = fmaf16(fmsub, extractb, -extractc);
        }
        // Lanes 1..8 of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6153
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from
/// c to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    unsafe {
        // Default to lane 0 of `c`; it doubles as the subtrahend when the mask bit is set.
        let mut fmsub: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fmsub = fmaf16(extracta, extractb, -fmsub);
        }
        // Lanes 1..8 of `c` pass through unchanged.
        simd_insert!(c, 0, fmsub)
    }
}
6176
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Zero is the result when mask bit 0 is clear.
        let mut fmsub: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = fmaf16(extracta, extractb, -extractc);
        }
        // Lanes 1..8 of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6200
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Scalar fmsub = fma(a, b, -c) on lane 0 with explicit rounding.
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
        // Lanes 1..8 of `a` pass through unchanged.
        simd_insert!(a, 0, r)
    }
}
6229
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from
/// a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Default to lane 0 of `a`; it doubles as the multiplicand when the mask bit is set.
        let mut fmsub: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
        }
        // Lanes 1..8 of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6266
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from
/// c to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Default to lane 0 of `c`; it doubles as the subtrahend when the mask bit is set.
        let mut fmsub: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
        }
        // Lanes 1..8 of `c` pass through unchanged.
        simd_insert!(c, 0, fmsub)
    }
}
6303
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Zero is the result when mask bit 0 is clear.
        let mut fmsub: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
        }
        // Lanes 1..8 of `a` pass through unchanged.
        simd_insert!(a, 0, fmsub)
    }
}
6333
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // fnmadd(a, b, c) = c - a * b, expressed as fma(-a, b, c) to reuse the fused primitive.
    unsafe { simd_fma(simd_neg(a), b, c) }
}
6346
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
}
6360
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
}
6374
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    // Lanes whose mask bit is clear are zeroed via the all-zero fallback vector.
    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
}
6388
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // fnmadd(a, b, c) = c - a * b, expressed as fma(-a, b, c) to reuse the fused primitive.
    unsafe { simd_fma(simd_neg(a), b, c) }
}
6401
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
}
6415
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
}
6429
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    // Lanes whose mask bit is clear are zeroed via the all-zero fallback vector.
    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
}
6443
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // fnmadd(a, b, c) = c - a * b, expressed as fma(-a, b, c) to reuse the fused primitive.
    unsafe { simd_fma(simd_neg(a), b, c) }
}
6456
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `a`.
    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
}
6470
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    // Lanes whose mask bit is clear keep the corresponding element of `c`.
    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
}
6484
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Lanes whose mask bit is clear are zeroed via the all-zero fallback vector.
    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
}
6498
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    unsafe {
        // Rejects invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // fnmadd = fma with negated a; the rounding-aware fma primitive does the rest.
        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
    }
}
6522
6523/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6524/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6525/// from a when the corresponding mask bit is not set).
6526///
6527/// Rounding is done according to the rounding parameter, which can be one of:
6528///
6529/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6530/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6531/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6532/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6533/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6534///
6535/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
6536#[inline]
6537#[target_feature(enable = "avx512fp16")]
6538#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6539#[rustc_legacy_const_generics(4)]
6540#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6541pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
6542    a: __m512h,
6543    k: __mmask32,
6544    b: __m512h,
6545    c: __m512h,
6546) -> __m512h {
6547    unsafe {
6548        static_assert_rounding!(ROUNDING);
6549        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
6550    }
6551}
6552
6553/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6554/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6555/// from c when the corresponding mask bit is not set).
6556///
6557/// Rounding is done according to the rounding parameter, which can be one of:
6558///
6559/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6560/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6561/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6562/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6563/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6564///
6565/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
6566#[inline]
6567#[target_feature(enable = "avx512fp16")]
6568#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6569#[rustc_legacy_const_generics(4)]
6570#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6571pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
6572    a: __m512h,
6573    b: __m512h,
6574    c: __m512h,
6575    k: __mmask32,
6576) -> __m512h {
6577    unsafe {
6578        static_assert_rounding!(ROUNDING);
6579        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
6580    }
6581}
6582
6583/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6584/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6585/// out when the corresponding mask bit is not set).
6586///
6587/// Rounding is done according to the rounding parameter, which can be one of:
6588///
6589/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6590/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6591/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6592/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6593/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6594///
6595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
6596#[inline]
6597#[target_feature(enable = "avx512fp16")]
6598#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6599#[rustc_legacy_const_generics(4)]
6600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6601pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
6602    k: __mmask32,
6603    a: __m512h,
6604    b: __m512h,
6605    c: __m512h,
6606) -> __m512h {
6607    unsafe {
6608        static_assert_rounding!(ROUNDING);
6609        simd_select_bitmask(
6610            k,
6611            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
6612            _mm512_setzero_ph(),
6613        )
6614    }
6615}
6616
6617/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6618/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6619/// elements from a to the upper elements of dst.
6620///
6621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
6622#[inline]
6623#[target_feature(enable = "avx512fp16")]
6624#[cfg_attr(test, assert_instr(vfnmadd))]
6625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6626#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6627pub const fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6628    unsafe {
6629        let extracta: f16 = simd_extract!(a, 0);
6630        let extractb: f16 = simd_extract!(b, 0);
6631        let extractc: f16 = simd_extract!(c, 0);
6632        let r = fmaf16(-extracta, extractb, extractc);
6633        simd_insert!(a, 0, r)
6634    }
6635}
6636
6637/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6638/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6639/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6640/// elements of dst.
6641///
6642/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
6643#[inline]
6644#[target_feature(enable = "avx512fp16")]
6645#[cfg_attr(test, assert_instr(vfnmadd))]
6646#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6647#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6648pub const fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6649    unsafe {
6650        let mut fnmadd: f16 = simd_extract!(a, 0);
6651        if k & 1 != 0 {
6652            let extractb: f16 = simd_extract!(b, 0);
6653            let extractc: f16 = simd_extract!(c, 0);
6654            fnmadd = fmaf16(-fnmadd, extractb, extractc);
6655        }
6656        simd_insert!(a, 0, fnmadd)
6657    }
6658}
6659
6660/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6661/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6662/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6663/// elements of dst.
6664///
6665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
6666#[inline]
6667#[target_feature(enable = "avx512fp16")]
6668#[cfg_attr(test, assert_instr(vfnmadd))]
6669#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6670#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6671pub const fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6672    unsafe {
6673        let mut fnmadd: f16 = simd_extract!(c, 0);
6674        if k & 1 != 0 {
6675            let extracta: f16 = simd_extract!(a, 0);
6676            let extractb: f16 = simd_extract!(b, 0);
6677            fnmadd = fmaf16(-extracta, extractb, fnmadd);
6678        }
6679        simd_insert!(c, 0, fnmadd)
6680    }
6681}
6682
6683/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6684/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6685/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6686/// elements of dst.
6687///
6688/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
6689#[inline]
6690#[target_feature(enable = "avx512fp16")]
6691#[cfg_attr(test, assert_instr(vfnmadd))]
6692#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6693#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6694pub const fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6695    unsafe {
6696        let mut fnmadd: f16 = 0.0;
6697        if k & 1 != 0 {
6698            let extracta: f16 = simd_extract!(a, 0);
6699            let extractb: f16 = simd_extract!(b, 0);
6700            let extractc: f16 = simd_extract!(c, 0);
6701            fnmadd = fmaf16(-extracta, extractb, extractc);
6702        }
6703        simd_insert!(a, 0, fnmadd)
6704    }
6705}
6706
6707/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6708/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6709/// elements from a to the upper elements of dst.
6710///
6711/// Rounding is done according to the rounding parameter, which can be one of:
6712///
6713/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6714/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6715/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6716/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6717/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6718///
6719/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
6720#[inline]
6721#[target_feature(enable = "avx512fp16")]
6722#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6723#[rustc_legacy_const_generics(3)]
6724#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6725pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6726    unsafe {
6727        static_assert_rounding!(ROUNDING);
6728        let extracta: f16 = simd_extract!(a, 0);
6729        let extractb: f16 = simd_extract!(b, 0);
6730        let extractc: f16 = simd_extract!(c, 0);
6731        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6732        simd_insert!(a, 0, r)
6733    }
6734}
6735
6736/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6737/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6738/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6739/// elements of dst.
6740///
6741/// Rounding is done according to the rounding parameter, which can be one of:
6742///
6743/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6744/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6745/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6746/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6747/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6748///
6749/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
6750#[inline]
6751#[target_feature(enable = "avx512fp16")]
6752#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6753#[rustc_legacy_const_generics(4)]
6754#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6755pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
6756    a: __m128h,
6757    k: __mmask8,
6758    b: __m128h,
6759    c: __m128h,
6760) -> __m128h {
6761    unsafe {
6762        static_assert_rounding!(ROUNDING);
6763        let mut fnmadd: f16 = simd_extract!(a, 0);
6764        if k & 1 != 0 {
6765            let extractb: f16 = simd_extract!(b, 0);
6766            let extractc: f16 = simd_extract!(c, 0);
6767            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
6768        }
6769        simd_insert!(a, 0, fnmadd)
6770    }
6771}
6772
6773/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6774/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6775/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6776/// elements of dst.
6777///
6778/// Rounding is done according to the rounding parameter, which can be one of:
6779///
6780/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6781/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6782/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6783/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6784/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6785///
6786/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
6787#[inline]
6788#[target_feature(enable = "avx512fp16")]
6789#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6790#[rustc_legacy_const_generics(4)]
6791#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6792pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
6793    a: __m128h,
6794    b: __m128h,
6795    c: __m128h,
6796    k: __mmask8,
6797) -> __m128h {
6798    unsafe {
6799        static_assert_rounding!(ROUNDING);
6800        let mut fnmadd: f16 = simd_extract!(c, 0);
6801        if k & 1 != 0 {
6802            let extracta: f16 = simd_extract!(a, 0);
6803            let extractb: f16 = simd_extract!(b, 0);
6804            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
6805        }
6806        simd_insert!(c, 0, fnmadd)
6807    }
6808}
6809
6810/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6811/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6812/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6813/// elements of dst.
6814///
6815/// Rounding is done according to the rounding parameter, which can be one of:
6816///
6817/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6818/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6819/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6820/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6821/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6822///
6823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
6824#[inline]
6825#[target_feature(enable = "avx512fp16")]
6826#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6827#[rustc_legacy_const_generics(4)]
6828#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6829pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
6830    k: __mmask8,
6831    a: __m128h,
6832    b: __m128h,
6833    c: __m128h,
6834) -> __m128h {
6835    unsafe {
6836        static_assert_rounding!(ROUNDING);
6837        let mut fnmadd: f16 = 0.0;
6838        if k & 1 != 0 {
6839            let extracta: f16 = simd_extract!(a, 0);
6840            let extractb: f16 = simd_extract!(b, 0);
6841            let extractc: f16 = simd_extract!(c, 0);
6842            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6843        }
6844        simd_insert!(a, 0, fnmadd)
6845    }
6846}
6847
6848/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6849/// in c from the negated intermediate result, and store the results in dst.
6850///
6851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
6852#[inline]
6853#[target_feature(enable = "avx512fp16,avx512vl")]
6854#[cfg_attr(test, assert_instr(vfnmsub))]
6855#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6856#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6857pub const fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6858    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6859}
6860
6861/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6862/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6863/// copied from a when the corresponding mask bit is not set).
6864///
6865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
6866#[inline]
6867#[target_feature(enable = "avx512fp16,avx512vl")]
6868#[cfg_attr(test, assert_instr(vfnmsub))]
6869#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6870#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6871pub const fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6872    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
6873}
6874
6875/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6876/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6877/// copied from c when the corresponding mask bit is not set).
6878///
6879/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
6880#[inline]
6881#[target_feature(enable = "avx512fp16,avx512vl")]
6882#[cfg_attr(test, assert_instr(vfnmsub))]
6883#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6884#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6885pub const fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6886    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
6887}
6888
6889/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6890/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6891/// zeroed out when the corresponding mask bit is not set).
6892///
6893/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
6894#[inline]
6895#[target_feature(enable = "avx512fp16,avx512vl")]
6896#[cfg_attr(test, assert_instr(vfnmsub))]
6897#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6898#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6899pub const fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6900    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
6901}
6902
6903/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6904/// in c from the negated intermediate result, and store the results in dst.
6905///
6906/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
6907#[inline]
6908#[target_feature(enable = "avx512fp16,avx512vl")]
6909#[cfg_attr(test, assert_instr(vfnmsub))]
6910#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6911#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6912pub const fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6913    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6914}
6915
6916/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6917/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6918/// copied from a when the corresponding mask bit is not set).
6919///
6920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
6921#[inline]
6922#[target_feature(enable = "avx512fp16,avx512vl")]
6923#[cfg_attr(test, assert_instr(vfnmsub))]
6924#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6925#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6926pub const fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6927    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
6928}
6929
6930/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6931/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6932/// copied from c when the corresponding mask bit is not set).
6933///
6934/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
6935#[inline]
6936#[target_feature(enable = "avx512fp16,avx512vl")]
6937#[cfg_attr(test, assert_instr(vfnmsub))]
6938#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6939#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6940pub const fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6941    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
6942}
6943
6944/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6945/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6946/// zeroed out when the corresponding mask bit is not set).
6947///
6948/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
6949#[inline]
6950#[target_feature(enable = "avx512fp16,avx512vl")]
6951#[cfg_attr(test, assert_instr(vfnmsub))]
6952#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6953#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6954pub const fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6955    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
6956}
6957
6958/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6959/// in c from the negated intermediate result, and store the results in dst.
6960///
6961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
6962#[inline]
6963#[target_feature(enable = "avx512fp16")]
6964#[cfg_attr(test, assert_instr(vfnmsub))]
6965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6966#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6967pub const fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6968    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6969}
6970
6971/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6972/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6973/// copied from a when the corresponding mask bit is not set).
6974///
6975/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
6976#[inline]
6977#[target_feature(enable = "avx512fp16")]
6978#[cfg_attr(test, assert_instr(vfnmsub))]
6979#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6980#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6981pub const fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6982    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
6983}
6984
6985/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6986/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6987/// copied from c when the corresponding mask bit is not set).
6988///
6989/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
6990#[inline]
6991#[target_feature(enable = "avx512fp16")]
6992#[cfg_attr(test, assert_instr(vfnmsub))]
6993#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6994#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
6995pub const fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6996    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
6997}
6998
6999/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7000/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
7001/// zeroed out when the corresponding mask bit is not set).
7002///
7003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
7004#[inline]
7005#[target_feature(enable = "avx512fp16")]
7006#[cfg_attr(test, assert_instr(vfnmsub))]
7007#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7008#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7009pub const fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7010    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
7011}
7012
7013/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7014/// in c from the negated intermediate result, and store the results in dst.
7015///
7016/// Rounding is done according to the rounding parameter, which can be one of:
7017///
7018/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7019/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7020/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7021/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7022/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7023///
7024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
7025#[inline]
7026#[target_feature(enable = "avx512fp16")]
7027#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7028#[rustc_legacy_const_generics(3)]
7029#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7030pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7031    unsafe {
7032        static_assert_rounding!(ROUNDING);
7033        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
7034    }
7035}
7036
7037/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7038/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
7039/// copied from a when the corresponding mask bit is not set).
7040///
7041/// Rounding is done according to the rounding parameter, which can be one of:
7042///
7043/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7044/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7045/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7046/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7047/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7048///
7049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
7050#[inline]
7051#[target_feature(enable = "avx512fp16")]
7052#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7053#[rustc_legacy_const_generics(4)]
7054#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7055pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
7056    a: __m512h,
7057    k: __mmask32,
7058    b: __m512h,
7059    c: __m512h,
7060) -> __m512h {
7061    unsafe {
7062        static_assert_rounding!(ROUNDING);
7063        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
7064    }
7065}
7066
7067/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
7068/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
7069/// copied from c when the corresponding mask bit is not set).
7070///
7071/// Rounding is done according to the rounding parameter, which can be one of:
7072///
7073/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7074/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7075/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7076/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7077/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7078///
7079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
7080#[inline]
7081#[target_feature(enable = "avx512fp16")]
7082#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7083#[rustc_legacy_const_generics(4)]
7084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7085pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
7086    a: __m512h,
7087    b: __m512h,
7088    c: __m512h,
7089    k: __mmask32,
7090) -> __m512h {
7091    unsafe {
7092        static_assert_rounding!(ROUNDING);
7093        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
7094    }
7095}
7096
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // maskz variant: lanes whose mask bit is clear are zeroed.
        simd_select_bitmask(
            k,
            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
7130
7131/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7132/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
7133/// elements from a to the upper elements of dst.
7134///
7135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
7136#[inline]
7137#[target_feature(enable = "avx512fp16")]
7138#[cfg_attr(test, assert_instr(vfnmsub))]
7139#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7140#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7141pub const fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7142    unsafe {
7143        let extracta: f16 = simd_extract!(a, 0);
7144        let extractb: f16 = simd_extract!(b, 0);
7145        let extractc: f16 = simd_extract!(c, 0);
7146        let r = fmaf16(-extracta, extractb, -extractc);
7147        simd_insert!(a, 0, r)
7148    }
7149}
7150
7151/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7152/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7153/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7154/// elements of dst.
7155///
7156/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
7157#[inline]
7158#[target_feature(enable = "avx512fp16")]
7159#[cfg_attr(test, assert_instr(vfnmsub))]
7160#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7161#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7162pub const fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7163    unsafe {
7164        let mut fnmsub: f16 = simd_extract!(a, 0);
7165        if k & 1 != 0 {
7166            let extractb: f16 = simd_extract!(b, 0);
7167            let extractc: f16 = simd_extract!(c, 0);
7168            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
7169        }
7170        simd_insert!(a, 0, fnmsub)
7171    }
7172}
7173
7174/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7175/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
7176/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
7177/// elements of dst.
7178///
7179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
7180#[inline]
7181#[target_feature(enable = "avx512fp16")]
7182#[cfg_attr(test, assert_instr(vfnmsub))]
7183#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7184#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7185pub const fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7186    unsafe {
7187        let mut fnmsub: f16 = simd_extract!(c, 0);
7188        if k & 1 != 0 {
7189            let extracta: f16 = simd_extract!(a, 0);
7190            let extractb: f16 = simd_extract!(b, 0);
7191            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
7192        }
7193        simd_insert!(c, 0, fnmsub)
7194    }
7195}
7196
7197/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
7198/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
7199/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
7200/// elements of dst.
7201///
7202/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
7203#[inline]
7204#[target_feature(enable = "avx512fp16")]
7205#[cfg_attr(test, assert_instr(vfnmsub))]
7206#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7207#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7208pub const fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7209    unsafe {
7210        let mut fnmsub: f16 = 0.0;
7211        if k & 1 != 0 {
7212            let extracta: f16 = simd_extract!(a, 0);
7213            let extractb: f16 = simd_extract!(b, 0);
7214            let extractc: f16 = simd_extract!(c, 0);
7215            fnmsub = fmaf16(-extracta, extractb, -extractc);
7216        }
7217        simd_insert!(a, 0, fnmsub)
7218    }
7219}
7220
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        // fnmsub via fused multiply-add with both the product and `c` negated.
        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
        simd_insert!(a, 0, r)
    }
}
7249
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // Start from the lower element of `a`; it is kept when mask bit 0 is clear.
        let mut fnmsub: f16 = simd_extract!(a, 0);
        if k & 1 != 0 {
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a0 * b0) - c0 via fused multiply-add (`fnmsub` holds a0 here).
            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmsub)
    }
}
7286
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // Start from the lower element of `c`; it is kept when mask bit 0 is clear.
        let mut fnmsub: f16 = simd_extract!(c, 0);
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            // -(a0 * b0) - c0 via fused multiply-add (`fnmsub` holds c0 here).
            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
        }
        simd_insert!(c, 0, fnmsub)
    }
}
7323
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // Lower element is zeroed when mask bit 0 is clear.
        let mut fnmsub: f16 = 0.0;
        if k & 1 != 0 {
            let extracta: f16 = simd_extract!(a, 0);
            let extractb: f16 = simd_extract!(b, 0);
            let extractc: f16 = simd_extract!(c, 0);
            // -(a0 * b0) - c0 via fused multiply-add with both inputs negated.
            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
        }
        simd_insert!(a, 0, fnmsub)
    }
}
7361
7362/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7363/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7364///
7365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
7366#[inline]
7367#[target_feature(enable = "avx512fp16,avx512vl")]
7368#[cfg_attr(test, assert_instr(vfmaddsub))]
7369#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7370#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7371pub const fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7372    unsafe {
7373        let add = simd_fma(a, b, c);
7374        let sub = simd_fma(a, b, simd_neg(c));
7375        simd_shuffle!(sub, add, [0, 9, 2, 11, 4, 13, 6, 15])
7376    }
7377}
7378
7379/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7380/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7381/// (the element is copied from a when the corresponding mask bit is not set).
7382///
7383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
7384#[inline]
7385#[target_feature(enable = "avx512fp16,avx512vl")]
7386#[cfg_attr(test, assert_instr(vfmaddsub))]
7387#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7388#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7389pub const fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7390    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
7391}
7392
7393/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7394/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7395/// (the element is copied from c when the corresponding mask bit is not set).
7396///
7397/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
7398#[inline]
7399#[target_feature(enable = "avx512fp16,avx512vl")]
7400#[cfg_attr(test, assert_instr(vfmaddsub))]
7401#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7402#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7403pub const fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7404    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
7405}
7406
7407/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7408/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7409/// (the element is zeroed out when the corresponding mask bit is not set).
7410///
7411/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
7412#[inline]
7413#[target_feature(enable = "avx512fp16,avx512vl")]
7414#[cfg_attr(test, assert_instr(vfmaddsub))]
7415#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7416#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7417pub const fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7418    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
7419}
7420
7421/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7422/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7423///
7424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
7425#[inline]
7426#[target_feature(enable = "avx512fp16,avx512vl")]
7427#[cfg_attr(test, assert_instr(vfmaddsub))]
7428#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7429#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7430pub const fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7431    unsafe {
7432        let add = simd_fma(a, b, c);
7433        let sub = simd_fma(a, b, simd_neg(c));
7434        simd_shuffle!(
7435            sub,
7436            add,
7437            [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31]
7438        )
7439    }
7440}
7441
7442/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7443/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7444/// (the element is copied from a when the corresponding mask bit is not set).
7445///
7446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
7447#[inline]
7448#[target_feature(enable = "avx512fp16,avx512vl")]
7449#[cfg_attr(test, assert_instr(vfmaddsub))]
7450#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7451#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7452pub const fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7453    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
7454}
7455
7456/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7457/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7458/// (the element is copied from c when the corresponding mask bit is not set).
7459///
7460/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
7461#[inline]
7462#[target_feature(enable = "avx512fp16,avx512vl")]
7463#[cfg_attr(test, assert_instr(vfmaddsub))]
7464#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7465#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7466pub const fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7467    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
7468}
7469
7470/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7471/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7472/// (the element is zeroed out when the corresponding mask bit is not set).
7473///
7474/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
7475#[inline]
7476#[target_feature(enable = "avx512fp16,avx512vl")]
7477#[cfg_attr(test, assert_instr(vfmaddsub))]
7478#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7479#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7480pub const fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7481    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
7482}
7483
7484/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7485/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7486///
7487/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
7488#[inline]
7489#[target_feature(enable = "avx512fp16")]
7490#[cfg_attr(test, assert_instr(vfmaddsub))]
7491#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7492#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7493pub const fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7494    unsafe {
7495        let add = simd_fma(a, b, c);
7496        let sub = simd_fma(a, b, simd_neg(c));
7497        simd_shuffle!(
7498            sub,
7499            add,
7500            [
7501                0, 33, 2, 35, 4, 37, 6, 39, 8, 41, 10, 43, 12, 45, 14, 47, 16, 49, 18, 51, 20, 53,
7502                22, 55, 24, 57, 26, 59, 28, 61, 30, 63
7503            ]
7504        )
7505    }
7506}
7507
7508/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7509/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7510/// (the element is copied from a when the corresponding mask bit is not set).
7511///
7512/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
7513#[inline]
7514#[target_feature(enable = "avx512fp16")]
7515#[cfg_attr(test, assert_instr(vfmaddsub))]
7516#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7517#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7518pub const fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7519    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
7520}
7521
7522/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7523/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7524/// (the element is copied from c when the corresponding mask bit is not set).
7525///
7526/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
7527#[inline]
7528#[target_feature(enable = "avx512fp16")]
7529#[cfg_attr(test, assert_instr(vfmaddsub))]
7530#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7531#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7532pub const fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7533    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
7534}
7535
7536/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7537/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7538/// (the element is zeroed out when the corresponding mask bit is not set).
7539///
7540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
7541#[inline]
7542#[target_feature(enable = "avx512fp16")]
7543#[cfg_attr(test, assert_instr(vfmaddsub))]
7544#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7545#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7546pub const fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7547    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
7548}
7549
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // Rounding variant goes directly through the compiler intrinsic, which
        // takes the rounding mode as its final argument.
        vfmaddsubph_512(a, b, c, ROUNDING)
    }
}
7577
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // Lanes whose mask bit is clear keep their value from `a`.
        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
    }
}
7607
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // mask3 variant: lanes whose mask bit is clear take their value from `c`.
        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
    }
}
7637
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    unsafe {
        // Reject rounding immediates that are not a valid combination at compile time.
        static_assert_rounding!(ROUNDING);
        // maskz variant: lanes whose mask bit is clear are zeroed.
        simd_select_bitmask(
            k,
            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
            _mm512_setzero_ph(),
        )
    }
}
7671
7672/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7673/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7674///
7675/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
7676#[inline]
7677#[target_feature(enable = "avx512fp16,avx512vl")]
7678#[cfg_attr(test, assert_instr(vfmsubadd))]
7679#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7680#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7681pub const fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7682    _mm_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
7683}
7684
7685/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7686/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7687/// (the element is copied from a when the corresponding mask bit is not set).
7688///
7689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
7690#[inline]
7691#[target_feature(enable = "avx512fp16,avx512vl")]
7692#[cfg_attr(test, assert_instr(vfmsubadd))]
7693#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7694#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7695pub const fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7696    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
7697}
7698
7699/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7700/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7701/// (the element is copied from c when the corresponding mask bit is not set).
7702///
7703/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7704#[inline]
7705#[target_feature(enable = "avx512fp16,avx512vl")]
7706#[cfg_attr(test, assert_instr(vfmsubadd))]
7707#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7708#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7709pub const fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7710    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
7711}
7712
7713/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7714/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7715/// (the element is zeroed out when the corresponding mask bit is not set).
7716///
7717/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
7718#[inline]
7719#[target_feature(enable = "avx512fp16,avx512vl")]
7720#[cfg_attr(test, assert_instr(vfmsubadd))]
7721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7722#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7723pub const fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7724    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
7725}
7726
7727/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7728/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7729///
7730/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
7731#[inline]
7732#[target_feature(enable = "avx512fp16,avx512vl")]
7733#[cfg_attr(test, assert_instr(vfmsubadd))]
7734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7735#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7736pub const fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7737    _mm256_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
7738}
7739
7740/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7741/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7742/// (the element is copied from a when the corresponding mask bit is not set).
7743///
7744/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
7745#[inline]
7746#[target_feature(enable = "avx512fp16,avx512vl")]
7747#[cfg_attr(test, assert_instr(vfmsubadd))]
7748#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7749#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7750pub const fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7751    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
7752}
7753
7754/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7755/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7756/// (the element is copied from c when the corresponding mask bit is not set).
7757///
7758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
7759#[inline]
7760#[target_feature(enable = "avx512fp16,avx512vl")]
7761#[cfg_attr(test, assert_instr(vfmsubadd))]
7762#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7763#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7764pub const fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7765    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
7766}
7767
7768/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7769/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7770/// (the element is zeroed out when the corresponding mask bit is not set).
7771///
7772/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
7773#[inline]
7774#[target_feature(enable = "avx512fp16,avx512vl")]
7775#[cfg_attr(test, assert_instr(vfmsubadd))]
7776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7777#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7778pub const fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7779    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
7780}
7781
7782/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7783/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7784///
7785/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
7786#[inline]
7787#[target_feature(enable = "avx512fp16")]
7788#[cfg_attr(test, assert_instr(vfmsubadd))]
7789#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7790#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7791pub const fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7792    _mm512_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
7793}
7794
7795/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7796/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7797/// (the element is copied from a when the corresponding mask bit is not set).
7798///
7799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
7800#[inline]
7801#[target_feature(enable = "avx512fp16")]
7802#[cfg_attr(test, assert_instr(vfmsubadd))]
7803#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7804#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7805pub const fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7806    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
7807}
7808
7809/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7810/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7811/// (the element is copied from c when the corresponding mask bit is not set).
7812///
7813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
7814#[inline]
7815#[target_feature(enable = "avx512fp16")]
7816#[cfg_attr(test, assert_instr(vfmsubadd))]
7817#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7818#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7819pub const fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7820    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
7821}
7822
7823/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7824/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7825/// (the element is zeroed out when the corresponding mask bit is not set).
7826///
7827/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
7828#[inline]
7829#[target_feature(enable = "avx512fp16")]
7830#[cfg_attr(test, assert_instr(vfmsubadd))]
7831#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7832#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
7833pub const fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7834    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
7835}
7836
7837/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7838/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7839///
7840/// Rounding is done according to the rounding parameter, which can be one of:
7841///
7842/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7843/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7844/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7845/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7846/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7847///
7848/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
7849#[inline]
7850#[target_feature(enable = "avx512fp16")]
7851#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7852#[rustc_legacy_const_generics(3)]
7853#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7854pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
7855    a: __m512h,
7856    b: __m512h,
7857    c: __m512h,
7858) -> __m512h {
7859    unsafe {
7860        static_assert_rounding!(ROUNDING);
7861        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
7862    }
7863}
7864
7865/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7866/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7867/// (the element is copied from a when the corresponding mask bit is not set).
7868///
7869/// Rounding is done according to the rounding parameter, which can be one of:
7870///
7871/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7872/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7873/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7874/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7875/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7876///
7877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
7878#[inline]
7879#[target_feature(enable = "avx512fp16")]
7880#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7881#[rustc_legacy_const_generics(4)]
7882#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7883pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
7884    a: __m512h,
7885    k: __mmask32,
7886    b: __m512h,
7887    c: __m512h,
7888) -> __m512h {
7889    unsafe {
7890        static_assert_rounding!(ROUNDING);
7891        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
7892    }
7893}
7894
7895/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7896/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7897/// (the element is copied from c when the corresponding mask bit is not set).
7898///
7899/// Rounding is done according to the rounding parameter, which can be one of:
7900///
7901/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7902/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7903/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7904/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7905/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7906///
7907/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
7908#[inline]
7909#[target_feature(enable = "avx512fp16")]
7910#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7911#[rustc_legacy_const_generics(4)]
7912#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7913pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
7914    a: __m512h,
7915    b: __m512h,
7916    c: __m512h,
7917    k: __mmask32,
7918) -> __m512h {
7919    unsafe {
7920        static_assert_rounding!(ROUNDING);
7921        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
7922    }
7923}
7924
7925/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7926/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7927/// (the element is zeroed out when the corresponding mask bit is not set).
7928///
7929/// Rounding is done according to the rounding parameter, which can be one of:
7930///
7931/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7932/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7933/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7934/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7935/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7936///
7937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
7938#[inline]
7939#[target_feature(enable = "avx512fp16")]
7940#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7941#[rustc_legacy_const_generics(4)]
7942#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7943pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
7944    k: __mmask32,
7945    a: __m512h,
7946    b: __m512h,
7947    c: __m512h,
7948) -> __m512h {
7949    unsafe {
7950        static_assert_rounding!(ROUNDING);
7951        simd_select_bitmask(
7952            k,
7953            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
7954            _mm512_setzero_ph(),
7955        )
7956    }
7957}
7958
7959/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
7960/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7961///
7962/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
7963#[inline]
7964#[target_feature(enable = "avx512fp16,avx512vl")]
7965#[cfg_attr(test, assert_instr(vrcpph))]
7966#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7967pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
7968    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
7969}
7970
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // SAFETY: calling the `vrcpph` intrinsic requires avx512fp16+avx512vl,
    // which `#[target_feature]` guarantees are enabled here.
    unsafe { vrcpph_128(a, src, k) }
}
7983
7984/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
7985/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
7986/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7987///
7988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
7989#[inline]
7990#[target_feature(enable = "avx512fp16,avx512vl")]
7991#[cfg_attr(test, assert_instr(vrcpph))]
7992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7993pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
7994    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
7995}
7996
7997/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
7998/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7999///
8000/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
8001#[inline]
8002#[target_feature(enable = "avx512fp16,avx512vl")]
8003#[cfg_attr(test, assert_instr(vrcpph))]
8004#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8005pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
8006    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
8007}
8008
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // SAFETY: calling the `vrcpph` intrinsic requires avx512fp16+avx512vl,
    // which `#[target_feature]` guarantees are enabled here.
    unsafe { vrcpph_256(a, src, k) }
}
8021
8022/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
8023/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
8024/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8025///
8026/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
8027#[inline]
8028#[target_feature(enable = "avx512fp16,avx512vl")]
8029#[cfg_attr(test, assert_instr(vrcpph))]
8030#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8031pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
8032    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
8033}
8034
8035/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`.
8036/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8037///
8038/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
8039#[inline]
8040#[target_feature(enable = "avx512fp16")]
8041#[cfg_attr(test, assert_instr(vrcpph))]
8042#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8043pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
8044    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
8045}
8046
/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
/// using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // SAFETY: calling the `vrcpph` intrinsic requires avx512fp16,
    // which `#[target_feature]` guarantees is enabled here.
    unsafe { vrcpph_512(a, src, k) }
}
8059
8060/// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`
8061/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
8062/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8063///
8064/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
8065#[inline]
8066#[target_feature(enable = "avx512fp16")]
8067#[cfg_attr(test, assert_instr(vrcpph))]
8068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8069pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
8070    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
8071}
8072
8073/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
8074/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
8075/// upper elements of dst.
8076/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8077///
8078/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
8079#[inline]
8080#[target_feature(enable = "avx512fp16")]
8081#[cfg_attr(test, assert_instr(vrcpsh))]
8082#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8083pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
8084    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
8085}
8086
/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst using writemask k (the element is copied from src when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: calling the `vrcpsh` intrinsic requires avx512fp16,
    // which `#[target_feature]` guarantees is enabled here.
    unsafe { vrcpsh(a, b, src, k) }
}
8100
8101/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
8102/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8103/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8104/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8105///
8106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
8107#[inline]
8108#[target_feature(enable = "avx512fp16")]
8109#[cfg_attr(test, assert_instr(vrcpsh))]
8110#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8111pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8112    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b)
8113}
8114
8115/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8116/// elements in a, and store the results in dst.
8117/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8118///
8119/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
8120#[inline]
8121#[target_feature(enable = "avx512fp16,avx512vl")]
8122#[cfg_attr(test, assert_instr(vrsqrtph))]
8123#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8124pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
8125    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
8126}
8127
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // SAFETY: calling the `vrsqrtph` intrinsic requires avx512fp16+avx512vl,
    // which `#[target_feature]` guarantees are enabled here.
    unsafe { vrsqrtph_128(a, src, k) }
}
8141
8142/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8143/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
8144/// corresponding mask bit is not set).
8145/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8146///
8147/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
8148#[inline]
8149#[target_feature(enable = "avx512fp16,avx512vl")]
8150#[cfg_attr(test, assert_instr(vrsqrtph))]
8151#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8152pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
8153    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
8154}
8155
8156/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8157/// elements in a, and store the results in dst.
8158/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8159///
8160/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
8161#[inline]
8162#[target_feature(enable = "avx512fp16,avx512vl")]
8163#[cfg_attr(test, assert_instr(vrsqrtph))]
8164#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8165pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
8166    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
8167}
8168
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // SAFETY: calling the `vrsqrtph` intrinsic requires avx512fp16+avx512vl,
    // which `#[target_feature]` guarantees are enabled here.
    unsafe { vrsqrtph_256(a, src, k) }
}
8182
8183/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8184/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
8185/// corresponding mask bit is not set).
8186/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8187///
8188/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
8189#[inline]
8190#[target_feature(enable = "avx512fp16,avx512vl")]
8191#[cfg_attr(test, assert_instr(vrsqrtph))]
8192#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8193pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
8194    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
8195}
8196
8197/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8198/// elements in a, and store the results in dst.
8199/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8200///
8201/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
8202#[inline]
8203#[target_feature(enable = "avx512fp16")]
8204#[cfg_attr(test, assert_instr(vrsqrtph))]
8205#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8206pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
8207    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
8208}
8209
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // SAFETY: calling the `vrsqrtph` intrinsic requires avx512fp16,
    // which `#[target_feature]` guarantees is enabled here.
    unsafe { vrsqrtph_512(a, src, k) }
}
8223
8224/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8225/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
8226/// corresponding mask bit is not set).
8227/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8228///
8229/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
8230#[inline]
8231#[target_feature(enable = "avx512fp16")]
8232#[cfg_attr(test, assert_instr(vrsqrtph))]
8233#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8234pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8235    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
8236}
8237
8238/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
8239/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
8240/// to the upper elements of dst.
8241/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8242///
8243/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
8244#[inline]
8245#[target_feature(enable = "avx512fp16")]
8246#[cfg_attr(test, assert_instr(vrsqrtsh))]
8247#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8248pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8249    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
8250}
8251
/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: calling the `vrsqrtsh` intrinsic requires avx512fp16,
    // which `#[target_feature]` guarantees is enabled here.
    unsafe { vrsqrtsh(a, b, src, k) }
}
8265
/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is write-masking with an all-zeros source vector.
    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8279
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
    // Lane-wise square root via the generic SIMD intrinsic; lowered to `vsqrtph`.
    unsafe { simd_fsqrt(a) }
}
8291
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // Per-lane select: mask bit set -> sqrt result, clear -> corresponding `src` lane.
    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
}
8303
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
    // Zero-masking: unselected lanes come from the all-zeros vector.
    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
}
8315
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
    // Lane-wise square root via the generic SIMD intrinsic; lowered to `vsqrtph`.
    unsafe { simd_fsqrt(a) }
}
8327
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // Per-lane select: mask bit set -> sqrt result, clear -> corresponding `src` lane.
    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
}
8339
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
    // Zero-masking: unselected lanes come from the all-zeros vector.
    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
}
8351
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
    // Lane-wise square root via the generic SIMD intrinsic; lowered to `vsqrtph`.
    unsafe { simd_fsqrt(a) }
}
8363
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // Per-lane select: mask bit set -> sqrt result, clear -> corresponding `src` lane.
    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
}
8375
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
    // Zero-masking: unselected lanes come from the all-zeros vector.
    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
}
8387
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
    unsafe {
        // Rejects invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        vsqrtph_512(a, ROUNDING)
    }
}
8410
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        // Rejects invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Per-lane select: mask bit set -> rounded sqrt result, clear -> `src` lane.
        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
    }
}
8437
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
    unsafe {
        // Rejects invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Zero-masking: unselected lanes come from the all-zeros vector.
        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
    }
}
8460
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
    // The all-ones mask selects the computed lane, so the all-zeros `src` is never observed.
    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
8473
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask
/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Non-rounding variant: use the current MXCSR rounding mode.
    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
8486
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is write-masking with an all-zeros source vector.
    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8499
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Rejects invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // The all-ones mask selects the computed lane, so the all-zeros `src` is never observed.
    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
8521
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask
/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Rejects invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Forward to the raw `vsqrtsh` binding; masking against `src` is applied
        // by the instruction itself.
        vsqrtsh(a, b, src, k, ROUNDING)
    }
}
8550
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // Rejects invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is write-masking with an all-zeros source vector.
    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
8576
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
    // Raw binding; `vmaxph` has x86 (not IEEE 754) NaN/signed-zero semantics, see above.
    unsafe { vmaxph_128(a, b) }
}
8589
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Per-lane select: mask bit set -> max result, clear -> corresponding `src` lane.
    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
}
8603
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking: unselected lanes come from the all-zeros vector.
    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
}
8617
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
    // Raw binding; `vmaxph` has x86 (not IEEE 754) NaN/signed-zero semantics, see above.
    unsafe { vmaxph_256(a, b) }
}
8630
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Per-lane select: mask bit set -> max result, clear -> corresponding `src` lane.
    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
}
8644
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Zero-masking: unselected lanes come from the all-zeros vector.
    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
}
8658
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
    // Non-SAE variant: _MM_FROUND_CUR_DIRECTION means no exception suppression.
    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
}
8671
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Per-lane select: mask bit set -> max result, clear -> corresponding `src` lane.
    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
}
8685
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Zero-masking: unselected lanes come from the all-zeros vector.
    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
}
8699
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        // Rejects invalid SAE constants at compile time.
        static_assert_sae!(SAE);
        vmaxph_512(a, b, SAE)
    }
}
8717
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_max_round_ph<const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        // Rejects invalid SAE constants at compile time.
        static_assert_sae!(SAE);
        // Per-lane select: mask bit set -> max result, clear -> corresponding `src` lane.
        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
    }
}
8740
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        // Rejects invalid SAE constants at compile time.
        static_assert_sae!(SAE);
        // Zero-masking: unselected lanes come from the all-zeros vector.
        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
    }
}
8758
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
    // The all-ones mask means `src` is never read, so an undefined vector is fine here.
    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
}
8772
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Non-SAE variant: _MM_FROUND_CUR_DIRECTION means no exception suppression.
    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
8786
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking is write-masking with an all-zeros source vector.
    _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
8800
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Rejects invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // The all-ones mask means `src` is never read, so an undefined vector is fine here.
    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
}
8816
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_max_round_sh<const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        // Rejects invalid SAE constants at compile time.
        static_assert_sae!(SAE);
        // Forward to the raw `vmaxsh` binding; masking against `src` is applied
        // by the instruction itself.
        vmaxsh(a, b, src, k, SAE)
    }
}
8840
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    // Zeromask semantics: pass an all-zero src so a cleared mask bit 0 yields 0.
    _mm_mask_max_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
8857
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
    // Lower directly to the 128-bit vminph intrinsic binding.
    unsafe { vminph_128(a, b) }
}
8870
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Per-lane select: min result where the mask bit is set, src elsewhere.
    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
}
8884
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zeromask semantics: masked-off lanes select from an all-zero vector.
    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
}
8898
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
    // Lower directly to the 256-bit vminph intrinsic binding.
    unsafe { vminph_256(a, b) }
}
8911
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Per-lane select: min result where the mask bit is set, src elsewhere.
    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
}
8925
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Zeromask semantics: masked-off lanes select from an all-zero vector.
    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
}
8939
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
    // Delegate to the rounding variant with the "current direction" constant,
    // i.e. no SAE override.
    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
}
8952
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Per-lane select: min result where the mask bit is set, src elsewhere.
    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
}
8966
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Zeromask semantics: masked-off lanes select from an all-zero vector.
    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
}
8980
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        // Compile-time check that SAE is a valid suppress-all-exceptions value,
        // then lower to the 512-bit vminph binding that takes the SAE operand.
        static_assert_sae!(SAE);
        vminph_512(a, b, SAE)
    }
}
8997
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_min_round_ph<const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    unsafe {
        static_assert_sae!(SAE);
        // Compute the full-width min, then blend with src under the writemask.
        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
    }
}
9020
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    unsafe {
        static_assert_sae!(SAE);
        // Zeromask semantics: masked-off lanes select from an all-zero vector.
        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
    }
}
9038
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
/// inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask means the (undefined) src operand is never selected.
    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
}
9052
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the rounding variant with no SAE override.
    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
9066
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zeromask semantics: pass an all-zero src so a cleared mask bit 0 yields 0.
    _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
9080
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Validate SAE at compile time, then delegate to the masked form with an
    // all-ones mask so the (undefined) src operand is never selected.
    static_assert_sae!(SAE);
    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
}
9096
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_min_round_sh<const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_sae!(SAE);
        // Forward to the raw vminsh binding; src supplies the lane-0 fallback
        // when mask bit 0 is clear.
        vminsh(a, b, src, k, SAE)
    }
}
9120
/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    // Zeromask semantics: pass an all-zero src so a cleared mask bit 0 yields 0.
    _mm_mask_min_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
9137
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
    // All-ones mask means the (undefined) src operand is never selected.
    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
}
9150
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // The 128-bit vgetexpph binding takes src and the writemask directly.
    unsafe { vgetexpph_128(a, src, k) }
}
9164
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
    // Zeromask semantics: pass an all-zero src so masked-off lanes become 0.
    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
}
9178
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
    // All-ones 16-lane mask means the (undefined) src operand is never selected.
    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
}
9191
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    // The 256-bit vgetexpph binding takes src and the writemask directly.
    unsafe { vgetexpph_256(a, src, k) }
}
9205
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
    // Zeromask semantics: pass an all-zero src so masked-off lanes become 0.
    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
}
9219
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
    // All-ones 32-lane mask means the (undefined) src operand is never selected.
    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
}
9232
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    // Delegate to the rounding variant with no SAE override.
    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
}
9246
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
    // Zeromask semantics: pass an all-zero src so masked-off lanes become 0.
    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
}
9260
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
/// by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
    // Validate SAE at compile time, then delegate to the masked form with an
    // all-ones mask so the (undefined) src operand is never selected.
    static_assert_sae!(SAE);
    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
9276
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        static_assert_sae!(SAE);
        // Forward to the raw 512-bit vgetexpph binding, which takes src, the
        // writemask, and the SAE operand directly.
        vgetexpph_512(a, src, k, SAE)
    }
}
9298
/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_sae!(SAE);
    // Zeromask semantics: pass an all-zero src so masked-off lanes become 0.
    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
}
9314
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
/// calculates `floor(log2(x))` for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
    // All-ones mask means the (zero) src operand is never selected.
    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
9328
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
/// for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Delegate to the rounding variant with no SAE override.
    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
9343
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
/// lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zeromask semantics: pass an all-zero src so a cleared mask bit 0 yields 0.
    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
9358
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    // Validate SAE at compile time, then delegate to the masked form with an
    // all-ones mask so the (zero) src operand is never selected.
    static_assert_sae!(SAE);
    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
9375
/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_sae!(SAE);
        // Forward to the raw vgetexpsh binding; src supplies the lane-0
        // fallback when mask bit 0 is clear.
        vgetexpsh(a, b, src, k, SAE)
    }
}
9399
9400/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9401/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9402/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9403/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9404/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9405///
9406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9407#[inline]
9408#[target_feature(enable = "avx512fp16")]
9409#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9410#[rustc_legacy_const_generics(3)]
9411#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9412pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9413    static_assert_sae!(SAE);
9414    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9415}
9416
9417/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9418/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9419/// on the interval range defined by norm and the sign depends on sign and the source sign.
9420///
9421/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9422///
9423///     _MM_MANT_NORM_1_2     // interval [1, 2)
9424///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9425///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9426///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9427///
9428/// The sign is determined by sc which can take the following values:
9429///
9430///     _MM_MANT_SIGN_src     // sign = sign(src)
9431///     _MM_MANT_SIGN_zero    // sign = 0
9432///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9433///
9434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
9435#[inline]
9436#[target_feature(enable = "avx512fp16,avx512vl")]
9437#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9438#[rustc_legacy_const_generics(1, 2)]
9439#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9440pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9441    a: __m128h,
9442) -> __m128h {
9443    static_assert_uimm_bits!(NORM, 4);
9444    static_assert_uimm_bits!(SIGN, 2);
9445    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
9446}
9447
/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        // SAFETY: both immediates are range-checked at compile time above.
        // The hardware imm8 operand packs the sign control into bits 3:2 and
        // the normalization interval into bits 1:0.
        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
    }
}
9486
9487/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9488/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9489/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9490/// by norm and the sign depends on sign and the source sign.
9491///
9492/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9493///
9494///     _MM_MANT_NORM_1_2     // interval [1, 2)
9495///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9496///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9497///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9498///
9499/// The sign is determined by sc which can take the following values:
9500///
9501///     _MM_MANT_SIGN_src     // sign = sign(src)
9502///     _MM_MANT_SIGN_zero    // sign = 0
9503///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9504///
9505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
9506#[inline]
9507#[target_feature(enable = "avx512fp16,avx512vl")]
9508#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9509#[rustc_legacy_const_generics(2, 3)]
9510#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9511pub fn _mm_maskz_getmant_ph<
9512    const NORM: _MM_MANTISSA_NORM_ENUM,
9513    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9514>(
9515    k: __mmask8,
9516    a: __m128h,
9517) -> __m128h {
9518    static_assert_uimm_bits!(NORM, 4);
9519    static_assert_uimm_bits!(SIGN, 2);
9520    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
9521}
9522
9523/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9524/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9525/// on the interval range defined by norm and the sign depends on sign and the source sign.
9526///
9527/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9528///
9529///     _MM_MANT_NORM_1_2     // interval [1, 2)
9530///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9531///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9532///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9533///
9534/// The sign is determined by sc which can take the following values:
9535///
9536///     _MM_MANT_SIGN_src     // sign = sign(src)
9537///     _MM_MANT_SIGN_zero    // sign = 0
9538///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9539///
9540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
9541#[inline]
9542#[target_feature(enable = "avx512fp16,avx512vl")]
9543#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9544#[rustc_legacy_const_generics(1, 2)]
9545#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9546pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9547    a: __m256h,
9548) -> __m256h {
9549    static_assert_uimm_bits!(NORM, 4);
9550    static_assert_uimm_bits!(SIGN, 2);
9551    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
9552}
9553
/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m256h,
    k: __mmask16,
    a: __m256h,
) -> __m256h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        // SAFETY: both immediates are range-checked at compile time above.
        // The hardware imm8 operand packs the sign control into bits 3:2 and
        // the normalization interval into bits 1:0.
        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
    }
}
9592
9593/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9594/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9595/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9596/// by norm and the sign depends on sign and the source sign.
9597///
9598/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9599///
9600///     _MM_MANT_NORM_1_2     // interval [1, 2)
9601///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9602///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9603///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9604///
9605/// The sign is determined by sc which can take the following values:
9606///
9607///     _MM_MANT_SIGN_src     // sign = sign(src)
9608///     _MM_MANT_SIGN_zero    // sign = 0
9609///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9610///
9611/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
9612#[inline]
9613#[target_feature(enable = "avx512fp16,avx512vl")]
9614#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9615#[rustc_legacy_const_generics(2, 3)]
9616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9617pub fn _mm256_maskz_getmant_ph<
9618    const NORM: _MM_MANTISSA_NORM_ENUM,
9619    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9620>(
9621    k: __mmask16,
9622    a: __m256h,
9623) -> __m256h {
9624    static_assert_uimm_bits!(NORM, 4);
9625    static_assert_uimm_bits!(SIGN, 2);
9626    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
9627}
9628
9629/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9630/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9631/// on the interval range defined by norm and the sign depends on sign and the source sign.
9632///
9633/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9634///
9635///     _MM_MANT_NORM_1_2     // interval [1, 2)
9636///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9637///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9638///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9639///
9640/// The sign is determined by sc which can take the following values:
9641///
9642///     _MM_MANT_SIGN_src     // sign = sign(src)
9643///     _MM_MANT_SIGN_zero    // sign = 0
9644///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9645///
9646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
9647#[inline]
9648#[target_feature(enable = "avx512fp16")]
9649#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9650#[rustc_legacy_const_generics(1, 2)]
9651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9652pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9653    a: __m512h,
9654) -> __m512h {
9655    static_assert_uimm_bits!(NORM, 4);
9656    static_assert_uimm_bits!(SIGN, 2);
9657    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
9658}
9659
9660/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9661/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9662/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9663/// by norm and the sign depends on sign and the source sign.
9664///
9665/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9666///
9667///     _MM_MANT_NORM_1_2     // interval [1, 2)
9668///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9669///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9670///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9671///
9672/// The sign is determined by sc which can take the following values:
9673///
9674///     _MM_MANT_SIGN_src     // sign = sign(src)
9675///     _MM_MANT_SIGN_zero    // sign = 0
9676///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9677///
9678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
9679#[inline]
9680#[target_feature(enable = "avx512fp16")]
9681#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9682#[rustc_legacy_const_generics(3, 4)]
9683#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9684pub fn _mm512_mask_getmant_ph<
9685    const NORM: _MM_MANTISSA_NORM_ENUM,
9686    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9687>(
9688    src: __m512h,
9689    k: __mmask32,
9690    a: __m512h,
9691) -> __m512h {
9692    static_assert_uimm_bits!(NORM, 4);
9693    static_assert_uimm_bits!(SIGN, 2);
9694    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9695}
9696
9697/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9698/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9699/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9700/// by norm and the sign depends on sign and the source sign.
9701///
9702/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9703///
9704///     _MM_MANT_NORM_1_2     // interval [1, 2)
9705///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9706///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9707///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9708///
9709/// The sign is determined by sc which can take the following values:
9710///
9711///     _MM_MANT_SIGN_src     // sign = sign(src)
9712///     _MM_MANT_SIGN_zero    // sign = 0
9713///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9714///
9715/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
9716#[inline]
9717#[target_feature(enable = "avx512fp16")]
9718#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9719#[rustc_legacy_const_generics(2, 3)]
9720#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9721pub fn _mm512_maskz_getmant_ph<
9722    const NORM: _MM_MANTISSA_NORM_ENUM,
9723    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9724>(
9725    k: __mmask32,
9726    a: __m512h,
9727) -> __m512h {
9728    static_assert_uimm_bits!(NORM, 4);
9729    static_assert_uimm_bits!(SIGN, 2);
9730    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
9731}
9732
9733/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9734/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9735/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9736/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9737///
9738/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9739///
9740///     _MM_MANT_NORM_1_2     // interval [1, 2)
9741///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9742///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9743///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9744///
9745/// The sign is determined by sc which can take the following values:
9746///
9747///     _MM_MANT_SIGN_src     // sign = sign(src)
9748///     _MM_MANT_SIGN_zero    // sign = 0
9749///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9750///
9751/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9752///
9753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
9754#[inline]
9755#[target_feature(enable = "avx512fp16")]
9756#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9757#[rustc_legacy_const_generics(1, 2, 3)]
9758#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9759pub fn _mm512_getmant_round_ph<
9760    const NORM: _MM_MANTISSA_NORM_ENUM,
9761    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9762    const SAE: i32,
9763>(
9764    a: __m512h,
9765) -> __m512h {
9766    static_assert_uimm_bits!(NORM, 4);
9767    static_assert_uimm_bits!(SIGN, 2);
9768    static_assert_sae!(SAE);
9769    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9770}
9771
/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4, 5)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_getmant_round_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        static_assert_sae!(SAE);
        // SAFETY: all immediates are validated at compile time above. The
        // hardware imm8 operand packs the sign control into bits 3:2 and the
        // normalization interval into bits 1:0; SAE is passed through as the
        // separate rounding/suppress-all-exceptions operand.
        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
    }
}
9815
9816/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9817/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9818/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9819/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9820/// in the sae parameter
9821///
9822/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9823///
9824///     _MM_MANT_NORM_1_2     // interval [1, 2)
9825///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9826///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9827///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9828///
9829/// The sign is determined by sc which can take the following values:
9830///
9831///     _MM_MANT_SIGN_src     // sign = sign(src)
9832///     _MM_MANT_SIGN_zero    // sign = 0
9833///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9834///
9835/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9836///
9837/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
9838#[inline]
9839#[target_feature(enable = "avx512fp16")]
9840#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9841#[rustc_legacy_const_generics(2, 3, 4)]
9842#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9843pub fn _mm512_maskz_getmant_round_ph<
9844    const NORM: _MM_MANTISSA_NORM_ENUM,
9845    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9846    const SAE: i32,
9847>(
9848    k: __mmask32,
9849    a: __m512h,
9850) -> __m512h {
9851    static_assert_uimm_bits!(NORM, 4);
9852    static_assert_uimm_bits!(SIGN, 2);
9853    static_assert_sae!(SAE);
9854    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
9855}
9856
9857/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9858/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9859/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9860/// on the interval range defined by norm and the sign depends on sign and the source sign.
9861///
9862/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9863///
9864///     _MM_MANT_NORM_1_2     // interval [1, 2)
9865///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9866///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9867///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9868///
9869/// The sign is determined by sc which can take the following values:
9870///
9871///     _MM_MANT_SIGN_src     // sign = sign(src)
9872///     _MM_MANT_SIGN_zero    // sign = 0
9873///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9874///
9875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
9876#[inline]
9877#[target_feature(enable = "avx512fp16")]
9878#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9879#[rustc_legacy_const_generics(2, 3)]
9880#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9881pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9882    a: __m128h,
9883    b: __m128h,
9884) -> __m128h {
9885    static_assert_uimm_bits!(NORM, 4);
9886    static_assert_uimm_bits!(SIGN, 2);
9887    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9888}
9889
9890/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9891/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9892/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9893/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9894/// the source sign.
9895///
9896/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9897///
9898///     _MM_MANT_NORM_1_2     // interval [1, 2)
9899///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9900///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9901///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9902///
9903/// The sign is determined by sc which can take the following values:
9904///
9905///     _MM_MANT_SIGN_src     // sign = sign(src)
9906///     _MM_MANT_SIGN_zero    // sign = 0
9907///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9908///
9909/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9910#[inline]
9911#[target_feature(enable = "avx512fp16")]
9912#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9913#[rustc_legacy_const_generics(4, 5)]
9914#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9915pub fn _mm_mask_getmant_sh<
9916    const NORM: _MM_MANTISSA_NORM_ENUM,
9917    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9918>(
9919    src: __m128h,
9920    k: __mmask8,
9921    a: __m128h,
9922    b: __m128h,
9923) -> __m128h {
9924    static_assert_uimm_bits!(NORM, 4);
9925    static_assert_uimm_bits!(SIGN, 2);
9926    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9927}
9928
9929/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9930/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9931/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9932/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9933/// the source sign.
9934///
9935/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9936///
9937///     _MM_MANT_NORM_1_2     // interval [1, 2)
9938///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9939///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9940///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9941///
9942/// The sign is determined by sc which can take the following values:
9943///
9944///     _MM_MANT_SIGN_src     // sign = sign(src)
9945///     _MM_MANT_SIGN_zero    // sign = 0
9946///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9947///
9948/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9949#[inline]
9950#[target_feature(enable = "avx512fp16")]
9951#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9952#[rustc_legacy_const_generics(3, 4)]
9953#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9954pub fn _mm_maskz_getmant_sh<
9955    const NORM: _MM_MANTISSA_NORM_ENUM,
9956    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9957>(
9958    k: __mmask8,
9959    a: __m128h,
9960    b: __m128h,
9961) -> __m128h {
9962    static_assert_uimm_bits!(NORM, 4);
9963    static_assert_uimm_bits!(SIGN, 2);
9964    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), k, a, b)
9965}
9966
9967/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9968/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9969/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9970/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9971/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9972///
9973/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9974///
9975///     _MM_MANT_NORM_1_2     // interval [1, 2)
9976///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9977///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9978///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9979///
9980/// The sign is determined by sc which can take the following values:
9981///
9982///     _MM_MANT_SIGN_src     // sign = sign(src)
9983///     _MM_MANT_SIGN_zero    // sign = 0
9984///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9985///
9986/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9987///
9988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9989#[inline]
9990#[target_feature(enable = "avx512fp16")]
9991#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9992#[rustc_legacy_const_generics(2, 3, 4)]
9993#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9994pub fn _mm_getmant_round_sh<
9995    const NORM: _MM_MANTISSA_NORM_ENUM,
9996    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9997    const SAE: i32,
9998>(
9999    a: __m128h,
10000    b: __m128h,
10001) -> __m128h {
10002    static_assert_uimm_bits!(NORM, 4);
10003    static_assert_uimm_bits!(SIGN, 2);
10004    static_assert_sae!(SAE);
10005    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10006}
10007
10008/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
10009/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10010/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
10011/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
10012/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10013///
10014/// The mantissa is normalized to the interval specified by interv, which can take the following values:
10015///
10016///     _MM_MANT_NORM_1_2     // interval [1, 2)
10017///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
10018///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
10019///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
10020///
10021/// The sign is determined by sc which can take the following values:
10022///
10023///     _MM_MANT_SIGN_src     // sign = sign(src)
10024///     _MM_MANT_SIGN_zero    // sign = 0
10025///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
10026///
10027/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10028///
10029/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5, 6)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_getmant_round_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // SAFETY: the raw `vgetmantsh` intrinsic requires the avx512fp16 feature,
    // which is guaranteed by this function's #[target_feature] attribute.
    unsafe {
        static_assert_uimm_bits!(NORM, 4);
        static_assert_uimm_bits!(SIGN, 2);
        static_assert_sae!(SAE);
        // Pack both controls into a single immediate: the normalization
        // interval goes in bits 1:0 and the sign control in bits 3:2,
        // matching the VGETMANTSH imm8 layout.
        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
    }
}
10052
10053/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
10054/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10055/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
10056/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
10057/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10058///
10059/// The mantissa is normalized to the interval specified by interv, which can take the following values:
10060///
10061///     _MM_MANT_NORM_1_2     // interval [1, 2)
10062///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
10063///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
10064///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
10065///
10066/// The sign is determined by sc which can take the following values:
10067///
10068///     _MM_MANT_SIGN_src     // sign = sign(src)
10069///     _MM_MANT_SIGN_zero    // sign = 0
10070///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
10071///
10072/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10073///
10074/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4, 5)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_getmant_round_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    // Zero-masking: reuse the writemask variant with an all-zero `src`,
    // so a lane not selected by `k` comes out as 0.
    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
10094
10095/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10096/// specified by imm8, and store the results in dst.
10097///
10098/// Rounding is done according to the imm8 parameter, which can be one of:
10099///
10100/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10101/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10102/// * [`_MM_FROUND_TO_POS_INF`] : round up
10103/// * [`_MM_FROUND_TO_ZERO`] : truncate
10104/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10105///
10106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
    // IMM8 is the 8-bit rounding-control immediate; validated at compile time.
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: all-ones mask (0xff = all 8 f16 lanes), so the
    // undefined `src` vector is never observed.
    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
}
10116
10117/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10118/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10119/// the corresponding mask bit is not set).
10120///
10121/// Rounding is done according to the imm8 parameter, which can be one of:
10122///
10123/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10124/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10125/// * [`_MM_FROUND_TO_POS_INF`] : round up
10126/// * [`_MM_FROUND_TO_ZERO`] : truncate
10127/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10128///
10129/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    // SAFETY: the raw `vrndscaleph_128` intrinsic requires avx512fp16+avx512vl,
    // guaranteed by this function's #[target_feature] attribute.
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        vrndscaleph_128(a, IMM8, src, k)
    }
}
10141
10142/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10143/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10144/// mask bit is not set).
10145///
10146/// Rounding is done according to the imm8 parameter, which can be one of:
10147///
10148/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10149/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10150/// * [`_MM_FROUND_TO_POS_INF`] : round up
10151/// * [`_MM_FROUND_TO_ZERO`] : truncate
10152/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10153///
10154/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: delegate with a zeroed `src` so unselected lanes become 0.
    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
}
10164
10165/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10166/// specified by imm8, and store the results in dst.
10167///
10168/// Rounding is done according to the imm8 parameter, which can be one of:
10169///
10170/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10171/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10172/// * [`_MM_FROUND_TO_POS_INF`] : round up
10173/// * [`_MM_FROUND_TO_ZERO`] : truncate
10174/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10175///
10176/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: 0xffff selects all 16 f16 lanes of a 256-bit vector,
    // so the undefined `src` is never observed.
    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
}
10186
10187/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10188/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10189/// the corresponding mask bit is not set).
10190///
10191/// Rounding is done according to the imm8 parameter, which can be one of:
10192///
10193/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10194/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10195/// * [`_MM_FROUND_TO_POS_INF`] : round up
10196/// * [`_MM_FROUND_TO_ZERO`] : truncate
10197/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10198///
10199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m256h,
) -> __m256h {
    // SAFETY: the raw `vrndscaleph_256` intrinsic requires avx512fp16+avx512vl,
    // guaranteed by this function's #[target_feature] attribute.
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        vrndscaleph_256(a, IMM8, src, k)
    }
}
10215
10216/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10217/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10218/// mask bit is not set).
10219///
10220/// Rounding is done according to the imm8 parameter, which can be one of:
10221///
10222/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10223/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10224/// * [`_MM_FROUND_TO_POS_INF`] : round up
10225/// * [`_MM_FROUND_TO_ZERO`] : truncate
10226/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10227///
10228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: delegate with a zeroed `src` so unselected lanes become 0.
    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
}
10238
10239/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10240/// specified by imm8, and store the results in dst.
10241///
10242/// Rounding is done according to the imm8 parameter, which can be one of:
10243///
10244/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10245/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10246/// * [`_MM_FROUND_TO_POS_INF`] : round up
10247/// * [`_MM_FROUND_TO_ZERO`] : truncate
10248/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10249///
10250/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked form: 0xffffffff selects all 32 f16 lanes of a 512-bit vector,
    // so the undefined `src` is never observed.
    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
}
10260
10261/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10262/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10263/// the corresponding mask bit is not set).
10264///
10265/// Rounding is done according to the imm8 parameter, which can be one of:
10266///
10267/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10268/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10269/// * [`_MM_FROUND_TO_POS_INF`] : round up
10270/// * [`_MM_FROUND_TO_ZERO`] : truncate
10271/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10272///
10273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // The 512-bit form supports SAE, so delegate to the `_round_` variant
    // with _MM_FROUND_CUR_DIRECTION (use the current MXCSR rounding mode).
    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
}
10287
10288/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10289/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10290/// mask bit is not set).
10291///
10292/// Rounding is done according to the imm8 parameter, which can be one of:
10293///
10294/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10295/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10296/// * [`_MM_FROUND_TO_POS_INF`] : round up
10297/// * [`_MM_FROUND_TO_ZERO`] : truncate
10298/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10299///
10300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: delegate with a zeroed `src` so unselected lanes become 0.
    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
}
10310
10311/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10312/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10313/// in the sae parameter
10314///
10315/// Rounding is done according to the imm8 parameter, which can be one of:
10316///
10317/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10318/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10319/// * [`_MM_FROUND_TO_POS_INF`] : round up
10320/// * [`_MM_FROUND_TO_ZERO`] : truncate
10321/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10322///
10323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Unmasked form: all-ones 32-lane mask, so the undefined `src` is
    // never observed.
    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}
10334
10335/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10336/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10337/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10338/// in the sae parameter
10339///
10340/// Rounding is done according to the imm8 parameter, which can be one of:
10341///
10342/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10343/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10344/// * [`_MM_FROUND_TO_POS_INF`] : round up
10345/// * [`_MM_FROUND_TO_ZERO`] : truncate
10346/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10347///
10348/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    // SAFETY: the raw `vrndscaleph_512` intrinsic requires avx512fp16,
    // guaranteed by this function's #[target_feature] attribute.
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        vrndscaleph_512(a, IMM8, src, k, SAE)
    }
}
10365
10366/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10367/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10368/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10369///
10370/// Rounding is done according to the imm8 parameter, which can be one of:
10371///
10372/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10373/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10374/// * [`_MM_FROUND_TO_POS_INF`] : round up
10375/// * [`_MM_FROUND_TO_ZERO`] : truncate
10376/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10377///
10378/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking: delegate with a zeroed `src` so unselected lanes become 0.
    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
}
10392
10393/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10394/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10395/// from a to the upper elements of dst.
10396///
10397/// Rounding is done according to the imm8 parameter, which can be one of:
10398///
10399/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10400/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10401/// * [`_MM_FROUND_TO_POS_INF`] : round up
10402/// * [`_MM_FROUND_TO_ZERO`] : truncate
10403/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10404///
10405/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Unmasked scalar form: all-ones mask, so the zero `src` vector is
    // never selected (only bit 0 of the mask matters for the scalar lane).
    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10415
10416/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10417/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10418/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10419///
10420/// Rounding is done according to the imm8 parameter, which can be one of:
10421///
10422/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10423/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10424/// * [`_MM_FROUND_TO_POS_INF`] : round up
10425/// * [`_MM_FROUND_TO_ZERO`] : truncate
10426/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10427///
10428/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Delegate to the `_round_` variant with _MM_FROUND_CUR_DIRECTION
    // (use the current MXCSR rounding mode, exceptions not suppressed).
    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10443
10444/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10445/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10446/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10447///
10448/// Rounding is done according to the imm8 parameter, which can be one of:
10449///
10450/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10451/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10452/// * [`_MM_FROUND_TO_POS_INF`] : round up
10453/// * [`_MM_FROUND_TO_ZERO`] : truncate
10454/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10455///
10456/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    // Zero-masking: reuse the writemask variant with an all-zero `src`,
    // so the scalar lane is zeroed when mask bit 0 is not set.
    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
}
10466
10467/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10468/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10469/// from a to the upper elements of dst.
10470///
10471/// Rounding is done according to the imm8 parameter, which can be one of:
10472///
10473/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10474/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10475/// * [`_MM_FROUND_TO_POS_INF`] : round up
10476/// * [`_MM_FROUND_TO_ZERO`] : truncate
10477/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10478///
10479/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10480///
10481/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Unmasked scalar form: all-ones mask, zero `src` never selected.
    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10492
10493/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10494/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10495/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10496///
10497/// Rounding is done according to the imm8 parameter, which can be one of:
10498///
10499/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10500/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10501/// * [`_MM_FROUND_TO_POS_INF`] : round up
10502/// * [`_MM_FROUND_TO_ZERO`] : truncate
10503/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10504///
10505/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10506///
10507/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    // SAFETY: the raw `vrndscalesh` intrinsic requires avx512fp16,
    // guaranteed by this function's #[target_feature] attribute.
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        vrndscalesh(a, b, src, k, IMM8, SAE)
    }
}
10525
10526/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10527/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10528/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10529///
10530/// Rounding is done according to the imm8 parameter, which can be one of:
10531///
10532/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10533/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10534/// * [`_MM_FROUND_TO_POS_INF`] : round up
10535/// * [`_MM_FROUND_TO_ZERO`] : truncate
10536/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10537///
10538/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10539///
10540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking: reuse the writemask variant with an all-zero `src`.
    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
10555
10556/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10557/// the results in dst.
10558///
10559/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
    // Unmasked form: all-ones mask (all 8 f16 lanes), so the undefined
    // `src` is never observed.
    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
}
10567
10568/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10569/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10570///
10571/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // SAFETY: the raw `vscalefph_128` intrinsic requires avx512fp16+avx512vl,
    // guaranteed by this function's #[target_feature] attribute.
    unsafe { vscalefph_128(a, b, src, k) }
}
10579
10580/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10581/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10582///
10583/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Zero-masking: delegate with a zeroed `src` so unselected lanes become 0.
    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
}
10591
10592/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10593/// the results in dst.
10594///
10595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
    // Unmasked form: 0xffff selects all 16 f16 lanes, so the undefined
    // `src` is never observed.
    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
}
10603
10604/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10605/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10606///
10607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // SAFETY: the raw `vscalefph_256` intrinsic requires avx512fp16+avx512vl,
    // guaranteed by this function's #[target_feature] attribute.
    unsafe { vscalefph_256(a, b, src, k) }
}
10615
10616/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10617/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10618///
10619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Zero-masking: delegate with a zeroed `src` so unselected lanes become 0.
    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
}
10627
10628/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10629/// the results in dst.
10630///
10631/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
    // Unmasked form: 0xffffffff selects all 32 f16 lanes, so the undefined
    // `src` is never observed.
    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
}
10639
10640/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10641/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10642///
10643/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // The 512-bit form supports rounding control, so delegate to the
    // `_round_` variant with _MM_FROUND_CUR_DIRECTION (current MXCSR mode).
    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10651
10652/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10653/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10654///
10655/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
10656#[inline]
10657#[target_feature(enable = "avx512fp16")]
10658#[cfg_attr(test, assert_instr(vscalefph))]
10659#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10660pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10661    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
10662}
10663
10664/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10665/// the results in dst.
10666///
10667/// Rounding is done according to the rounding parameter, which can be one of:
10668///
10669/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10670/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10671/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10672/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10673/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10674///
10675/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
10676#[inline]
10677#[target_feature(enable = "avx512fp16")]
10678#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10679#[rustc_legacy_const_generics(2)]
10680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10681pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
10682    static_assert_rounding!(ROUNDING);
10683    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
10684}
10685
10686/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10687/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10688///
10689/// Rounding is done according to the rounding parameter, which can be one of:
10690///
10691/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10692/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10693/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10694/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10695/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10696///
10697/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
10698#[inline]
10699#[target_feature(enable = "avx512fp16")]
10700#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10701#[rustc_legacy_const_generics(4)]
10702#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10703pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
10704    src: __m512h,
10705    k: __mmask32,
10706    a: __m512h,
10707    b: __m512h,
10708) -> __m512h {
10709    unsafe {
10710        static_assert_rounding!(ROUNDING);
10711        vscalefph_512(a, b, src, k, ROUNDING)
10712    }
10713}
10714
10715/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10716/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10717///
10718/// Rounding is done according to the rounding parameter, which can be one of:
10719///
10720/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10721/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10722/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10723/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10724/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10725///
10726/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
10727#[inline]
10728#[target_feature(enable = "avx512fp16")]
10729#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10730#[rustc_legacy_const_generics(3)]
10731#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10732pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
10733    k: __mmask32,
10734    a: __m512h,
10735    b: __m512h,
10736) -> __m512h {
10737    static_assert_rounding!(ROUNDING);
10738    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
10739}
10740
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10753
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}
10766
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
10779
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
10802
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_rounding!(ROUNDING);
        vscalefsh(a, b, src, k, ROUNDING)
    }
}
10832
/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
10859
10860/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10861/// number of bits specified by imm8, and store the results in dst.
10862///
10863/// Rounding is done according to the imm8 parameter, which can be one of:
10864///
10865/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10866/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10867/// * [`_MM_FROUND_TO_POS_INF`] : round up
10868/// * [`_MM_FROUND_TO_ZERO`] : truncate
10869/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10870///
10871/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
10872#[inline]
10873#[target_feature(enable = "avx512fp16,avx512vl")]
10874#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10875#[rustc_legacy_const_generics(1)]
10876#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10877pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
10878    static_assert_uimm_bits!(IMM8, 8);
10879    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
10880}
10881
10882/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10883/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10884/// from src when the corresponding mask bit is not set).
10885///
10886/// Rounding is done according to the imm8 parameter, which can be one of:
10887///
10888/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10889/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10890/// * [`_MM_FROUND_TO_POS_INF`] : round up
10891/// * [`_MM_FROUND_TO_ZERO`] : truncate
10892/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10893///
10894/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
10895#[inline]
10896#[target_feature(enable = "avx512fp16,avx512vl")]
10897#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10898#[rustc_legacy_const_generics(3)]
10899#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10900pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
10901    unsafe {
10902        static_assert_uimm_bits!(IMM8, 8);
10903        vreduceph_128(a, IMM8, src, k)
10904    }
10905}
10906
10907/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10908/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10909/// out when the corresponding mask bit is not set).
10910///
10911/// Rounding is done according to the imm8 parameter, which can be one of:
10912///
10913/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10914/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10915/// * [`_MM_FROUND_TO_POS_INF`] : round up
10916/// * [`_MM_FROUND_TO_ZERO`] : truncate
10917/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10918///
10919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
10920#[inline]
10921#[target_feature(enable = "avx512fp16,avx512vl")]
10922#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10923#[rustc_legacy_const_generics(2)]
10924#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10925pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
10926    static_assert_uimm_bits!(IMM8, 8);
10927    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
10928}
10929
10930/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10931/// number of bits specified by imm8, and store the results in dst.
10932///
10933/// Rounding is done according to the imm8 parameter, which can be one of:
10934///
10935/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10936/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10937/// * [`_MM_FROUND_TO_POS_INF`] : round up
10938/// * [`_MM_FROUND_TO_ZERO`] : truncate
10939/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10940///
10941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
10942#[inline]
10943#[target_feature(enable = "avx512fp16,avx512vl")]
10944#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10945#[rustc_legacy_const_generics(1)]
10946#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10947pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
10948    static_assert_uimm_bits!(IMM8, 8);
10949    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
10950}
10951
10952/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10953/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10954/// from src when the corresponding mask bit is not set).
10955///
10956/// Rounding is done according to the imm8 parameter, which can be one of:
10957///
10958/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10959/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10960/// * [`_MM_FROUND_TO_POS_INF`] : round up
10961/// * [`_MM_FROUND_TO_ZERO`] : truncate
10962/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10963///
10964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
10965#[inline]
10966#[target_feature(enable = "avx512fp16,avx512vl")]
10967#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10968#[rustc_legacy_const_generics(3)]
10969#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10970pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
10971    unsafe {
10972        static_assert_uimm_bits!(IMM8, 8);
10973        vreduceph_256(a, IMM8, src, k)
10974    }
10975}
10976
10977/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10978/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10979/// out when the corresponding mask bit is not set).
10980///
10981/// Rounding is done according to the imm8 parameter, which can be one of:
10982///
10983/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10984/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10985/// * [`_MM_FROUND_TO_POS_INF`] : round up
10986/// * [`_MM_FROUND_TO_ZERO`] : truncate
10987/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10988///
10989/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
10990#[inline]
10991#[target_feature(enable = "avx512fp16,avx512vl")]
10992#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10993#[rustc_legacy_const_generics(2)]
10994#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10995pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10996    static_assert_uimm_bits!(IMM8, 8);
10997    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10998}
10999
11000/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
11001/// number of bits specified by imm8, and store the results in dst.
11002///
11003/// Rounding is done according to the imm8 parameter, which can be one of:
11004///
11005/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11006/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11007/// * [`_MM_FROUND_TO_POS_INF`] : round up
11008/// * [`_MM_FROUND_TO_ZERO`] : truncate
11009/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11010///
11011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
11012#[inline]
11013#[target_feature(enable = "avx512fp16")]
11014#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
11015#[rustc_legacy_const_generics(1)]
11016#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11017pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
11018    static_assert_uimm_bits!(IMM8, 8);
11019    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
11020}
11021
11022/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
11023/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
11024/// from src when the corresponding mask bit is not set).
11025///
11026/// Rounding is done according to the imm8 parameter, which can be one of:
11027///
11028/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11029/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11030/// * [`_MM_FROUND_TO_POS_INF`] : round up
11031/// * [`_MM_FROUND_TO_ZERO`] : truncate
11032/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11033///
11034/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
11035#[inline]
11036#[target_feature(enable = "avx512fp16")]
11037#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
11038#[rustc_legacy_const_generics(3)]
11039#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11040pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
11041    static_assert_uimm_bits!(IMM8, 8);
11042    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
11043}
11044
11045/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
11046/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
11047/// out when the corresponding mask bit is not set).
11048///
11049/// Rounding is done according to the imm8 parameter, which can be one of:
11050///
11051/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11052/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11053/// * [`_MM_FROUND_TO_POS_INF`] : round up
11054/// * [`_MM_FROUND_TO_ZERO`] : truncate
11055/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11056///
11057/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
11058#[inline]
11059#[target_feature(enable = "avx512fp16")]
11060#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
11061#[rustc_legacy_const_generics(2)]
11062#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11063pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
11064    static_assert_uimm_bits!(IMM8, 8);
11065    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
11066}
11067
11068/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
11069/// number of bits specified by imm8, and store the results in dst.
11070///
11071/// Rounding is done according to the imm8 parameter, which can be one of:
11072///
11073/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11074/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11075/// * [`_MM_FROUND_TO_POS_INF`] : round up
11076/// * [`_MM_FROUND_TO_ZERO`] : truncate
11077/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11078///
11079/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11080///
11081/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
11082#[inline]
11083#[target_feature(enable = "avx512fp16")]
11084#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
11085#[rustc_legacy_const_generics(1, 2)]
11086#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11087pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
11088    static_assert_uimm_bits!(IMM8, 8);
11089    static_assert_sae!(SAE);
11090    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
11091}
11092
11093/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
11094/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
11095/// from src when the corresponding mask bit is not set).
11096///
11097/// Rounding is done according to the imm8 parameter, which can be one of:
11098///
11099/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11100/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11101/// * [`_MM_FROUND_TO_POS_INF`] : round up
11102/// * [`_MM_FROUND_TO_ZERO`] : truncate
11103/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11104///
11105/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11106///
11107/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
11108#[inline]
11109#[target_feature(enable = "avx512fp16")]
11110#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
11111#[rustc_legacy_const_generics(3, 4)]
11112#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11113pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
11114    src: __m512h,
11115    k: __mmask32,
11116    a: __m512h,
11117) -> __m512h {
11118    unsafe {
11119        static_assert_uimm_bits!(IMM8, 8);
11120        static_assert_sae!(SAE);
11121        vreduceph_512(a, IMM8, src, k, SAE)
11122    }
11123}
11124
11125/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
11126/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
11127/// out when the corresponding mask bit is not set).
11128///
11129/// Rounding is done according to the imm8 parameter, which can be one of:
11130///
11131/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11132/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11133/// * [`_MM_FROUND_TO_POS_INF`] : round up
11134/// * [`_MM_FROUND_TO_ZERO`] : truncate
11135/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11136///
11137/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11138///
11139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
11140#[inline]
11141#[target_feature(enable = "avx512fp16")]
11142#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
11143#[rustc_legacy_const_generics(2, 3)]
11144#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11145pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
11146    k: __mmask32,
11147    a: __m512h,
11148) -> __m512h {
11149    static_assert_uimm_bits!(IMM8, 8);
11150    static_assert_sae!(SAE);
11151    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
11152}
11153
11154/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
11155/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
11156/// upper 7 packed elements from a to the upper elements of dst.
11157///
11158/// Rounding is done according to the imm8 parameter, which can be one of:
11159///
11160/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11161/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11162/// * [`_MM_FROUND_TO_POS_INF`] : round up
11163/// * [`_MM_FROUND_TO_ZERO`] : truncate
11164/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11165///
11166/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
11167#[inline]
11168#[target_feature(enable = "avx512fp16")]
11169#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
11170#[rustc_legacy_const_generics(2)]
11171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11172pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
11173    static_assert_uimm_bits!(IMM8, 8);
11174    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
11175}
11176
11177/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
11178/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
11179/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
11180/// a to the upper elements of dst.
11181///
11182/// Rounding is done according to the imm8 parameter, which can be one of:
11183///
11184/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11185/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11186/// * [`_MM_FROUND_TO_POS_INF`] : round up
11187/// * [`_MM_FROUND_TO_ZERO`] : truncate
11188/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11189///
11190/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
11191#[inline]
11192#[target_feature(enable = "avx512fp16")]
11193#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
11194#[rustc_legacy_const_generics(4)]
11195#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11196pub fn _mm_mask_reduce_sh<const IMM8: i32>(
11197    src: __m128h,
11198    k: __mmask8,
11199    a: __m128h,
11200    b: __m128h,
11201) -> __m128h {
11202    static_assert_uimm_bits!(IMM8, 8);
11203    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
11204}
11205
11206/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
11207/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
11208/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
11209/// to the upper elements of dst.
11210///
11211/// Rounding is done according to the imm8 parameter, which can be one of:
11212///
11213/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11214/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11215/// * [`_MM_FROUND_TO_POS_INF`] : round up
11216/// * [`_MM_FROUND_TO_ZERO`] : truncate
11217/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11218///
11219/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
11220#[inline]
11221#[target_feature(enable = "avx512fp16")]
11222#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
11223#[rustc_legacy_const_generics(3)]
11224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11225pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
11226    static_assert_uimm_bits!(IMM8, 8);
11227    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
11228}
11229
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Unmasked form: delegate to the masked variant with an all-ones mask, so
    // the zero-vector `src` argument is never actually selected.
    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
11255
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        static_assert_sae!(SAE);
        // Directly lowers to the LLVM vreducesh intrinsic; argument order is
        // (a, b, src, mask, imm8, sae) — note `src`/`k` come after the operands.
        vreducesh(a, b, src, k, IMM8, SAE)
    }
}
11289
/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the imm8 parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
/// * [`_MM_FROUND_TO_NEG_INF`] : round down
/// * [`_MM_FROUND_TO_POS_INF`] : round up
/// * [`_MM_FROUND_TO_ZERO`] : truncate
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(IMM8, 8);
    static_assert_sae!(SAE);
    // Zero-masking = merge-masking with an all-zero `src` vector.
    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
}
11320
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_reduce_add_ph(a: __m128h) -> f16 {
    unsafe {
        // Pairwise tree reduction: add the upper half onto the lower half
        // (lanes 4..8 onto 0..4), then quarters (2..4 onto 0..2), and finally
        // sum the last two scalar lanes. Note this evaluation order is not the
        // same as a left-to-right fold for non-associative float addition.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_add_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_add_ph(a, b);
        simd_extract!(a, 0, f16) + simd_extract!(a, 1, f16)
    }
}
11338
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
    unsafe {
        // Split the 256-bit vector into its low (p) and high (q) 128-bit
        // halves, add them lane-wise, and finish with the 128-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_add_ph(_mm_add_ph(p, q))
    }
}
11354
/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
/// sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
    unsafe {
        // Split the 512-bit vector into its low (p) and high (q) 256-bit
        // halves, add them lane-wise, and recurse into the 256-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
    }
}
11376
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
    unsafe {
        // Same pairwise tree as _mm_reduce_add_ph, with multiplication:
        // fold upper half onto lower half, then quarters, then the final pair.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_mul_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_mul_ph(a, b);
        simd_extract!(a, 0, f16) * simd_extract!(a, 1, f16)
    }
}
11394
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
    unsafe {
        // Multiply low (p) and high (q) 128-bit halves lane-wise, then finish
        // with the 128-bit product reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
    }
}
11410
/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
/// the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
    unsafe {
        // Multiply low (p) and high (q) 256-bit halves lane-wise, then recurse
        // into the 256-bit product reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
    }
}
11432
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
    unsafe {
        // Pairwise tree reduction via vminph: fold upper half onto lower half,
        // then quarters, then the final pair. The last step uses the scalar
        // _mm_min_sh instead of extract-and-compare, keeping the instruction's
        // min semantics for the last two lanes.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_min_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_min_ph(a, b);
        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
        simd_extract!(_mm_min_sh(a, b), 0)
    }
}
11450
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
    unsafe {
        // Take the lane-wise minimum of the low (p) and high (q) 128-bit
        // halves, then finish with the 128-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_min_ph(_mm_min_ph(p, q))
    }
}
11465
/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
/// minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
    unsafe {
        // Take the lane-wise minimum of the low (p) and high (q) 256-bit
        // halves, then recurse into the 256-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
    }
}
11486
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
    unsafe {
        // Mirror of _mm_reduce_min_ph with vmaxph: pairwise tree reduction,
        // finishing with the scalar _mm_max_sh on the last two lanes.
        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
        let a = _mm_max_ph(a, b);
        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
        let a = _mm_max_ph(a, b);
        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
        simd_extract!(_mm_max_sh(a, b), 0)
    }
}
11504
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
    unsafe {
        // Take the lane-wise maximum of the low (p) and high (q) 128-bit
        // halves, then finish with the 128-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
        _mm_reduce_max_ph(_mm_max_ph(p, q))
    }
}
11519
/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
/// maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
    unsafe {
        // Take the lane-wise maximum of the low (p) and high (q) 256-bit
        // halves, then recurse into the 256-bit reduction.
        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
        let q = simd_shuffle!(
            a,
            a,
            [
                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
            ]
        );
        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
    }
}
11540
// Emits a `vfpclassph` via inline asm and returns the resulting k-register
// mask. Two arms: unmasked (mask_type, reg, a) and masked
// (mask_type, mask, reg, a), the latter applying a zeroing writemask.
// `IMM8` is captured from the calling intrinsic's const generic scope.
// `options(pure, nomem, nostack)`: the instruction only reads its register
// inputs, so the asm block can be CSE'd/removed like a pure function.
macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr) => {{
        let dst: $mask_type;
        asm!(
            "vfpclassph {k}, {src}, {imm8}",
            k = lateout(kreg) dst,
            src = in($reg) $a,
            imm8 = const IMM8,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
        let dst: $mask_type;
        asm!(
            // `{k} {{ {mask} }}` renders as `k {k1}` — destination predicated
            // by the supplied mask register.
            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            src = in($reg) $a,
            imm8 = const IMM8,
            options(pure, nomem, nostack)
        );
        dst
    }};
}
11566
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Implemented via inline asm (see fpclass_asm!) — unmasked 128-bit form.
        fpclass_asm!(__mmask8, xmm_reg, a)
    }
}
11592
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Masked 128-bit form: k1 is applied as a zeroing writemask in the asm.
        fpclass_asm!(__mmask8, k1, xmm_reg, a)
    }
}
11619
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Unmasked 256-bit form (ymm source, 16-bit result mask).
        fpclass_asm!(__mmask16, ymm_reg, a)
    }
}
11645
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Masked 256-bit form: k1 is applied as a zeroing writemask in the asm.
        fpclass_asm!(__mmask16, k1, ymm_reg, a)
    }
}
11672
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Unmasked 512-bit form (zmm source, 32-bit result mask).
        fpclass_asm!(__mmask32, zmm_reg, a)
    }
}
11698
/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Masked 512-bit form: k1 is applied as a zeroing writemask in the asm.
        fpclass_asm!(__mmask32, k1, zmm_reg, a)
    }
}
11725
/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
/// by imm8, and store the result in mask vector k.
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
    // Unmasked form = masked form with an all-ones mask.
    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
}
11748
/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// imm can be a combination of:
///
///     0x01 // QNaN
///     0x02 // Positive Zero
///     0x04 // Negative Zero
///     0x08 // Positive Infinity
///     0x10 // Negative Infinity
///     0x20 // Denormal
///     0x40 // Negative
///     0x80 // SNaN
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM8, 8);
        // Scalar variant has an LLVM intrinsic (unlike the packed forms,
        // which go through the fpclass_asm! workaround above).
        vfpclasssh(a, IMM8, k1)
    }
}
11775
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    // Mask bit set selects from `b`, clear selects from `a` — hence (k, b, a).
    unsafe { simd_select_bitmask(k, b, a) }
}
11787
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    // Mask bit set selects from `b`, clear selects from `a` — hence (k, b, a).
    unsafe { simd_select_bitmask(k, b, a) }
}
11799
/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    // Mask bit set selects from `b`, clear selects from `a` — hence (k, b, a).
    unsafe { simd_select_bitmask(k, b, a) }
}
11811
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
    // f16 lanes have the same layout as i16 lanes, so reuse the epi16
    // permute through bit-preserving casts.
    _mm_castsi128_ph(_mm_permutex2var_epi16(
        _mm_castph_si128(a),
        idx,
        _mm_castph_si128(b),
    ))
}
11826
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
    // f16 lanes have the same layout as i16 lanes, so reuse the epi16
    // permute through bit-preserving casts.
    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
        _mm256_castph_si256(a),
        idx,
        _mm256_castph_si256(b),
    ))
}
11841
/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
/// and index in idx, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
    // f16 lanes have the same layout as i16 lanes, so reuse the epi16
    // permute through bit-preserving casts.
    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
        _mm512_castph_si512(a),
        idx,
        _mm512_castph_si512(b),
    ))
}
11856
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
    // Delegates to the epi16 permute via bit-preserving casts.
    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
}
11867
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
    // Delegates to the epi16 permute via bit-preserving casts.
    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
}
11878
/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
    // Delegates to the epi16 permute via bit-preserving casts.
    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
}
11889
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
    // Rounds per the current MXCSR rounding mode (_MM_FROUND_CUR_DIRECTION).
    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
}
11901
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Convert all lanes, then merge with `src` per the writemask.
    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
}
11914
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking = merge-masking with a zero source vector.
    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
}
11926
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
    // Rounds per the current MXCSR rounding mode (_MM_FROUND_CUR_DIRECTION).
    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
}
11938
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
    // Convert all lanes, then merge with `src` per the writemask.
    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
}
11951
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
}
11963
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
    // _MM_FROUND_CUR_DIRECTION: round using the current MXCSR rounding mode.
    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
}
11975
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
    // Per-lane select: converted element where the mask bit is set, the
    // corresponding `src` element where it is clear.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
}
11988
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
}
12000
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
    }
}
12024
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Per-lane select between the converted result and `src`.
        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
    }
}
12053
/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
    // Reject invalid rounding-mode immediates at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
}
12075
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
    // _MM_FROUND_CUR_DIRECTION: round using the current MXCSR rounding mode.
    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
}
12087
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Per-lane select: converted element where the mask bit is set, the
    // corresponding `src` element where it is clear.
    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
}
12100
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
}
12112
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
    // _MM_FROUND_CUR_DIRECTION: round using the current MXCSR rounding mode.
    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
}
12124
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
    // Per-lane select: converted element where the mask bit is set, the
    // corresponding `src` element where it is clear.
    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
}
12137
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
}
12149
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
    // _MM_FROUND_CUR_DIRECTION: round using the current MXCSR rounding mode.
    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
}
12161
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
    // Per-lane select: converted element where the mask bit is set, the
    // corresponding `src` element where it is clear.
    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
}
12174
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
}
12186
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
    }
}
12210
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Per-lane select between the converted result and `src`.
        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
    }
}
12239
/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
    // Reject invalid rounding-mode immediates at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is write-masking with an all-zero source vector.
    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
}
12261
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
    // All-ones write-mask over a zeroed `src` yields the plain conversion;
    // the lanes not produced by the conversion (upper 64 bits) remain zero.
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
}
12273
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).  The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Unlike the wider variants, this backend intrinsic takes `src` and `k`
    // directly and applies the write-mask itself.
    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
}
12286
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}
12299
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
    // Narrowing conversion: 8 x i32 -> 8 x f16, so the result fits a __m128h.
    // _MM_FROUND_CUR_DIRECTION: round using the current MXCSR rounding mode.
    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
}
12311
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    // Per-lane select: converted element where the mask bit is set, the
    // corresponding `src` element where it is clear.
    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
}
12324
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.  Note the
    // 128-bit zero: the narrowing conversion's destination is a __m128h.
    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}
12336
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
    // Narrowing conversion: 16 x i32 -> 16 x f16, so the result fits a __m256h.
    // _MM_FROUND_CUR_DIRECTION: round using the current MXCSR rounding mode.
    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
}
12348
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
    // Per-lane select: converted element where the mask bit is set, the
    // corresponding `src` element where it is clear.
    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
}
12361
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
    // NOTE(review): the zero vector is built from `f16x16::ZERO` rather than
    // `_mm256_setzero_ph()` — presumably because this fn enables only
    // `avx512fp16` (not `avx512vl`); confirm before "unifying" with the
    // other maskz variants.
    _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a)
}
12373
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
    }
}
12397
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        // Per-lane select between the converted result and `src`.
        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
    }
}
12426
/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
    // Reject invalid rounding-mode immediates at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking is write-masking with an all-zero source vector; built
    // from `f16x16::ZERO` to match `_mm512_maskz_cvtepi32_ph`.
    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
}
12448
/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
    // Scalar conversion: only the low lane is produced from `b`; the backend
    // intrinsic carries the upper lanes over from `a`.
    // _MM_FROUND_CUR_DIRECTION: round using the current MXCSR rounding mode.
    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
}
12461
/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
    unsafe {
        // Reject invalid rounding-mode immediates at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtsi2sh(a, b, ROUNDING)
    }
}
12486
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
    // All-ones write-mask over a zeroed `src` yields the plain conversion;
    // the lanes not produced by the conversion (upper 64 bits) remain zero.
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
}
12498
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    // Unlike the wider variants, this backend intrinsic takes `src` and `k`
    // directly and applies the write-mask itself.
    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
}
12511
/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
    // Zero-masking is write-masking with an all-zero source vector.
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
}
12524
12525/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12526/// and store the results in dst.
12527///
12528/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
12529#[inline]
12530#[target_feature(enable = "avx512fp16,avx512vl")]
12531#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12532#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12533pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
12534    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
12535}
12536
12537/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12538/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12539/// mask bit is not set).
12540///
12541/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
12542#[inline]
12543#[target_feature(enable = "avx512fp16,avx512vl")]
12544#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12545#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12546pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12547    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
12548}
12549
12550/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12551/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12552///
12553/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
12554#[inline]
12555#[target_feature(enable = "avx512fp16,avx512vl")]
12556#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12558pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
12559    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12560}
12561
12562/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12563/// and store the results in dst.
12564///
12565/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
12566#[inline]
12567#[target_feature(enable = "avx512fp16")]
12568#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12569#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12570pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
12571    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
12572}
12573
12574/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12575/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12576/// mask bit is not set).
12577///
12578/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
12579#[inline]
12580#[target_feature(enable = "avx512fp16")]
12581#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12582#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12583pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12584    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
12585}
12586
12587/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12588/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12589///
12590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
12591#[inline]
12592#[target_feature(enable = "avx512fp16")]
12593#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12594#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12595pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
12596    _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a)
12597}
12598
12599/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12600/// and store the results in dst.
12601///
12602/// Rounding is done according to the rounding parameter, which can be one of:
12603///
12604/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12605/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12606/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12607/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12608/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12609///
12610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
12611#[inline]
12612#[target_feature(enable = "avx512fp16")]
12613#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12614#[rustc_legacy_const_generics(1)]
12615#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12616pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12617    unsafe {
12618        static_assert_rounding!(ROUNDING);
12619        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
12620    }
12621}
12622
12623/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12624/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12625/// mask bit is not set).
12626///
12627/// Rounding is done according to the rounding parameter, which can be one of:
12628///
12629/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12630/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12631/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12632/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12633/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12634///
12635/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
12636#[inline]
12637#[target_feature(enable = "avx512fp16")]
12638#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12639#[rustc_legacy_const_generics(3)]
12640#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12641pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
12642    src: __m256h,
12643    k: __mmask16,
12644    a: __m512i,
12645) -> __m256h {
12646    unsafe {
12647        static_assert_rounding!(ROUNDING);
12648        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
12649    }
12650}
12651
12652/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12653/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12654///
12655/// Rounding is done according to the rounding parameter, which can be one of:
12656///
12657/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12658/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12659/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12660/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12661/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12662///
12663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
12664#[inline]
12665#[target_feature(enable = "avx512fp16")]
12666#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12667#[rustc_legacy_const_generics(2)]
12668#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12669pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12670    static_assert_rounding!(ROUNDING);
12671    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12672}
12673
12674/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12675/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12676/// of dst.
12677///
12678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
12679#[inline]
12680#[target_feature(enable = "avx512fp16")]
12681#[cfg_attr(test, assert_instr(vcvtusi2sh))]
12682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12683pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
12684    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12685}
12686
12687/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12688/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12689/// of dst.
12690///
12691/// Rounding is done according to the rounding parameter, which can be one of:
12692///
12693/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12694/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12695/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12696/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12697/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12698///
12699/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
12700#[inline]
12701#[target_feature(enable = "avx512fp16")]
12702#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
12703#[rustc_legacy_const_generics(2)]
12704#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12705pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
12706    unsafe {
12707        static_assert_rounding!(ROUNDING);
12708        vcvtusi2sh(a, b, ROUNDING)
12709    }
12710}
12711
12712/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12713/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12714///
12715/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
12716#[inline]
12717#[target_feature(enable = "avx512fp16,avx512vl")]
12718#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12719#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12720pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
12721    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12722}
12723
12724/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12725/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12726/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12727///
12728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
12729#[inline]
12730#[target_feature(enable = "avx512fp16,avx512vl")]
12731#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12732#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12733pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12734    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
12735}
12736
12737/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12738/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12739/// The upper 96 bits of dst are zeroed out.
12740///
12741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
12742#[inline]
12743#[target_feature(enable = "avx512fp16,avx512vl")]
12744#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12745#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12746pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
12747    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12748}
12749
12750/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12751/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12752///
12753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
12754#[inline]
12755#[target_feature(enable = "avx512fp16,avx512vl")]
12756#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12757#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12758pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
12759    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12760}
12761
12762/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12763/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12764/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12765///
12766/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
12767#[inline]
12768#[target_feature(enable = "avx512fp16,avx512vl")]
12769#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12770#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12771pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12772    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
12773}
12774
12775/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12776/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12777/// The upper 64 bits of dst are zeroed out.
12778///
12779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
12780#[inline]
12781#[target_feature(enable = "avx512fp16,avx512vl")]
12782#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12783#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12784pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
12785    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12786}
12787
12788/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12789/// and store the results in dst.
12790///
12791/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
12792#[inline]
12793#[target_feature(enable = "avx512fp16")]
12794#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12795#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12796pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
12797    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
12798}
12799
12800/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12801/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12802/// mask bit is not set).
12803///
12804/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
12805#[inline]
12806#[target_feature(enable = "avx512fp16")]
12807#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12808#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12809pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12810    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
12811}
12812
12813/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12814/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12815///
12816/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
12817#[inline]
12818#[target_feature(enable = "avx512fp16")]
12819#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12820#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12821pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
12822    _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a)
12823}
12824
12825/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12826/// and store the results in dst.
12827///
12828/// Rounding is done according to the rounding parameter, which can be one of:
12829///
12830/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12831/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12832/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12833/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12834/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12835///
12836/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
12837#[inline]
12838#[target_feature(enable = "avx512fp16")]
12839#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12840#[rustc_legacy_const_generics(1)]
12841#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12842pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12843    unsafe {
12844        static_assert_rounding!(ROUNDING);
12845        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
12846    }
12847}
12848
12849/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12850/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12851/// mask bit is not set).
12852///
12853/// Rounding is done according to the rounding parameter, which can be one of:
12854///
12855/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12856/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12857/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12858/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12859/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12860///
12861/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
12862#[inline]
12863#[target_feature(enable = "avx512fp16")]
12864#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12865#[rustc_legacy_const_generics(3)]
12866#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12867pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
12868    src: __m128h,
12869    k: __mmask8,
12870    a: __m512i,
12871) -> __m128h {
12872    unsafe {
12873        static_assert_rounding!(ROUNDING);
12874        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
12875    }
12876}
12877
12878/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12879/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12880///
12881/// Rounding is done according to the rounding parameter, which can be one of:
12882///
12883/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12884/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12885/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12886/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12887/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12888///
12889/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
12890#[inline]
12891#[target_feature(enable = "avx512fp16")]
12892#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12893#[rustc_legacy_const_generics(2)]
12894#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12895pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12896    static_assert_rounding!(ROUNDING);
12897    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12898}
12899
12900/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12901/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12902///
12903/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
12904#[inline]
12905#[target_feature(enable = "avx512fp16,avx512vl")]
12906#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12907#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12908pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
12909    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12910}
12911
12912/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12913/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12914/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12915///
12916/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
12917#[inline]
12918#[target_feature(enable = "avx512fp16,avx512vl")]
12919#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12921pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12922    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
12923}
12924
12925/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12926/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12927/// The upper 96 bits of dst are zeroed out.
12928///
12929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
12930#[inline]
12931#[target_feature(enable = "avx512fp16,avx512vl")]
12932#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12933#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12934pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
12935    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12936}
12937
12938/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12939/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12940///
12941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
12942#[inline]
12943#[target_feature(enable = "avx512fp16,avx512vl")]
12944#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12945#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12946pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
12947    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12948}
12949
12950/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12951/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12952/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12953///
12954/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
12955#[inline]
12956#[target_feature(enable = "avx512fp16,avx512vl")]
12957#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12958#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12959pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12960    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
12961}
12962
12963/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12964/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12965/// The upper 64 bits of dst are zeroed out.
12966///
12967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
12968#[inline]
12969#[target_feature(enable = "avx512fp16,avx512vl")]
12970#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12972pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
12973    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12974}
12975
12976/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12977/// and store the results in dst.
12978///
12979/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
12980#[inline]
12981#[target_feature(enable = "avx512fp16")]
12982#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12983#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12984pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
12985    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
12986}
12987
12988/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12989/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12990/// mask bit is not set).
12991///
12992/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
12993#[inline]
12994#[target_feature(enable = "avx512fp16")]
12995#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12996#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12997pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12998    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
12999}
13000
13001/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
13002/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13003///
13004/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
13005#[inline]
13006#[target_feature(enable = "avx512fp16")]
13007#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
13008#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13009pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
13010    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
13011}
13012
13013/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
13014/// and store the results in dst.
13015///
13016/// Rounding is done according to the rounding parameter, which can be one of:
13017///
13018/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13019/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13020/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13021/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13022/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13023///
13024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
13025#[inline]
13026#[target_feature(enable = "avx512fp16")]
13027#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
13028#[rustc_legacy_const_generics(1)]
13029#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13030pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
13031    unsafe {
13032        static_assert_rounding!(ROUNDING);
13033        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
13034    }
13035}
13036
13037/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
13038/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
13039/// mask bit is not set).
13040///
13041/// Rounding is done according to the rounding parameter, which can be one of:
13042///
13043/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13044/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13045/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13046/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13047/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13048///
13049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
13050#[inline]
13051#[target_feature(enable = "avx512fp16")]
13052#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
13053#[rustc_legacy_const_generics(3)]
13054#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13055pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
13056    src: __m128h,
13057    k: __mmask8,
13058    a: __m512i,
13059) -> __m128h {
13060    unsafe {
13061        static_assert_rounding!(ROUNDING);
13062        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
13063    }
13064}
13065
/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zeromask form: delegate to the writemask form with an all-zero source,
    // so lanes with a clear mask bit come out as zero.
    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
}
13087
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
    // Delegate to the writemask form with an all-ones mask; the zero source is
    // never selected, so every lane comes from the conversion.
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}
13099
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
    // The 128-bit LLVM builtin performs the conversion and the src/mask merge.
    unsafe { vcvtps2phx_128(a, src, k) }
}
13112
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
    // Zeromask form: writemask form with a zero source.
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}
13125
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
    // All-ones mask: every one of the 8 result lanes comes from the conversion.
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}
13137
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
    // The 256-bit LLVM builtin performs the conversion and the src/mask merge.
    unsafe { vcvtps2phx_256(a, src, k) }
}
13150
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
    // Zeromask form: writemask form with a zero source.
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}
13163
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
    // All-ones 16-bit mask selects every converted lane; the zero source is
    // never observed. (Zero is built from the internal SIMD constant rather
    // than a setzero intrinsic — presumably to avoid pulling in helpers with
    // extra target-feature requirements; only avx512fp16 is enabled here.)
    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a)
}
13175
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
    // The 512-bit builtin takes a rounding argument; use the current MXCSR
    // rounding direction for the non-`round` intrinsic.
    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13188
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
    // Zeromask form: writemask form with a zero source.
    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
}
13201
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
    // Reject invalid ROUNDING values at compile time.
    static_assert_rounding!(ROUNDING);
    // All-ones mask: every lane comes from the conversion, zero source unused.
    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
}
13223
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512,
) -> __m256h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // The builtin performs the conversion, the src/mask merge, and applies
        // the embedded rounding control.
        vcvtps2phx_512(a, src, k, ROUNDING)
    }
}
13252
/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
    static_assert_rounding!(ROUNDING);
    // Zeromask form: writemask form with a zero source.
    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
}
13275
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
    // All-ones mask: the converted lane is always written; the zero source is
    // never observed.
    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13288
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
    // Scalar builtin: converts lane 0 of `b`, merges with `src` under bit 0 of
    // `k`, copies upper lanes from `a`; uses the current MXCSR rounding mode.
    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13302
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
    // Zeromask form: writemask form with a zero source.
    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
13316
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
    // Reject invalid ROUNDING values at compile time.
    static_assert_rounding!(ROUNDING);
    // All-ones mask: the converted lane is always written.
    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13339
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128,
) -> __m128h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // Scalar builtin with explicit embedded rounding control.
        vcvtss2sh(a, b, src, k, ROUNDING)
    }
}
13370
/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zeromask form: writemask form with a zero source.
    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
13398
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
    // All-ones mask: both converted lanes are always written.
    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
}
13410
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
    // The 128-bit LLVM builtin performs the conversion and the src/mask merge.
    unsafe { vcvtpd2ph_128(a, src, k) }
}
13423
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
    // Zeromask form: writemask form with a zero source.
    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
}
13436
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
    // All-ones mask: all four converted lanes are always written.
    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
}
13448
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
    // The 256-bit LLVM builtin performs the conversion and the src/mask merge.
    unsafe { vcvtpd2ph_256(a, src, k) }
}
13461
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
    // Zeromask form: writemask form with a zero source.
    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
}
13474
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
    // All-ones mask: every converted lane is written; the zero source (built
    // from the internal SIMD constant, since only avx512fp16 is enabled here)
    // is never observed.
    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a)
}
13486
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
    // The 512-bit builtin takes a rounding argument; use the current MXCSR
    // rounding direction for the non-`round` intrinsic.
    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13499
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
    // Zeromask form: writemask form with a zero source.
    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
}
13512
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
    // Reject invalid ROUNDING values at compile time.
    static_assert_rounding!(ROUNDING);
    // All-ones mask: every converted lane is written, zero source unused.
    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
}
13534
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512d,
) -> __m128h {
    unsafe {
        // Reject invalid ROUNDING values at compile time.
        static_assert_rounding!(ROUNDING);
        // The builtin performs the conversion, the src/mask merge, and applies
        // the embedded rounding control.
        vcvtpd2ph_512(a, src, k, ROUNDING)
    }
}
13563
/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
    static_assert_rounding!(ROUNDING);
    // Zeromask form: writemask form with a zero source.
    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
}
13586
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
    // All-ones mask: the converted lane is always written; the zero source is
    // never observed.
    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13599
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
    // Scalar builtin: converts lane 0 of `b`, merges with `src` under bit 0 of
    // `k`, copies upper lanes from `a`; uses the current MXCSR rounding mode.
    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
13613
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
    // Zeromask form: writemask form with a zero source.
    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
}
13627
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
    // Reject invalid ROUNDING values at compile time.
    static_assert_rounding!(ROUNDING);
    // All-ones mask: the converted lane is always written.
    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
}
13650
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128d,
) -> __m128h {
    unsafe {
        // Reject invalid rounding values at compile time.
        static_assert_rounding!(ROUNDING);
        vcvtsd2sh(a, b, src, k, ROUNDING)
    }
}
13681
/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128d,
) -> __m128h {
    // Reject invalid rounding values at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
}
13709
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
    // All 8 mask bits set: every lane is written, so `src` may be undefined.
    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
}
13721
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // The masked compiler intrinsic keeps `src`'s lane wherever a mask bit is clear.
    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
}
13734
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
}
13746
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
    // All 16 mask bits set: every lane is written, so `src` may be undefined.
    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
}
13758
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    // The masked compiler intrinsic keeps `src`'s lane wherever a mask bit is clear.
    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
}
13771
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
}
13783
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
    // All 32 mask bits set: every lane is written, so `src` may be undefined.
    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
}
13795
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    unsafe {
        // `_MM_FROUND_CUR_DIRECTION`: round using the current MXCSR rounding mode.
        transmute(vcvtph2w_512(
            a,
            src.as_i16x32(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
13815
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
}
13827
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
    // Reject invalid rounding values at compile time.
    static_assert_rounding!(ROUNDING);
    // All 32 mask bits set: every lane is written, so `src` may be undefined.
    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
}
13849
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    unsafe {
        // Reject invalid rounding values at compile time.
        static_assert_rounding!(ROUNDING);
        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
    }
}
13878
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
    // Reject invalid rounding values at compile time.
    static_assert_rounding!(ROUNDING);
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
}
13900
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
    // All 8 mask bits set: every lane is written, so `src` may be undefined.
    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
}
13912
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // The masked compiler intrinsic keeps `src`'s lane wherever a mask bit is clear.
    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
}
13925
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
}
13937
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
    // All 16 mask bits set: every lane is written, so `src` may be undefined.
    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
}
13949
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    // The masked compiler intrinsic keeps `src`'s lane wherever a mask bit is clear.
    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
}
13962
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
}
13974
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
    // All 32 mask bits set: every lane is written, so `src` may be undefined.
    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
}
13986
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    unsafe {
        // `_MM_FROUND_CUR_DIRECTION`: round using the current MXCSR rounding mode.
        transmute(vcvtph2uw_512(
            a,
            src.as_u16x32(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14006
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
}
14018
14019/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
14020/// and store the results in dst.
14021///
14022/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14023///
14024/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
14025#[inline]
14026#[target_feature(enable = "avx512fp16")]
14027#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
14028#[rustc_legacy_const_generics(1)]
14029#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14030pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14031    static_assert_sae!(SAE);
14032    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14033}
14034
14035/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
14036/// and store the results in dst using writemask k (elements are copied from src when the corresponding
14037/// mask bit is not set).
14038///
14039/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14040///
14041/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
14042#[inline]
14043#[target_feature(enable = "avx512fp16")]
14044#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
14045#[rustc_legacy_const_generics(3)]
14046#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14047pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
14048    src: __m512i,
14049    k: __mmask32,
14050    a: __m512h,
14051) -> __m512i {
14052    unsafe {
14053        static_assert_sae!(SAE);
14054        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
14055    }
14056}
14057
14058/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
14059/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14060///
14061/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14062///
14063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
14064#[inline]
14065#[target_feature(enable = "avx512fp16")]
14066#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
14067#[rustc_legacy_const_generics(2)]
14068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14069pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14070    static_assert_sae!(SAE);
14071    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14072}
14073
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
    // All 8 mask bits set: every lane is written, so `src` may be undefined.
    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
}
14085
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // The masked compiler intrinsic keeps `src`'s lane wherever a mask bit is clear.
    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
}
14098
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
}
14111
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
    // All 16 mask bits set: every lane is written, so `src` may be undefined.
    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
}
14123
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    // The masked compiler intrinsic keeps `src`'s lane wherever a mask bit is clear.
    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
}
14136
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
}
14149
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
    // All 32 mask bits set: every lane is written, so `src` may be undefined.
    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
}
14161
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    unsafe {
        // Truncating conversion; `_MM_FROUND_CUR_DIRECTION` here only selects
        // default exception behavior (no SAE suppression).
        transmute(vcvttph2w_512(
            a,
            src.as_i16x32(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14181
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
}
14194
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
    // Reject invalid SAE values at compile time.
    static_assert_sae!(SAE);
    // All 32 mask bits set: every lane is written, so `src` may be undefined.
    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
}
14210
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    unsafe {
        // Reject invalid SAE values at compile time.
        static_assert_sae!(SAE);
        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
    }
}
14233
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
    // Reject invalid SAE values at compile time.
    static_assert_sae!(SAE);
    // Zero-masking: reuse the write-masked form with an all-zero source vector.
    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
}
14250
14251/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14252/// truncation, and store the results in dst.
14253///
14254/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
14255#[inline]
14256#[target_feature(enable = "avx512fp16,avx512vl")]
14257#[cfg_attr(test, assert_instr(vcvttph2uw))]
14258#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14259pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
14260    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
14261}
14262
14263/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14264/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14265/// mask bit is not set).
14266///
14267/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
14268#[inline]
14269#[target_feature(enable = "avx512fp16,avx512vl")]
14270#[cfg_attr(test, assert_instr(vcvttph2uw))]
14271#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14272pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14273    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
14274}
14275
14276/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14277/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14278/// mask bit is not set).
14279///
14280/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
14281#[inline]
14282#[target_feature(enable = "avx512fp16,avx512vl")]
14283#[cfg_attr(test, assert_instr(vcvttph2uw))]
14284#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14285pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
14286    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
14287}
14288
14289/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14290/// truncation, and store the results in dst.
14291///
14292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
14293#[inline]
14294#[target_feature(enable = "avx512fp16,avx512vl")]
14295#[cfg_attr(test, assert_instr(vcvttph2uw))]
14296#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14297pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
14298    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
14299}
14300
14301/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14302/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14303/// mask bit is not set).
14304///
14305/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
14306#[inline]
14307#[target_feature(enable = "avx512fp16,avx512vl")]
14308#[cfg_attr(test, assert_instr(vcvttph2uw))]
14309#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14310pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14311    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
14312}
14313
14314/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14315/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14316/// mask bit is not set).
14317///
14318/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14319#[inline]
14320#[target_feature(enable = "avx512fp16,avx512vl")]
14321#[cfg_attr(test, assert_instr(vcvttph2uw))]
14322#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14323pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14324    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14325}
14326
14327/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14328/// truncation, and store the results in dst.
14329///
14330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14331#[inline]
14332#[target_feature(enable = "avx512fp16")]
14333#[cfg_attr(test, assert_instr(vcvttph2uw))]
14334#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14335pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14336    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14337}
14338
14339/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14340/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14341/// mask bit is not set).
14342///
14343/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
14344#[inline]
14345#[target_feature(enable = "avx512fp16")]
14346#[cfg_attr(test, assert_instr(vcvttph2uw))]
14347#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14348pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14349    unsafe {
14350        transmute(vcvttph2uw_512(
14351            a,
14352            src.as_u16x32(),
14353            k,
14354            _MM_FROUND_CUR_DIRECTION,
14355        ))
14356    }
14357}
14358
14359/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14360/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14361/// mask bit is not set).
14362///
14363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14364#[inline]
14365#[target_feature(enable = "avx512fp16")]
14366#[cfg_attr(test, assert_instr(vcvttph2uw))]
14367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14368pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14369    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14370}
14371
14372/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14373/// truncation, and store the results in dst.
14374///
14375/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14376///
14377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14378#[inline]
14379#[target_feature(enable = "avx512fp16")]
14380#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14381#[rustc_legacy_const_generics(1)]
14382#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14383pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14384    static_assert_sae!(SAE);
14385    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14386}
14387
14388/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14389/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14390/// mask bit is not set).
14391///
14392/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14393///
14394/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
14395#[inline]
14396#[target_feature(enable = "avx512fp16")]
14397#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14398#[rustc_legacy_const_generics(3)]
14399#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14400pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
14401    src: __m512i,
14402    k: __mmask32,
14403    a: __m512h,
14404) -> __m512i {
14405    unsafe {
14406        static_assert_sae!(SAE);
14407        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
14408    }
14409}
14410
14411/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14412/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14413/// mask bit is not set).
14414///
14415/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
14416///
14417/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14418#[inline]
14419#[target_feature(enable = "avx512fp16")]
14420#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14421#[rustc_legacy_const_generics(2)]
14422#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14423pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14424    static_assert_sae!(SAE);
14425    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14426}
14427
14428/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14429/// results in dst.
14430///
14431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
14432#[inline]
14433#[target_feature(enable = "avx512fp16,avx512vl")]
14434#[cfg_attr(test, assert_instr(vcvtph2dq))]
14435#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14436pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14437    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14438}
14439
14440/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14441/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14442///
14443/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
14444#[inline]
14445#[target_feature(enable = "avx512fp16,avx512vl")]
14446#[cfg_attr(test, assert_instr(vcvtph2dq))]
14447#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14448pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14449    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
14450}
14451
14452/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14453/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14454///
14455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14456#[inline]
14457#[target_feature(enable = "avx512fp16,avx512vl")]
14458#[cfg_attr(test, assert_instr(vcvtph2dq))]
14459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14460pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14461    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14462}
14463
14464/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14465/// results in dst.
14466///
14467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14468#[inline]
14469#[target_feature(enable = "avx512fp16,avx512vl")]
14470#[cfg_attr(test, assert_instr(vcvtph2dq))]
14471#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14472pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14473    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14474}
14475
14476/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14477/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14478///
14479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
14480#[inline]
14481#[target_feature(enable = "avx512fp16,avx512vl")]
14482#[cfg_attr(test, assert_instr(vcvtph2dq))]
14483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14484pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14485    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
14486}
14487
14488/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14489/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14490///
14491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14492#[inline]
14493#[target_feature(enable = "avx512fp16,avx512vl")]
14494#[cfg_attr(test, assert_instr(vcvtph2dq))]
14495#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14496pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14497    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14498}
14499
14500/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14501/// results in dst.
14502///
14503/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14504#[inline]
14505#[target_feature(enable = "avx512fp16")]
14506#[cfg_attr(test, assert_instr(vcvtph2dq))]
14507#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14508pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14509    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14510}
14511
14512/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14513/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14514///
14515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
14516#[inline]
14517#[target_feature(enable = "avx512fp16")]
14518#[cfg_attr(test, assert_instr(vcvtph2dq))]
14519#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14520pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14521    unsafe {
14522        transmute(vcvtph2dq_512(
14523            a,
14524            src.as_i32x16(),
14525            k,
14526            _MM_FROUND_CUR_DIRECTION,
14527        ))
14528    }
14529}
14530
14531/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14532/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14533///
14534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14535#[inline]
14536#[target_feature(enable = "avx512fp16")]
14537#[cfg_attr(test, assert_instr(vcvtph2dq))]
14538#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14539pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14540    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14541}
14542
14543/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14544/// results in dst.
14545///
14546/// Rounding is done according to the rounding parameter, which can be one of:
14547///
14548/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14549/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14550/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14551/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14552/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14553///
14554/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
14555#[inline]
14556#[target_feature(enable = "avx512fp16")]
14557#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14558#[rustc_legacy_const_generics(1)]
14559#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14560pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14561    static_assert_rounding!(ROUNDING);
14562    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14563}
14564
14565/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14566/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14567///
14568/// Rounding is done according to the rounding parameter, which can be one of:
14569///
14570/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14571/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14572/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14573/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14574/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14575///
14576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
14577#[inline]
14578#[target_feature(enable = "avx512fp16")]
14579#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14580#[rustc_legacy_const_generics(3)]
14581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14582pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
14583    src: __m512i,
14584    k: __mmask16,
14585    a: __m256h,
14586) -> __m512i {
14587    unsafe {
14588        static_assert_rounding!(ROUNDING);
14589        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
14590    }
14591}
14592
14593/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14594/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14595///
14596/// Rounding is done according to the rounding parameter, which can be one of:
14597///
14598/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14599/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14600/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14601/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14602/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14603///
14604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14605#[inline]
14606#[target_feature(enable = "avx512fp16")]
14607#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14608#[rustc_legacy_const_generics(2)]
14609#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14610pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14611    static_assert_rounding!(ROUNDING);
14612    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14613}
14614
14615/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14616/// the result in dst.
14617///
14618/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
14619#[inline]
14620#[target_feature(enable = "avx512fp16")]
14621#[cfg_attr(test, assert_instr(vcvtsh2si))]
14622#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14623pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
14624    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14625}
14626
14627/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14628/// the result in dst.
14629///
14630/// Rounding is done according to the rounding parameter, which can be one of:
14631///
14632/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14633/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14634/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14635/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14636/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14637///
14638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
14639#[inline]
14640#[target_feature(enable = "avx512fp16")]
14641#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
14642#[rustc_legacy_const_generics(1)]
14643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14644pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
14645    unsafe {
14646        static_assert_rounding!(ROUNDING);
14647        vcvtsh2si32(a, ROUNDING)
14648    }
14649}
14650
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
    // Unmasked variant: all lanes enabled, so the undefined src is never merged in.
    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
}
14662
14663/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14664/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14665///
14666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
14667#[inline]
14668#[target_feature(enable = "avx512fp16,avx512vl")]
14669#[cfg_attr(test, assert_instr(vcvtph2udq))]
14670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14671pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14672    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
14673}
14674
14675/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14676/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14677///
14678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14679#[inline]
14680#[target_feature(enable = "avx512fp16,avx512vl")]
14681#[cfg_attr(test, assert_instr(vcvtph2udq))]
14682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14683pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14684    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14685}
14686
14687/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14688/// the results in dst.
14689///
14690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14691#[inline]
14692#[target_feature(enable = "avx512fp16,avx512vl")]
14693#[cfg_attr(test, assert_instr(vcvtph2udq))]
14694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14695pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14696    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14697}
14698
14699/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14700/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14701///
14702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
14703#[inline]
14704#[target_feature(enable = "avx512fp16,avx512vl")]
14705#[cfg_attr(test, assert_instr(vcvtph2udq))]
14706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14707pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14708    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
14709}
14710
14711/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14712/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14713///
14714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14715#[inline]
14716#[target_feature(enable = "avx512fp16,avx512vl")]
14717#[cfg_attr(test, assert_instr(vcvtph2udq))]
14718#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14719pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14720    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14721}
14722
14723/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14724/// the results in dst.
14725///
14726/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
14727#[inline]
14728#[target_feature(enable = "avx512fp16")]
14729#[cfg_attr(test, assert_instr(vcvtph2udq))]
14730#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14731pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14732    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14733}
14734
14735/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14736/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14737///
14738/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
14739#[inline]
14740#[target_feature(enable = "avx512fp16")]
14741#[cfg_attr(test, assert_instr(vcvtph2udq))]
14742#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14743pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14744    unsafe {
14745        transmute(vcvtph2udq_512(
14746            a,
14747            src.as_u32x16(),
14748            k,
14749            _MM_FROUND_CUR_DIRECTION,
14750        ))
14751    }
14752}
14753
14754/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14755/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14756///
14757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14758#[inline]
14759#[target_feature(enable = "avx512fp16")]
14760#[cfg_attr(test, assert_instr(vcvtph2udq))]
14761#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14762pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14763    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14764}
14765
14766/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14767/// the results in dst.
14768///
14769/// Rounding is done according to the rounding parameter, which can be one of:
14770///
14771/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14772/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14773/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14774/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14775/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14776///
14777/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14778#[inline]
14779#[target_feature(enable = "avx512fp16")]
14780#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14781#[rustc_legacy_const_generics(1)]
14782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14783pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14784    static_assert_rounding!(ROUNDING);
14785    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14786}
14787
14788/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14789/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14790///
14791/// Rounding is done according to the rounding parameter, which can be one of:
14792///
14793/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14794/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14795/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14796/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14797/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14798///
14799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
14800#[inline]
14801#[target_feature(enable = "avx512fp16")]
14802#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14803#[rustc_legacy_const_generics(3)]
14804#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14805pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
14806    src: __m512i,
14807    k: __mmask16,
14808    a: __m256h,
14809) -> __m512i {
14810    unsafe {
14811        static_assert_rounding!(ROUNDING);
14812        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
14813    }
14814}
14815
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // Zeroing semantics: delegate to the merge-masked form with an all-zero source.
    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
}
14837
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2usi))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; the current
    // MXCSR rounding mode is used (_MM_FROUND_CUR_DIRECTION).
    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
}
14849
14850/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14851/// the result in dst.
14852///
14853/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14854///
14855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
14856#[inline]
14857#[target_feature(enable = "avx512fp16")]
14858#[cfg_attr(test, assert_instr(vcvtsh2usi, SAE = 8))]
14859#[rustc_legacy_const_generics(1)]
14860#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14861pub fn _mm_cvt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
14862    unsafe {
14863        static_assert_rounding!(SAE);
14864        vcvtsh2usi32(a, SAE)
14865    }
14866}
14867
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
    // All-ones mask selects every lane, so the merge source may be undefined.
    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
}
14879
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: required features are guaranteed by `#[target_feature]`; the
    // transmute converts between same-size 128-bit vector types.
    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
}
14891
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
}
14903
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
    // All-ones mask selects every lane, so the merge source may be undefined.
    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
}
14915
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: required features are guaranteed by `#[target_feature]`; the
    // transmute converts between same-size 256-bit vector types.
    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
}
14927
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
}
14939
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
    // All-ones 16-bit mask selects every lane, so the merge source may be undefined.
    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
}
14951
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; the 512-bit
    // variant of the intrinsic takes an explicit SAE operand, passed here as
    // _MM_FROUND_CUR_DIRECTION (no exception suppression).
    unsafe {
        transmute(vcvttph2dq_512(
            a,
            src.as_i32x16(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
14970
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
}
14982
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
    static_assert_sae!(SAE);
    // All-ones 16-bit mask selects every lane, so the merge source may be undefined.
    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
}
14998
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; SAE is
    // compile-time validated to be a legal suppress-all-exceptions immediate.
    unsafe {
        static_assert_sae!(SAE);
        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
    }
}
15020
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
    static_assert_sae!(SAE);
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
}
15036
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2si))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; exceptions are
    // not suppressed (_MM_FROUND_CUR_DIRECTION).
    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
}
15048
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
/// the result in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; SAE is
    // compile-time validated to be a legal suppress-all-exceptions immediate.
    unsafe {
        static_assert_sae!(SAE);
        vcvttsh2si32(a, SAE)
    }
}
15066
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
    // All-ones mask selects every lane, so the merge source may be undefined.
    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
}
15078
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: required features are guaranteed by `#[target_feature]`; the
    // transmute converts between same-size 128-bit vector types.
    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
}
15090
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
}
15102
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
    // All-ones mask selects every lane, so the merge source may be undefined.
    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
}
15114
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: required features are guaranteed by `#[target_feature]`; the
    // transmute converts between same-size 256-bit vector types.
    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
}
15126
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
}
15138
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
    // All-ones 16-bit mask selects every lane, so the merge source may be undefined.
    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
}
15150
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; the 512-bit
    // variant of the intrinsic takes an explicit SAE operand, passed here as
    // _MM_FROUND_CUR_DIRECTION (no exception suppression).
    unsafe {
        transmute(vcvttph2udq_512(
            a,
            src.as_u32x16(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15169
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
}
15181
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
    static_assert_sae!(SAE);
    // All-ones 16-bit mask selects every lane, so the merge source may be undefined.
    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
}
15197
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; SAE is
    // compile-time validated to be a legal suppress-all-exceptions immediate.
    unsafe {
        static_assert_sae!(SAE);
        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
    }
}
15219
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
    static_assert_sae!(SAE);
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
}
15235
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2usi))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; exceptions are
    // not suppressed (_MM_FROUND_CUR_DIRECTION).
    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
}
15247
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
/// the result in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; SAE is
    // compile-time validated to be a legal suppress-all-exceptions immediate.
    unsafe {
        static_assert_sae!(SAE);
        vcvttsh2usi32(a, SAE)
    }
}
15265
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
    // All-ones mask selects every lane, so the merge source may be undefined.
    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
}
15277
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // SAFETY: required features are guaranteed by `#[target_feature]`; the
    // transmute converts between same-size 128-bit vector types.
    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
}
15289
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
}
15301
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
    // All-ones mask selects every lane, so the merge source may be undefined.
    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
}
15313
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // SAFETY: required features are guaranteed by `#[target_feature]`; the
    // transmute converts between same-size 256-bit vector types.
    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
}
15325
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
}
15337
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
    // 8 qword lanes, so 0xff is the all-ones mask; merge source may be undefined.
    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
}
15349
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; the 512-bit
    // variant of the intrinsic takes an explicit rounding operand, passed
    // here as _MM_FROUND_CUR_DIRECTION (use MXCSR.RC).
    unsafe {
        transmute(vcvtph2qq_512(
            a,
            src.as_i64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15368
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
    // Zeroing semantics: merge-masked form with an all-zero source.
    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
}
15380
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // 8 qword lanes, so 0xff is the all-ones mask; merge source may be undefined.
    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
}
15402
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    // SAFETY: avx512fp16 is guaranteed by `#[target_feature]`; ROUNDING is
    // compile-time validated to be a legal rounding-mode immediate.
    unsafe {
        static_assert_rounding!(ROUNDING);
        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
    }
}
15430
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
}
15452
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
}
15464
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // Delegate to the `vcvtph2uqq` intrinsic; masked-off lanes keep `src`'s elements.
    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
}
15476
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
}
15488
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
}
15500
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // Delegate to the `vcvtph2uqq` intrinsic; masked-off lanes keep `src`'s elements.
    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
}
15512
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
}
15524
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
}
15536
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    unsafe {
        // The 512-bit intrinsic takes a rounding argument; `_MM_FROUND_CUR_DIRECTION`
        // selects the current MXCSR rounding mode, matching the non-`round` intrinsic.
        transmute(vcvtph2uqq_512(
            a,
            src.as_u64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15555
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
}
15567
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
}
15589
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    unsafe {
        // Reject invalid rounding-mode constants at compile time.
        static_assert_rounding!(ROUNDING);
        // Delegate to the `vcvtph2uqq` intrinsic; masked-off lanes keep `src`'s elements.
        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
    }
}
15617
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
    // Reject invalid rounding-mode constants at compile time.
    static_assert_rounding!(ROUNDING);
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
}
15639
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
}
15651
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // Delegate to the truncating `vcvttph2qq` intrinsic; masked-off lanes keep `src`'s elements.
    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
}
15663
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
}
15675
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
}
15687
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // Delegate to the truncating `vcvttph2qq` intrinsic; masked-off lanes keep `src`'s elements.
    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
}
15699
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
}
15711
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
}
15723
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    unsafe {
        // The 512-bit intrinsic's last argument is the SAE/rounding control;
        // `_MM_FROUND_CUR_DIRECTION` matches the non-`round` intrinsic's behavior.
        transmute(vcvttph2qq_512(
            a,
            src.as_i64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15742
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
}
15754
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
}
15770
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    unsafe {
        // Reject invalid SAE constants at compile time.
        static_assert_sae!(SAE);
        // Delegate to the truncating `vcvttph2qq` intrinsic; masked-off lanes keep `src`'s elements.
        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
    }
}
15792
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
}
15808
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
}
15820
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    // Delegate to the truncating `vcvttph2uqq` intrinsic; masked-off lanes keep `src`'s elements.
    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
}
15832
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
}
15844
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
}
15856
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    // Delegate to the truncating `vcvttph2uqq` intrinsic; masked-off lanes keep `src`'s elements.
    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
}
15868
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
}
15880
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
}
15892
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    unsafe {
        // The 512-bit intrinsic's last argument is the SAE/rounding control;
        // `_MM_FROUND_CUR_DIRECTION` matches the non-`round` intrinsic's behavior.
        transmute(vcvttph2uqq_512(
            a,
            src.as_u64x8(),
            k,
            _MM_FROUND_CUR_DIRECTION,
        ))
    }
}
15911
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
}
15923
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // The all-ones mask writes every lane, so the undefined `src` is never observed.
    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
}
15939
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    unsafe {
        // Reject invalid SAE constants at compile time.
        static_assert_sae!(SAE);
        // Delegate to the truncating `vcvttph2uqq` intrinsic; masked-off lanes keep `src`'s elements.
        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
    }
}
15961
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // A zeroed `src` makes masked-off lanes come out as 0.
    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
}
15977
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
    // The all-ones mask writes every lane, so the zeroed `src` is fully overwritten.
    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
}
15989
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
    // Delegate to the `vcvtph2psx` intrinsic, which already returns `__m128` — no transmute needed.
    unsafe { vcvtph2psx_128(a, src, k) }
}
16002
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
    // A zeroed `src` makes masked-off lanes come out as 0.0.
    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
}
16015
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
    // All 8 lanes selected (0xff); the zeroed `src` is never observed.
    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
}
16027
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
    // SAFETY: the LLVM intrinsic requires AVX512-FP16/AVX512VL, which this
    // function's `#[target_feature]` attribute guarantees are enabled.
    unsafe { vcvtph2psx_256(a, src, k) }
}
16040
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
    // Zero-masking: an all-zero `src` makes unselected lanes zero.
    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
}
16053
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
    // All 16 lanes selected (0xffff); the zeroed `src` is never observed.
    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
}
16065
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
    // SAFETY: the LLVM intrinsic requires AVX512-FP16, which this function's
    // `#[target_feature]` attribute guarantees is enabled. The non-rounding
    // form uses the current MXCSR rounding mode (_MM_FROUND_CUR_DIRECTION).
    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16078
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
    // Zero-masking: an all-zero `src` makes unselected lanes zero.
    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
}
16091
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // All 16 lanes selected (0xffff); the zeroed `src` is never observed.
    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
}
16107
16108/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
16109/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16110/// dst when the corresponding mask bit is not set).
16111///
16112/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16113///
16114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
16115#[inline]
16116#[target_feature(enable = "avx512fp16")]
16117#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
16118#[rustc_legacy_const_generics(3)]
16119#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16120pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
16121    src: __m512,
16122    k: __mmask16,
16123    a: __m256h,
16124) -> __m512 {
16125    unsafe {
16126        static_assert_sae!(SAE);
16127        vcvtph2psx_512(a, src, k, SAE)
16128    }
16129}
16130
/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // Zero-masking: an all-zero `src` makes unselected lanes zero.
    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
}
16147
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
    // Full mask: the converted lower element is always written, so `a`
    // doubles as a harmless `src` (the upper elements come from `a` anyway).
    _mm_mask_cvtsh_ss(a, 0xff, a, b)
}
16160
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    // SAFETY: the LLVM intrinsic requires AVX512-FP16, which this function's
    // `#[target_feature]` attribute guarantees is enabled.
    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16174
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    // Zero-masking: a `src` whose lower element is 0.0 makes an unselected
    // lower lane come out as zero (upper elements still come from `a`).
    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
}
16188
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
/// from a to the upper elements of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // Full mask: `src` is never observed, so an undefined value is fine.
    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
}
16205
16206/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16207/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16208/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
16209/// upper elements of dst.
16210///
16211/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16212///
16213/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
16214#[inline]
16215#[target_feature(enable = "avx512fp16")]
16216#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
16217#[rustc_legacy_const_generics(4)]
16218#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16219pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
16220    src: __m128,
16221    k: __mmask8,
16222    a: __m128,
16223    b: __m128h,
16224) -> __m128 {
16225    unsafe {
16226        static_assert_sae!(SAE);
16227        vcvtsh2ss(a, b, src, k, SAE)
16228    }
16229}
16230
/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
/// of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // Zero-masking: a `src` whose lower element is 0.0 makes an unselected
    // lower lane come out as zero.
    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
}
16248
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
    // All-ones mask: every lane is written, so the zeroed `src` is never observed.
    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
}
16260
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
    // SAFETY: the LLVM intrinsic requires AVX512-FP16/AVX512VL, which this
    // function's `#[target_feature]` attribute guarantees are enabled.
    unsafe { vcvtph2pd_128(a, src, k) }
}
16273
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
    // Zero-masking: an all-zero `src` makes unselected lanes zero.
    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
}
16286
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
    // All-ones mask: every lane is written, so the zeroed `src` is never observed.
    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
}
16298
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
    // SAFETY: the LLVM intrinsic requires AVX512-FP16/AVX512VL, which this
    // function's `#[target_feature]` attribute guarantees are enabled.
    unsafe { vcvtph2pd_256(a, src, k) }
}
16311
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
    // Zero-masking: an all-zero `src` makes unselected lanes zero.
    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
}
16324
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
    // All 8 lanes selected (0xff); the zeroed `src` is never observed.
    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
}
16336
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
    // SAFETY: the LLVM intrinsic requires AVX512-FP16, which this function's
    // `#[target_feature]` attribute guarantees is enabled. The non-rounding
    // form uses the current MXCSR rounding mode (_MM_FROUND_CUR_DIRECTION).
    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16349
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
    // Zero-masking: an all-zero `src` makes unselected lanes zero.
    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
}
16362
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // All 8 lanes selected (0xff); the zeroed `src` is never observed.
    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
}
16378
16379/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16380/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16381/// dst when the corresponding mask bit is not set).
16382///
16383/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16384///
16385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
16386#[inline]
16387#[target_feature(enable = "avx512fp16")]
16388#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16389#[rustc_legacy_const_generics(3)]
16390#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16391pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
16392    src: __m512d,
16393    k: __mmask8,
16394    a: __m128h,
16395) -> __m512d {
16396    unsafe {
16397        static_assert_sae!(SAE);
16398        vcvtph2pd_512(a, src, k, SAE)
16399    }
16400}
16401
/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // Zero-masking: an all-zero `src` makes unselected lanes zero.
    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
}
16418
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper element
/// from a to the upper element of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
    // Full mask: the converted lower element is always written, so `a`
    // doubles as a harmless `src` (the upper element comes from `a` anyway).
    _mm_mask_cvtsh_sd(a, 0xff, a, b)
}
16431
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
    // SAFETY: the LLVM intrinsic requires AVX512-FP16, which this function's
    // `#[target_feature]` attribute guarantees is enabled.
    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
}
16445
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
    // Zero-masking: a `src` whose lower element is 0.0 makes an unselected
    // lower lane come out as zero (the upper element still comes from `a`).
    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
}
16458
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
/// to the upper element of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // Full mask: the lower element is always written, so `a` doubles as a
    // harmless `src`.
    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
}
16475
16476/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16477/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16478/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16479/// of dst.
16480///
16481/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16482///
16483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
16484#[inline]
16485#[target_feature(enable = "avx512fp16")]
16486#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16487#[rustc_legacy_const_generics(4)]
16488#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16489pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
16490    src: __m128d,
16491    k: __mmask8,
16492    a: __m128d,
16493    b: __m128h,
16494) -> __m128d {
16495    unsafe {
16496        static_assert_sae!(SAE);
16497        vcvtsh2sd(a, b, src, k, SAE)
16498    }
16499}
16500
/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
    // Reject invalid SAE constants at compile time.
    static_assert_sae!(SAE);
    // Zero-masking: a `src` whose lower element is 0.0 makes an unselected
    // lower lane come out as zero.
    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
}
16517
/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsh_h(a: __m128h) -> f16 {
    // SAFETY: lane 0 is always in bounds for an 8-lane f16 vector.
    unsafe { simd_extract!(a, 0) }
}
16528
/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_cvtsh_h(a: __m256h) -> f16 {
    // SAFETY: lane 0 is always in bounds for a 16-lane f16 vector.
    unsafe { simd_extract!(a, 0) }
}
16539
/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_cvtsh_h(a: __m512h) -> f16 {
    // SAFETY: lane 0 is always in bounds for a 32-lane f16 vector.
    unsafe { simd_extract!(a, 0) }
}
16550
/// Copy the lower 16-bit integer in a to dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
    // Reinterpret the 128-bit register as 8 x i16 and take lane 0.
    // SAFETY: lane 0 is always in bounds for an i16x8.
    unsafe { simd_extract!(a.as_i16x8(), 0) }
}
16561
/// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_cvtsi16_si128(a: i16) -> __m128i {
    // Insert `a` into lane 0 of an all-zero i16x8, leaving the upper seven
    // lanes zero, then reinterpret as a 128-bit integer register.
    // SAFETY: lane 0 is always in bounds for an i16x8.
    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
}
16572
// Raw bindings to the LLVM AVX512-FP16 intrinsics backing the public
// `_mm*_..._ph`/`_sh` wrappers in this file. Each `link_name` selects one
// exact LLVM intrinsic; the Rust signature must mirror the LLVM prototype
// (vector width, mask width, and any trailing `imm8`/`rounding`/`sae`
// immediate), so these declarations must not be reordered into different
// shapes or have their types changed.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Scalar compare / ordered-compare (mask result / EFLAGS-style i32 result).
    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;

    // 512-bit packed arithmetic with explicit rounding control.
    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;

    // Masked scalar arithmetic with explicit rounding control.
    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;

    // Complex multiply. LLVM models complex fp16 vectors as float vectors
    // (`__m128`/`__m256`/`__m512`), one f32 slot per (re, im) f16 pair, which
    // is why these take float vector types and narrower masks.
    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;

    // Complex conjugate multiply (same float-vector modelling as above).
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;

    // Complex fused multiply-add: `mask3` variants blend against `c`,
    // `maskz` variants zero masked-out lanes.
    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;

    // Complex conjugate fused multiply-add (mask3/maskz as above).
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
    -> __m512;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
    -> __m512;
    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;

    // Real fused multiply-add (packed 512-bit and scalar f16 forms).
    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;

    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;

    // Approximate reciprocal.
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;

    // Approximate reciprocal square root.
    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;

    // Square root.
    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;

    // Max / min (512-bit forms take a SAE immediate, not a rounding mode).
    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;

    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;

    // Exponent extraction.
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;

    // Mantissa extraction (normalization/sign controlled by `imm8`).
    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
    fn vgetmantsh(
        a: __m128h,
        b: __m128h,
        imm8: i32,
        src: __m128h,
        k: __mmask8,
        sae: i32,
    ) -> __m128h;

    // Round to a specified number of fraction bits (`imm8`).
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
    fn vrndscalesh(
        a: __m128h,
        b: __m128h,
        src: __m128h,
        k: __mmask8,
        imm8: i32,
        sae: i32,
    ) -> __m128h;

    // Scale by power of two (a * 2^b).
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;

    // Reduction transformation (fractional remainder, controlled by `imm8`).
    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
    -> __m128h;

    // FP-class test (scalar).
    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;

    // Integer -> f16 conversions with rounding control. These use the generic
    // `llvm.x86.avx512.{si,ui}tofp.round` intrinsics rather than fp16-specific
    // ones, so the argument is a plain integer SIMD vector type.
    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"]
    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"]
    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"]
    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;

    // 32-bit integer -> f16 (results are narrower than the input vector,
    // e.g. 16 i32s convert into a 256-bit f16 vector).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"]
    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"]
    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;

    // 64-bit integer -> f16.
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"]
    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;

    // f32 -> f16 (packed and scalar).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;

    // f64 -> f16 (packed and scalar).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;

    // f16 -> 16-bit integer (rounding variants).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;

    // f16 -> 16-bit integer (truncating variants).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;

    // f16 -> 32-bit integer (rounding variants; inputs are narrower vectors).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;

    // f16 -> 32-bit integer (truncating variants).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;

    // f16 -> 64-bit integer (rounding variants).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;

    // f16 -> 64-bit integer (truncating variants).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;

    // f16 -> f32 (packed widening and scalar).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;

    // f16 -> f64 (packed widening and scalar).
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;

}
16930
16931#[cfg(test)]
16932mod tests {
16933    use crate::core_arch::assert_eq_const as assert_eq;
16934    use crate::core_arch::x86::*;
16935    use crate::mem::transmute;
16936    use crate::ptr::{addr_of, addr_of_mut};
16937    use stdarch_test::simd_test;
16938
    // Test helper: broadcast the complex number (re, im) across all four
    // (re, im) f16 lane pairs of a 128-bit vector.
    #[target_feature(enable = "avx512fp16")]
    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
    const fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
        _mm_setr_ph(re, im, re, im, re, im, re, im)
    }
16944
    // Test helper: broadcast the complex number (re, im) across all eight
    // (re, im) f16 lane pairs of a 256-bit vector.
    #[target_feature(enable = "avx512fp16")]
    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
    const fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
        _mm256_setr_ph(
            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
        )
    }
16952
    // Test helper: broadcast the complex number (re, im) across all sixteen
    // (re, im) f16 lane pairs of a 512-bit vector.
    #[target_feature(enable = "avx512fp16")]
    #[rustc_const_unstable(feature = "stdarch_const_helpers", issue = "none")]
    const fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
        _mm512_setr_ph(
            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
            re, im, re, im, re, im, re, im, re, im,
        )
    }
16961
    // `_mm_set_ph` takes lanes in high-to-low order, so it must equal
    // `_mm_setr_ph` called with the arguments reversed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_set_ph() {
        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        assert_eq_m128h(r, e);
    }
16968
    // `_mm256_set_ph` takes lanes in high-to-low order, so it must equal
    // `_mm256_setr_ph` called with the arguments reversed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_set_ph() {
        let r = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let e = _mm256_setr_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
16979
    // `_mm512_set_ph` takes lanes in high-to-low order, so it must equal
    // `_mm512_setr_ph` called with the arguments reversed.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_set_ph() {
        let r = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let e = _mm512_setr_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
16994
    // `_mm_set_sh` places the value in the lowest lane and zeroes the rest.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_set_sh() {
        let r = _mm_set_sh(1.0);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }
17001
    // `_mm_set1_ph` broadcasts one value to all eight lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_set1_ph() {
        let r = _mm_set1_ph(1.0);
        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
        assert_eq_m128h(r, e);
    }
17008
    // `_mm256_set1_ph` broadcasts one value to all sixteen lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_set1_ph() {
        let r = _mm256_set1_ph(1.0);
        let e = _mm256_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
17017
    // `_mm512_set1_ph` broadcasts one value to all thirty-two lanes.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_set1_ph() {
        let r = _mm512_set1_ph(1.0);
        let e = _mm512_set_ph(
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
17027
    // `_mm_setr_ph` takes lanes in low-to-high order — the mirror image of
    // `_mm_set_ph` with reversed arguments.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_setr_ph() {
        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        assert_eq_m128h(r, e);
    }
17034
    // `_mm256_setr_ph` takes lanes in low-to-high order — the mirror image of
    // `_mm256_set_ph` with reversed arguments.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_setr_ph() {
        let r = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let e = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
17045
    // `_mm512_setr_ph` takes lanes in low-to-high order — the mirror image of
    // `_mm512_set_ph` with reversed arguments.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_setr_ph() {
        let r = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let e = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
17060
    // `_mm_setzero_ph` must equal a broadcast of 0.0 to every lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_setzero_ph() {
        let r = _mm_setzero_ph();
        let e = _mm_set1_ph(0.0);
        assert_eq_m128h(r, e);
    }
17067
    // `_mm256_setzero_ph` must equal a broadcast of 0.0 to every lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_setzero_ph() {
        let r = _mm256_setzero_ph();
        let e = _mm256_set1_ph(0.0);
        assert_eq_m256h(r, e);
    }
17074
    // `_mm512_setzero_ph` must equal a broadcast of 0.0 to every lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_setzero_ph() {
        let r = _mm512_setzero_ph();
        let e = _mm512_set1_ph(0.0);
        assert_eq_m512h(r, e);
    }
17081
    // Bit-cast i16 lanes to f16: 0x3c00 is the IEEE-754 binary16 encoding of
    // 1.0, so the cast (a reinterpretation, not a conversion) yields all-ones.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_castsi128_ph() {
        let a = _mm_set1_epi16(0x3c00);
        let r = _mm_castsi128_ph(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }
17089
    // Bit-cast i16 lanes to f16 (0x3c00 == f16 1.0); reinterpretation only.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_castsi256_ph() {
        let a = _mm256_set1_epi16(0x3c00);
        let r = _mm256_castsi256_ph(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }
17097
    // Bit-cast i16 lanes to f16 (0x3c00 == f16 1.0); reinterpretation only.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_castsi512_ph() {
        let a = _mm512_set1_epi16(0x3c00);
        let r = _mm512_castsi512_ph(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
17105
    // Inverse direction: f16 1.0 lanes bit-cast to integers must read back
    // as 0x3c00 per 16-bit lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm_castph_si128() {
        let a = _mm_set1_ph(1.0);
        let r = _mm_castph_si128(a);
        let e = _mm_set1_epi16(0x3c00);
        assert_eq_m128i(r, e);
    }
17113
    // f16 1.0 lanes bit-cast to integers read back as 0x3c00 per lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm256_castph_si256() {
        let a = _mm256_set1_ph(1.0);
        let r = _mm256_castph_si256(a);
        let e = _mm256_set1_epi16(0x3c00);
        assert_eq_m256i(r, e);
    }
17121
    // f16 1.0 lanes bit-cast to integers read back as 0x3c00 per lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_castph_si512() {
        let a = _mm512_set1_ph(1.0);
        let r = _mm512_castph_si512(a);
        let e = _mm512_set1_epi16(0x3c00);
        assert_eq_m512i(r, e);
    }
17129
    // f32 vector carrying 0x3c00 16-bit patterns bit-cast to f16 gives 1.0
    // in every half-precision lane (pure reinterpretation).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_castps_ph() {
        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
        let r = _mm_castps_ph(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }
17137
    // f32 vector carrying 0x3c00 16-bit patterns bit-cast to f16 gives 1.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_castps_ph() {
        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
        let r = _mm256_castps_ph(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }
17145
    // f32 vector carrying 0x3c00 16-bit patterns bit-cast to f16 gives 1.0.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_castps_ph() {
        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
        let r = _mm512_castps_ph(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
17153
    // f16 vector carrying 0x3f800000 32-bit patterns (f32 1.0) bit-cast to
    // f32 must read back as 1.0 in every single-precision lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm_castph_ps() {
        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
        let r = _mm_castph_ps(a);
        let e = _mm_set1_ps(1.0);
        assert_eq_m128(r, e);
    }
17161
    // f16 vector carrying 0x3f800000 patterns (f32 1.0) bit-cast to f32.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm256_castph_ps() {
        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
        let r = _mm256_castph_ps(a);
        let e = _mm256_set1_ps(1.0);
        assert_eq_m256(r, e);
    }
17169
    // f16 vector carrying 0x3f800000 patterns (f32 1.0) bit-cast to f32.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_castph_ps() {
        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
        let r = _mm512_castph_ps(a);
        let e = _mm512_set1_ps(1.0);
        assert_eq_m512(r, e);
    }
17177
    // f64 vector carrying 0x3c00 16-bit patterns bit-cast to f16 gives 1.0
    // in every half-precision lane (pure reinterpretation).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_castpd_ph() {
        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
        let r = _mm_castpd_ph(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }
17185
    // f64 vector carrying 0x3c00 16-bit patterns bit-cast to f16 gives 1.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_castpd_ph() {
        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
        let r = _mm256_castpd_ph(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }
17193
    // f64 vector carrying 0x3c00 16-bit patterns bit-cast to f16 gives 1.0.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_castpd_ph() {
        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
        let r = _mm512_castpd_ph(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
17201
    // f16 vector carrying 0x3ff0000000000000 64-bit patterns (f64 1.0)
    // bit-cast to f64 must read back as 1.0 in every double lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm_castph_pd() {
        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
        let r = _mm_castph_pd(a);
        let e = _mm_set1_pd(1.0);
        assert_eq_m128d(r, e);
    }
17209
    // f16 vector carrying 0x3ff0000000000000 patterns (f64 1.0) cast to f64.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm256_castph_pd() {
        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
        let r = _mm256_castph_pd(a);
        let e = _mm256_set1_pd(1.0);
        assert_eq_m256d(r, e);
    }
17217
    // f16 vector carrying 0x3ff0000000000000 patterns (f64 1.0) cast to f64.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_castph_pd() {
        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
        let r = _mm512_castph_pd(a);
        let e = _mm512_set1_pd(1.0);
        assert_eq_m512d(r, e);
    }
17225
    // Width-changing casts between f16 vector sizes. Narrowing casts keep the
    // low lanes; widening casts are checked only on the low lanes they preserve
    // (only the preserved portion is asserted here); zext casts additionally
    // zero the new upper lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_castph256_ph128() {
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm256_castph256_ph128(a);
        // Narrowing keeps the low 8 lanes (setr lists lanes low-to-high).
        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm512_castph512_ph128() {
        let a = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );
        let r = _mm512_castph512_ph128(a);
        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm512_castph512_ph256() {
        let a = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );
        let r = _mm512_castph512_ph256(a);
        let e = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        assert_eq_m256h(r, e);
    }

    // Widening casts: round-trip through the matching narrowing cast and check
    // the low lanes survive; the upper lanes are deliberately not inspected.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_castph128_ph256() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castph128_ph256(a);
        assert_eq_m128h(_mm256_castph256_ph128(r), a);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm512_castph128_ph512() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_castph128_ph512(a);
        assert_eq_m128h(_mm512_castph512_ph128(r), a);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm512_castph256_ph512() {
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_castph256_ph512(a);
        assert_eq_m256h(_mm512_castph512_ph256(r), a);
    }

    // Zero-extending widen: low lanes preserved, new upper lanes must be 0.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_zextph128_ph256() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_zextph128_ph256(a);
        let e = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_zextph128_ph512() {
        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm512_zextph128_ph512(a);
        let e = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_zextph256_ph512() {
        let a = _mm256_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let r = _mm512_zextph256_ph512(a);
        let e = _mm512_setr_ph(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512h(r, e);
    }
17316
    // Packed f16 compare-to-mask tests. _mm_set_ph lists lanes from high (e7)
    // down to low (e0), and lane i of the result maps to bit i of the mask, so
    // negating the values passed LAST to set_ph clears the LOW mask bits.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cmp_ph_mask() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        // Lanes 4..7 (values 1..4) match -> bits 4..7 set.
        assert_eq!(r, 0b11110000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cmp_ph_mask() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
        // The write mask 0b01010101 is ANDed with the compare result.
        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
        assert_eq!(r, 0b01010000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cmp_ph_mask() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0,
        );
        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b1111000011110000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cmp_ph_mask() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0,
        );
        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
        assert_eq!(r, 0b0101000001010000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        // b alternates groups of four matching / four negated lanes.
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
        assert_eq!(r, 0b01010000010100000101000001010000);
    }
17390
    // Same compare-to-mask checks as above but through the _round variants with
    // _MM_FROUND_NO_EXC; the expected masks are unchanged.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmp_round_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        // The write mask keeps only the odd-numbered lanes of the result.
        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        assert_eq!(r, 0b01010000010100000101000001010000);
    }
17426
    // Scalar f16 compare-to-mask tests: only bit 0 of the result mask is
    // meaningful, and equal operands under _CMP_EQ_OQ must yield 1.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cmp_round_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cmp_round_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        // A zero write mask forces the result bit to 0 even though a == b.
        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cmp_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cmp_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
        assert_eq!(r, 0);
    }
17458
17459    #[simd_test(enable = "avx512fp16")]
17460    unsafe fn test_mm_comi_round_sh() {
17461        let a = _mm_set_sh(1.0);
17462        let b = _mm_set_sh(1.0);
17463        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17464        assert_eq!(r, 1);
17465    }
17466
17467    #[simd_test(enable = "avx512fp16")]
17468    unsafe fn test_mm_comi_sh() {
17469        let a = _mm_set_sh(1.0);
17470        let b = _mm_set_sh(1.0);
17471        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17472        assert_eq!(r, 1);
17473    }
17474
17475    #[simd_test(enable = "avx512fp16")]
17476    unsafe fn test_mm_comieq_sh() {
17477        let a = _mm_set_sh(1.0);
17478        let b = _mm_set_sh(1.0);
17479        let r = _mm_comieq_sh(a, b);
17480        assert_eq!(r, 1);
17481    }
17482
17483    #[simd_test(enable = "avx512fp16")]
17484    unsafe fn test_mm_comige_sh() {
17485        let a = _mm_set_sh(2.0);
17486        let b = _mm_set_sh(1.0);
17487        let r = _mm_comige_sh(a, b);
17488        assert_eq!(r, 1);
17489    }
17490
17491    #[simd_test(enable = "avx512fp16")]
17492    unsafe fn test_mm_comigt_sh() {
17493        let a = _mm_set_sh(2.0);
17494        let b = _mm_set_sh(1.0);
17495        let r = _mm_comigt_sh(a, b);
17496        assert_eq!(r, 1);
17497    }
17498
17499    #[simd_test(enable = "avx512fp16")]
17500    unsafe fn test_mm_comile_sh() {
17501        let a = _mm_set_sh(1.0);
17502        let b = _mm_set_sh(2.0);
17503        let r = _mm_comile_sh(a, b);
17504        assert_eq!(r, 1);
17505    }
17506
17507    #[simd_test(enable = "avx512fp16")]
17508    unsafe fn test_mm_comilt_sh() {
17509        let a = _mm_set_sh(1.0);
17510        let b = _mm_set_sh(2.0);
17511        let r = _mm_comilt_sh(a, b);
17512        assert_eq!(r, 1);
17513    }
17514
17515    #[simd_test(enable = "avx512fp16")]
17516    unsafe fn test_mm_comineq_sh() {
17517        let a = _mm_set_sh(1.0);
17518        let b = _mm_set_sh(2.0);
17519        let r = _mm_comineq_sh(a, b);
17520        assert_eq!(r, 1);
17521    }
17522
17523    #[simd_test(enable = "avx512fp16")]
17524    unsafe fn test_mm_ucomieq_sh() {
17525        let a = _mm_set_sh(1.0);
17526        let b = _mm_set_sh(1.0);
17527        let r = _mm_ucomieq_sh(a, b);
17528        assert_eq!(r, 1);
17529    }
17530
17531    #[simd_test(enable = "avx512fp16")]
17532    unsafe fn test_mm_ucomige_sh() {
17533        let a = _mm_set_sh(2.0);
17534        let b = _mm_set_sh(1.0);
17535        let r = _mm_ucomige_sh(a, b);
17536        assert_eq!(r, 1);
17537    }
17538
17539    #[simd_test(enable = "avx512fp16")]
17540    unsafe fn test_mm_ucomigt_sh() {
17541        let a = _mm_set_sh(2.0);
17542        let b = _mm_set_sh(1.0);
17543        let r = _mm_ucomigt_sh(a, b);
17544        assert_eq!(r, 1);
17545    }
17546
17547    #[simd_test(enable = "avx512fp16")]
17548    unsafe fn test_mm_ucomile_sh() {
17549        let a = _mm_set_sh(1.0);
17550        let b = _mm_set_sh(2.0);
17551        let r = _mm_ucomile_sh(a, b);
17552        assert_eq!(r, 1);
17553    }
17554
17555    #[simd_test(enable = "avx512fp16")]
17556    unsafe fn test_mm_ucomilt_sh() {
17557        let a = _mm_set_sh(1.0);
17558        let b = _mm_set_sh(2.0);
17559        let r = _mm_ucomilt_sh(a, b);
17560        assert_eq!(r, 1);
17561    }
17562
17563    #[simd_test(enable = "avx512fp16")]
17564    unsafe fn test_mm_ucomineq_sh() {
17565        let a = _mm_set_sh(1.0);
17566        let b = _mm_set_sh(2.0);
17567        let r = _mm_ucomineq_sh(a, b);
17568        assert_eq!(r, 1);
17569    }
17570
    // Load tests: an aligned load from the address of a vector must reproduce
    // it exactly. addr_of!(..).cast() yields the *const f16 the intrinsic wants.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_load_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_load_ph(addr_of!(a).cast());
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_load_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_load_ph(addr_of!(a).cast());
        assert_eq_m256h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_load_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_load_ph(addr_of!(a).cast());
        assert_eq_m512h(a, b);
    }

    // Scalar load: _mm_set_sh leaves the upper lanes zero, matching the zeroed
    // upper lanes of the loaded result, so whole-vector equality holds.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_load_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_load_sh(addr_of!(a).cast());
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_load_sh() {
        let a = _mm_set_sh(1.0);
        let src = _mm_set_sh(2.);
        // Mask bit set: the scalar is loaded from memory.
        let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast());
        assert_eq_m128h(a, b);
        // Mask bit clear: the scalar comes from src instead.
        let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast());
        assert_eq_m128h(src, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_load_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_maskz_load_sh(1, addr_of!(a).cast());
        assert_eq_m128h(a, b);
        // Mask bit clear with zeroing semantics: everything is zero.
        let b = _mm_maskz_load_sh(0, addr_of!(a).cast());
        assert_eq_m128h(_mm_setzero_ph(), b);
    }
17623
    // Unaligned loads from plain arrays; setr lists lanes low-to-high, matching
    // the array's memory order.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_loadu_ph() {
        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let r = _mm_loadu_ph(array.as_ptr());
        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_loadu_ph() {
        let array = [
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ];
        let r = _mm256_loadu_ph(array.as_ptr());
        let e = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_loadu_ph() {
        let array = [
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        ];
        let r = _mm512_loadu_ph(array.as_ptr());
        let e = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
17659
    // move_sh tests: lane 0 of the result comes from b (or src/zero under a
    // mask), the remaining lanes are copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_move_sh() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_sh(9.0);
        let r = _mm_move_sh(a, b);
        // Lane 0 (the last set_ph argument) is replaced by 9.0 from b.
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_move_sh() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_sh(9.0);
        let src = _mm_set_sh(10.0);
        // Mask bit clear: lane 0 falls back to src's scalar (10.0).
        let r = _mm_mask_move_sh(src, 0, a, b);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_maskz_move_sh() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_sh(9.0);
        // Mask bit clear with zeroing semantics: lane 0 becomes 0.0.
        let r = _mm_maskz_move_sh(0, a, b);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
        assert_eq_m128h(r, e);
    }
17687
    // Store tests: writing a vector into a zeroed destination and comparing
    // verifies the aligned store round-trips all lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_store_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut b = _mm_setzero_ph();
        _mm_store_ph(addr_of_mut!(b).cast(), a);
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_store_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut b = _mm256_setzero_ph();
        _mm256_store_ph(addr_of_mut!(b).cast(), a);
        assert_eq_m256h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_store_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let mut b = _mm512_setzero_ph();
        _mm512_store_ph(addr_of_mut!(b).cast(), a);
        assert_eq_m512h(a, b);
    }

    // Scalar store: only lane 0 is written; b's other lanes stay zero, which
    // matches set_sh's zeroed upper lanes, so whole-vector equality holds.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_store_sh() {
        let a = _mm_set_sh(1.0);
        let mut b = _mm_setzero_ph();
        _mm_store_sh(addr_of_mut!(b).cast(), a);
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_store_sh() {
        let a = _mm_set_sh(1.0);
        let mut b = _mm_setzero_ph();
        // Mask bit clear: nothing is written, destination stays zero.
        _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
        assert_eq_m128h(_mm_setzero_ph(), b);
        // Mask bit set: the scalar is stored.
        _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
        assert_eq_m128h(a, b);
    }
17735
    // Unaligned store tests: store into a plain array, then load it back and
    // compare against the original vector.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_storeu_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut array = [0.0; 8];
        _mm_storeu_ph(array.as_mut_ptr(), a);
        assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr()));
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_storeu_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut array = [0.0; 16];
        _mm256_storeu_ph(array.as_mut_ptr(), a);
        assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr()));
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_storeu_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let mut array = [0.0; 32];
        _mm512_storeu_ph(array.as_mut_ptr(), a);
        assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr()));
    }
17765
    // 128-bit packed f16 addition: a is ascending, b descending, so every lane
    // pair sums to 9.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_add_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_add_ph(a, b);
        let e = _mm_set1_ph(9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_add_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        // Masked lanes (mask bit clear) keep src; the rest get the 9.0 sums.
        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_maskz_add_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        // Zeroing variant: masked-off lanes become 0.0 instead of src.
        let r = _mm_maskz_add_ph(0b01010101, a, b);
        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
        assert_eq_m128h(r, e);
    }
17793
    // 256-bit packed f16 addition: ascending + descending lanes all sum to 17.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_add_ph(a, b);
        let e = _mm256_set1_ph(17.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        // Masked lanes (mask bit clear) keep src; the rest get the 17.0 sums.
        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_maskz_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        // Zeroing variant: masked-off lanes become 0.0 instead of src.
        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
        );
        assert_eq_m256h(r, e);
    }
17839
    // 512-bit packed f16 addition: ascending + descending lanes all sum to 33.0.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_add_ph(a, b);
        let e = _mm512_set1_ph(33.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        // Masked lanes (mask bit clear) keep src; the rest get the 33.0 sums.
        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_maskz_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        // Zeroing variant: masked-off lanes become 0.0 instead of src.
        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }
17900
    // Same addition checks through the _round variants with an explicit
    // round-to-nearest + no-exception control; expected results are unchanged.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(33.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        // Masked lanes (mask bit clear) keep src; the rest get the 33.0 sums.
        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        // Zeroing variant: masked-off lanes become 0.0 instead of src.
        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }
17970
17971    #[simd_test(enable = "avx512fp16,avx512vl")]
17972    unsafe fn test_mm_add_round_sh() {
17973        let a = _mm_set_sh(1.0);
17974        let b = _mm_set_sh(2.0);
17975        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17976        let e = _mm_set_sh(3.0);
17977        assert_eq_m128h(r, e);
17978    }
17979
17980    #[simd_test(enable = "avx512fp16,avx512vl")]
17981    unsafe fn test_mm_mask_add_round_sh() {
17982        let a = _mm_set_sh(1.0);
17983        let b = _mm_set_sh(2.0);
17984        let src = _mm_set_sh(4.0);
17985        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17986            src, 0, a, b,
17987        );
17988        let e = _mm_set_sh(4.0);
17989        assert_eq_m128h(r, e);
17990        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17991            src, 1, a, b,
17992        );
17993        let e = _mm_set_sh(3.0);
17994        assert_eq_m128h(r, e);
17995    }
17996
17997    #[simd_test(enable = "avx512fp16,avx512vl")]
17998    unsafe fn test_mm_maskz_add_round_sh() {
17999        let a = _mm_set_sh(1.0);
18000        let b = _mm_set_sh(2.0);
18001        let r =
18002            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18003        let e = _mm_set_sh(0.0);
18004        assert_eq_m128h(r, e);
18005        let r =
18006            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18007        let e = _mm_set_sh(3.0);
18008        assert_eq_m128h(r, e);
18009    }
18010
18011    #[simd_test(enable = "avx512fp16,avx512vl")]
18012    const unsafe fn test_mm_add_sh() {
18013        let a = _mm_set_sh(1.0);
18014        let b = _mm_set_sh(2.0);
18015        let r = _mm_add_sh(a, b);
18016        let e = _mm_set_sh(3.0);
18017        assert_eq_m128h(r, e);
18018    }
18019
18020    #[simd_test(enable = "avx512fp16,avx512vl")]
18021    const unsafe fn test_mm_mask_add_sh() {
18022        let a = _mm_set_sh(1.0);
18023        let b = _mm_set_sh(2.0);
18024        let src = _mm_set_sh(4.0);
18025        let r = _mm_mask_add_sh(src, 0, a, b);
18026        let e = _mm_set_sh(4.0);
18027        assert_eq_m128h(r, e);
18028        let r = _mm_mask_add_sh(src, 1, a, b);
18029        let e = _mm_set_sh(3.0);
18030        assert_eq_m128h(r, e);
18031    }
18032
18033    #[simd_test(enable = "avx512fp16,avx512vl")]
18034    const unsafe fn test_mm_maskz_add_sh() {
18035        let a = _mm_set_sh(1.0);
18036        let b = _mm_set_sh(2.0);
18037        let r = _mm_maskz_add_sh(0, a, b);
18038        let e = _mm_set_sh(0.0);
18039        assert_eq_m128h(r, e);
18040        let r = _mm_maskz_add_sh(1, a, b);
18041        let e = _mm_set_sh(3.0);
18042        assert_eq_m128h(r, e);
18043    }
18044
18045    #[simd_test(enable = "avx512fp16,avx512vl")]
18046    const unsafe fn test_mm_sub_ph() {
18047        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18048        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18049        let r = _mm_sub_ph(a, b);
18050        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
18051        assert_eq_m128h(r, e);
18052    }
18053
18054    #[simd_test(enable = "avx512fp16,avx512vl")]
18055    const unsafe fn test_mm_mask_sub_ph() {
18056        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18057        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18058        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18059        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
18060        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
18061        assert_eq_m128h(r, e);
18062    }
18063
18064    #[simd_test(enable = "avx512fp16,avx512vl")]
18065    const unsafe fn test_mm_maskz_sub_ph() {
18066        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18067        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18068        let r = _mm_maskz_sub_ph(0b01010101, a, b);
18069        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
18070        assert_eq_m128h(r, e);
18071    }
18072
    // Lane-wise a - b across all 16 half-precision lanes; `_mm256_set_ph` lists
    // lanes from e15 down to e0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_sub_ph(a, b);
        let e = _mm256_set_ph(
            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
            15.0,
        );
        assert_eq_m256h(r, e);
    }
18088
    // Merge-masked subtract: even lanes (set bits of 0b0101010101010101) get a-b,
    // odd lanes keep the corresponding lane of `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
        );
        assert_eq_m256h(r, e);
    }
18106
    // Zero-masked subtract: even lanes get a-b, odd lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_maskz_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
        );
        assert_eq_m256h(r, e);
    }
18121
    // Lane-wise a - b across all 32 half-precision lanes.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_sub_ph(a, b);
        let e = _mm512_set_ph(
            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
            23.0, 25.0, 27.0, 29.0, 31.0,
        );
        assert_eq_m512h(r, e);
    }
18142
    // Merge-masked subtract: even lanes (set mask bits) get a-b, odd lanes keep
    // the matching `src` lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
        );
        assert_eq_m512h(r, e);
    }
18166
    // Zero-masked subtract: even lanes get a-b, odd lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_maskz_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
        );
        assert_eq_m512h(r, e);
    }
18186
    // Subtract with round-to-nearest + suppress-all-exceptions; the expected
    // values match the plain (non-rounding) subtraction test above.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set_ph(
            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
            23.0, 25.0, 27.0, 29.0, 31.0,
        );
        assert_eq_m512h(r, e);
    }
18207
    // Merge-masked subtract with SAE rounding: even lanes get a-b, odd lanes
    // keep the matching `src` lane.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
        );
        assert_eq_m512h(r, e);
    }
18236
    // Zero-masked subtract with SAE rounding: even lanes get a-b, odd lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
        );
        assert_eq_m512h(r, e);
    }
18260
18261    #[simd_test(enable = "avx512fp16,avx512vl")]
18262    unsafe fn test_mm_sub_round_sh() {
18263        let a = _mm_set_sh(1.0);
18264        let b = _mm_set_sh(2.0);
18265        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18266        let e = _mm_set_sh(-1.0);
18267        assert_eq_m128h(r, e);
18268    }
18269
18270    #[simd_test(enable = "avx512fp16,avx512vl")]
18271    unsafe fn test_mm_mask_sub_round_sh() {
18272        let a = _mm_set_sh(1.0);
18273        let b = _mm_set_sh(2.0);
18274        let src = _mm_set_sh(4.0);
18275        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18276            src, 0, a, b,
18277        );
18278        let e = _mm_set_sh(4.0);
18279        assert_eq_m128h(r, e);
18280        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18281            src, 1, a, b,
18282        );
18283        let e = _mm_set_sh(-1.0);
18284        assert_eq_m128h(r, e);
18285    }
18286
18287    #[simd_test(enable = "avx512fp16,avx512vl")]
18288    unsafe fn test_mm_maskz_sub_round_sh() {
18289        let a = _mm_set_sh(1.0);
18290        let b = _mm_set_sh(2.0);
18291        let r =
18292            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18293        let e = _mm_set_sh(0.0);
18294        assert_eq_m128h(r, e);
18295        let r =
18296            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18297        let e = _mm_set_sh(-1.0);
18298        assert_eq_m128h(r, e);
18299    }
18300
18301    #[simd_test(enable = "avx512fp16,avx512vl")]
18302    const unsafe fn test_mm_sub_sh() {
18303        let a = _mm_set_sh(1.0);
18304        let b = _mm_set_sh(2.0);
18305        let r = _mm_sub_sh(a, b);
18306        let e = _mm_set_sh(-1.0);
18307        assert_eq_m128h(r, e);
18308    }
18309
18310    #[simd_test(enable = "avx512fp16,avx512vl")]
18311    const unsafe fn test_mm_mask_sub_sh() {
18312        let a = _mm_set_sh(1.0);
18313        let b = _mm_set_sh(2.0);
18314        let src = _mm_set_sh(4.0);
18315        let r = _mm_mask_sub_sh(src, 0, a, b);
18316        let e = _mm_set_sh(4.0);
18317        assert_eq_m128h(r, e);
18318        let r = _mm_mask_sub_sh(src, 1, a, b);
18319        let e = _mm_set_sh(-1.0);
18320        assert_eq_m128h(r, e);
18321    }
18322
18323    #[simd_test(enable = "avx512fp16,avx512vl")]
18324    const unsafe fn test_mm_maskz_sub_sh() {
18325        let a = _mm_set_sh(1.0);
18326        let b = _mm_set_sh(2.0);
18327        let r = _mm_maskz_sub_sh(0, a, b);
18328        let e = _mm_set_sh(0.0);
18329        assert_eq_m128h(r, e);
18330        let r = _mm_maskz_sub_sh(1, a, b);
18331        let e = _mm_set_sh(-1.0);
18332        assert_eq_m128h(r, e);
18333    }
18334
18335    #[simd_test(enable = "avx512fp16,avx512vl")]
18336    const unsafe fn test_mm_mul_ph() {
18337        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18338        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18339        let r = _mm_mul_ph(a, b);
18340        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
18341        assert_eq_m128h(r, e);
18342    }
18343
18344    #[simd_test(enable = "avx512fp16,avx512vl")]
18345    const unsafe fn test_mm_mask_mul_ph() {
18346        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18347        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18348        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18349        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
18350        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
18351        assert_eq_m128h(r, e);
18352    }
18353
18354    #[simd_test(enable = "avx512fp16,avx512vl")]
18355    const unsafe fn test_mm_maskz_mul_ph() {
18356        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18357        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18358        let r = _mm_maskz_mul_ph(0b01010101, a, b);
18359        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
18360        assert_eq_m128h(r, e);
18361    }
18362
    // Lane-wise a * b across all 16 half-precision lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_mul_ph(a, b);
        let e = _mm256_set_ph(
            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
            30.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
18378
    // Merge-masked multiply: even lanes (set bits of 0b0101010101010101) get a*b,
    // odd lanes keep the corresponding lane of `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
        );
        assert_eq_m256h(r, e);
    }
18396
    // Zero-masked multiply: even lanes get a*b, odd lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_maskz_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }
18411
    // Lane-wise a * b across all 32 half-precision lanes.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_mul_ph(a, b);
        let e = _mm512_set_ph(
            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
18432
    // Merge-masked multiply: even lanes (set mask bits) get a*b, odd lanes keep
    // the matching `src` lane.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
        );
        assert_eq_m512h(r, e);
    }
18456
    // Zero-masked multiply: even lanes get a*b, odd lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_maskz_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
18476
    // Multiply with round-to-nearest + suppress-all-exceptions; the expected
    // values match the plain (non-rounding) multiply test above.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set_ph(
            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
18497
    // Merge-masked multiply with SAE rounding: even lanes get a*b, odd lanes
    // keep the matching `src` lane.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
        );
        assert_eq_m512h(r, e);
    }
18526
    // Zero-masked multiply with SAE rounding: even lanes get a*b, odd lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
18550
18551    #[simd_test(enable = "avx512fp16,avx512vl")]
18552    unsafe fn test_mm_mul_round_sh() {
18553        let a = _mm_set_sh(1.0);
18554        let b = _mm_set_sh(2.0);
18555        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18556        let e = _mm_set_sh(2.0);
18557        assert_eq_m128h(r, e);
18558    }
18559
18560    #[simd_test(enable = "avx512fp16,avx512vl")]
18561    unsafe fn test_mm_mask_mul_round_sh() {
18562        let a = _mm_set_sh(1.0);
18563        let b = _mm_set_sh(2.0);
18564        let src = _mm_set_sh(4.0);
18565        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18566            src, 0, a, b,
18567        );
18568        let e = _mm_set_sh(4.0);
18569        assert_eq_m128h(r, e);
18570        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18571            src, 1, a, b,
18572        );
18573        let e = _mm_set_sh(2.0);
18574        assert_eq_m128h(r, e);
18575    }
18576
18577    #[simd_test(enable = "avx512fp16,avx512vl")]
18578    unsafe fn test_mm_maskz_mul_round_sh() {
18579        let a = _mm_set_sh(1.0);
18580        let b = _mm_set_sh(2.0);
18581        let r =
18582            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18583        let e = _mm_set_sh(0.0);
18584        assert_eq_m128h(r, e);
18585        let r =
18586            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18587        let e = _mm_set_sh(2.0);
18588        assert_eq_m128h(r, e);
18589    }
18590
18591    #[simd_test(enable = "avx512fp16,avx512vl")]
18592    const unsafe fn test_mm_mul_sh() {
18593        let a = _mm_set_sh(1.0);
18594        let b = _mm_set_sh(2.0);
18595        let r = _mm_mul_sh(a, b);
18596        let e = _mm_set_sh(2.0);
18597        assert_eq_m128h(r, e);
18598    }
18599
18600    #[simd_test(enable = "avx512fp16,avx512vl")]
18601    const unsafe fn test_mm_mask_mul_sh() {
18602        let a = _mm_set_sh(1.0);
18603        let b = _mm_set_sh(2.0);
18604        let src = _mm_set_sh(4.0);
18605        let r = _mm_mask_mul_sh(src, 0, a, b);
18606        let e = _mm_set_sh(4.0);
18607        assert_eq_m128h(r, e);
18608        let r = _mm_mask_mul_sh(src, 1, a, b);
18609        let e = _mm_set_sh(2.0);
18610        assert_eq_m128h(r, e);
18611    }
18612
18613    #[simd_test(enable = "avx512fp16,avx512vl")]
18614    const unsafe fn test_mm_maskz_mul_sh() {
18615        let a = _mm_set_sh(1.0);
18616        let b = _mm_set_sh(2.0);
18617        let r = _mm_maskz_mul_sh(0, a, b);
18618        let e = _mm_set_sh(0.0);
18619        assert_eq_m128h(r, e);
18620        let r = _mm_maskz_mul_sh(1, a, b);
18621        let e = _mm_set_sh(2.0);
18622        assert_eq_m128h(r, e);
18623    }
18624
18625    #[simd_test(enable = "avx512fp16,avx512vl")]
18626    const unsafe fn test_mm_div_ph() {
18627        let a = _mm_set1_ph(1.0);
18628        let b = _mm_set1_ph(2.0);
18629        let r = _mm_div_ph(a, b);
18630        let e = _mm_set1_ph(0.5);
18631        assert_eq_m128h(r, e);
18632    }
18633
18634    #[simd_test(enable = "avx512fp16,avx512vl")]
18635    const unsafe fn test_mm_mask_div_ph() {
18636        let a = _mm_set1_ph(1.0);
18637        let b = _mm_set1_ph(2.0);
18638        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18639        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18640        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18641        assert_eq_m128h(r, e);
18642    }
18643
18644    #[simd_test(enable = "avx512fp16,avx512vl")]
18645    const unsafe fn test_mm_maskz_div_ph() {
18646        let a = _mm_set1_ph(1.0);
18647        let b = _mm_set1_ph(2.0);
18648        let r = _mm_maskz_div_ph(0b01010101, a, b);
18649        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18650        assert_eq_m128h(r, e);
18651    }
18652
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_div_ph() {
        // Packed divide, 256-bit: 1.0 / 2.0 = 0.5 in every f16 lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let r = _mm256_div_ph(a, b);
        let e = _mm256_set1_ph(0.5);
        assert_eq_m256h(r, e);
    }
18661
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_div_ph() {
        // Merge-masked divide, 256-bit: alternating mask keeps `src` in even
        // mask positions (clear bits) and writes 0.5 in odd ones (set bits).
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let src = _mm256_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0,
        );
        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
18676
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_maskz_div_ph() {
        // Zero-masked divide, 256-bit: clear mask bits zero the lane.
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }
18687
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_div_ph() {
        // Packed divide, 512-bit: 1.0 / 2.0 = 0.5 in every f16 lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_div_ph(a, b);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }
18696
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask_div_ph() {
        // Merge-masked divide, 512-bit: set mask bits take 0.5, clear bits keep `src`.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18713
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_maskz_div_ph() {
        // Zero-masked divide, 512-bit: clear mask bits zero the lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18725
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_div_round_ph() {
        // Divide with explicit rounding control; result is exact so any
        // rounding mode yields 0.5.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }
18734
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_div_round_ph() {
        // Merge-masked divide with rounding: set mask bits take 0.5, clear
        // bits keep `src`.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18756
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_div_round_ph() {
        // Zero-masked divide with rounding: clear mask bits zero the lane.
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
18772
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_div_round_sh() {
        // Scalar divide with rounding: lane 0 = 1.0 / 2.0 = 0.5.
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18781
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_div_round_sh() {
        // Merge-masked scalar divide with rounding: mask bit clear keeps
        // lane 0 from `src`, mask bit set stores the quotient.
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18798
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_div_round_sh() {
        // Zero-masked scalar divide with rounding: mask bit clear zeroes
        // lane 0, mask bit set stores the quotient.
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18812
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_div_sh() {
        // Scalar divide: lane 0 = 1.0 / 2.0 = 0.5.
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_div_sh(a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18821
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_div_sh() {
        // Merge-masked scalar divide: mask bit clear keeps lane 0 from `src`,
        // mask bit set stores the quotient.
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_div_sh(src, 0, a, b);
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_div_sh(src, 1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18834
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_maskz_div_sh() {
        // Zero-masked scalar divide: mask bit clear zeroes lane 0,
        // mask bit set stores the quotient.
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_maskz_div_sh(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_div_sh(1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }
18846
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mul_pch() {
        // Packed complex multiply: (0 + 1i) * (0 + 1i) = -1 + 0i.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_mul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }
18855
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_mul_pch() {
        // Merge-masked complex multiply: each mask bit covers one complex
        // (re, im) pair; clear bits keep both halves from `src`.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }
18865
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_mul_pch() {
        // Zero-masked complex multiply: clear mask bits zero the whole pair.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_mul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
18874
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mul_pch() {
        // Packed complex multiply, 256-bit: (0 + 1i) * (0 + 1i) = -1 + 0i.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_mul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }
18883
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_mul_pch() {
        // Merge-masked complex multiply, 256-bit: one mask bit per (re, im)
        // pair; clear bits keep `src`.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }
18897
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_mul_pch() {
        // Zero-masked complex multiply, 256-bit: clear bits zero the pair.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
18908
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mul_pch() {
        // Packed complex multiply, 512-bit: (0 + 1i) * (0 + 1i) = -1 + 0i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
18917
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_mul_pch() {
        // Merge-masked complex multiply, 512-bit: one mask bit per (re, im)
        // pair; clear bits keep `src`.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
18935
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_mul_pch() {
        // Zero-masked complex multiply, 512-bit: clear bits zero the pair.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
18947
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mul_round_pch() {
        // Complex multiply with explicit rounding: (0 + 1i)^2 = -1 + 0i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
18956
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_mul_round_pch() {
        // Merge-masked complex multiply with rounding: one mask bit per
        // (re, im) pair; clear bits keep `src`.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
18979
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_mul_round_pch() {
        // Zero-masked complex multiply with rounding: clear bits zero the pair.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
18995
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mul_round_sch() {
        // Scalar complex multiply with rounding: pair 0 = (0 + 1i)^2 = -1 + 0i;
        // upper lanes are copied from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19004
19005    #[simd_test(enable = "avx512fp16,avx512vl")]
19006    unsafe fn test_mm_mask_mul_round_sch() {
19007        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19008        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19009        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19010        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19011            src, 0, a, b,
19012        );
19013        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19014        assert_eq_m128h(r, e);
19015    }
19016
19017    #[simd_test(enable = "avx512fp16,avx512vl")]
19018    unsafe fn test_mm_maskz_mul_round_sch() {
19019        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19020        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19021        let r =
19022            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19023        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19024        assert_eq_m128h(r, e);
19025    }
19026
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mul_sch() {
        // Scalar complex multiply: pair 0 = (0 + 1i)^2 = -1 + 0i;
        // upper lanes are copied from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_mul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19035
19036    #[simd_test(enable = "avx512fp16,avx512vl")]
19037    unsafe fn test_mm_mask_mul_sch() {
19038        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19039        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19040        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19041        let r = _mm_mask_mul_sch(src, 0, a, b);
19042        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19043        assert_eq_m128h(r, e);
19044    }
19045
19046    #[simd_test(enable = "avx512fp16,avx512vl")]
19047    unsafe fn test_mm_maskz_mul_sch() {
19048        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19049        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19050        let r = _mm_maskz_mul_sch(0, a, b);
19051        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19052        assert_eq_m128h(r, e);
19053    }
19054
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmul_pch() {
        // `fmul_pch` alias of `mul_pch`: (0 + 1i) * (0 + 1i) = -1 + 0i.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_fmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }
19063
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmul_pch() {
        // Merge-masked `fmul_pch`: one mask bit per (re, im) pair; clear bits
        // keep `src`.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }
19073
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmul_pch() {
        // Zero-masked `fmul_pch`: clear mask bits zero the whole pair.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_fmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
19082
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fmul_pch() {
        // `fmul_pch` alias, 256-bit: (0 + 1i) * (0 + 1i) = -1 + 0i.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_fmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }
19091
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fmul_pch() {
        // Merge-masked `fmul_pch`, 256-bit: one mask bit per (re, im) pair;
        // clear bits keep `src`.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }
19105
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fmul_pch() {
        // Zero-masked `fmul_pch`, 256-bit: clear bits zero the pair.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19116
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmul_pch() {
        // `fmul_pch` alias, 512-bit: (0 + 1i) * (0 + 1i) = -1 + 0i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
19125
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmul_pch() {
        // Merge-masked `fmul_pch`, 512-bit: one mask bit per (re, im) pair;
        // clear bits keep `src`.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
19143
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmul_pch() {
        // Zero-masked `fmul_pch`, 512-bit: clear bits zero the pair.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19155
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmul_round_pch() {
        // `fmul_round_pch` alias with explicit rounding: (0 + 1i)^2 = -1 + 0i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
19164
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmul_round_pch() {
        // Merge-masked `fmul_round_pch`: one mask bit per (re, im) pair;
        // clear bits keep `src`.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
19187
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmul_round_pch() {
        // Zero-masked `fmul_round_pch`: clear bits zero the pair.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19203
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmul_round_sch() {
        // `fmul_round_sch` alias: pair 0 = (0 + 1i)^2 = -1 + 0i;
        // upper lanes are copied from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19212
19213    #[simd_test(enable = "avx512fp16,avx512vl")]
19214    unsafe fn test_mm_mask_fmul_round_sch() {
19215        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19216        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19217        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19218        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19219            src, 0, a, b,
19220        );
19221        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19222        assert_eq_m128h(r, e);
19223    }
19224
19225    #[simd_test(enable = "avx512fp16,avx512vl")]
19226    unsafe fn test_mm_maskz_fmul_round_sch() {
19227        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19228        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19229        let r =
19230            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19231        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19232        assert_eq_m128h(r, e);
19233    }
19234
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmul_sch() {
        // `fmul_sch` alias: pair 0 = (0 + 1i)^2 = -1 + 0i;
        // upper lanes are copied from `a`.
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_fmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19243
19244    #[simd_test(enable = "avx512fp16,avx512vl")]
19245    unsafe fn test_mm_mask_fmul_sch() {
19246        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19247        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19248        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19249        let r = _mm_mask_fmul_sch(src, 0, a, b);
19250        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19251        assert_eq_m128h(r, e);
19252    }
19253
19254    #[simd_test(enable = "avx512fp16,avx512vl")]
19255    unsafe fn test_mm_maskz_fmul_sch() {
19256        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19257        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19258        let r = _mm_maskz_fmul_sch(0, a, b);
19259        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19260        assert_eq_m128h(r, e);
19261    }
19262
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cmul_pch() {
        // Complex-conjugate multiply: (0 + 1i) * conj(0 - 1i) = (0 + 1i)(0 + 1i)
        // = -1 + 0i.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_cmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }
19271
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cmul_pch() {
        // Merge-masked conjugate multiply: one mask bit per (re, im) pair;
        // clear bits keep `src`.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }
19281
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cmul_pch() {
        // Zero-masked conjugate multiply: clear mask bits zero the pair.
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_maskz_cmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
19290
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cmul_pch() {
        // Conjugate multiply, 256-bit: (0 + 1i) * conj(0 - 1i) = -1 + 0i.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_cmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }
19299
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cmul_pch() {
        // Merge-masked conjugate multiply, 256-bit: one mask bit per
        // (re, im) pair; clear bits keep `src`.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }
19313
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cmul_pch() {
        // Zero-masked conjugate multiply, 256-bit: clear bits zero the pair.
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19324
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmul_pch() {
        // Conjugate multiply, 512-bit: (0 + 1i) * conj(0 - 1i) = -1 + 0i.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
19333
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmul_pch() {
        // Merge-masked conjugate multiply, 512-bit: one mask bit per
        // (re, im) pair; clear bits keep `src`.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
19351
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cmul_pch() {
        // Zero-masked conjugate multiply, 512-bit: clear bits zero the pair.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19363
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmul_round_pch() {
        // Conjugate multiply with explicit rounding: result -1 + 0i per pair.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
19372
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmul_round_pch() {
        // Merge-masked conjugate multiply with rounding: one mask bit per
        // (re, im) pair; clear bits keep `src`.
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
19395
    // maskz_cmul_round_pch: zero-masked a * conj(b) with rounding control; lanes
    // with a clear mask bit are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19411
    // cmul_sch (scalar): only the lowest complex pair computes a * conj(b);
    // elements 2..8 of the result are copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_cmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19420
    // mask_cmul_sch: with mask bit 0 the low complex pair comes from src; the
    // upper elements are still copied from a (scalar-op convention).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_cmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19430
    // maskz_cmul_sch: with mask bit 0 the low complex pair is zeroed; upper
    // elements are copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_maskz_cmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19439
    // cmul_round_sch: scalar a * conj(b) on the low pair with explicit rounding
    // control; upper elements copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19448
    // mask_cmul_round_sch: masked-out (bit 0) low pair comes from src; upper
    // elements from a; rounding control is round-to-nearest + no exceptions.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19460
    // maskz_cmul_round_sch: masked-out (bit 0) low pair is zeroed; upper
    // elements copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r =
            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19470
    // fcmul_pch (alias of cmul_pch, VFCMULCPH): a * conj(b) per complex lane;
    // (0+1i)*conj(0-1i) = -1+0i.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_fcmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }
19479
    // mask_fcmul_pch: merge-masked a * conj(b); complex lanes with a clear mask
    // bit keep the corresponding src elements.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }
19489
    // maskz_fcmul_pch: zero-masked a * conj(b); lanes with a clear mask bit are
    // zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
19498
    // 256-bit fcmul_pch: a * conj(b) per complex lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_fcmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }
19507
    // 256-bit merge-masked fcmul_pch: unselected complex lanes keep src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }
19521
    // 256-bit zero-masked fcmul_pch: unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19532
    // 512-bit fcmul_pch: a * conj(b) per complex lane.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_fcmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
19541
    // 512-bit merge-masked fcmul_pch: unselected complex lanes keep src.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
19559
    // 512-bit zero-masked fcmul_pch: unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19571
    // fcmul_round_pch: a * conj(b) with explicit rounding control.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }
19580
    // mask_fcmul_round_pch: merge-masked a * conj(b) with rounding control;
    // unselected complex lanes keep src.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
19603
    // maskz_fcmul_round_pch: zero-masked a * conj(b) with rounding control;
    // unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19619
    // fcmul_sch: scalar a * conj(b) on the low complex pair; upper elements
    // copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_fcmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19628
    // mask_fcmul_sch: masked-out (bit 0) low pair comes from src; upper
    // elements from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fcmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19638
    // maskz_fcmul_sch: masked-out (bit 0) low pair is zeroed; upper elements
    // copied from a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_maskz_fcmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19647
    // fcmul_round_sch: scalar a * conj(b) with explicit rounding control.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19656
    // mask_fcmul_round_sch: masked-out (bit 0) low pair comes from src; upper
    // elements from a; explicit rounding control.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19668
    // maskz_fcmul_round_sch: masked-out (bit 0) low pair is zeroed; upper
    // elements copied from a; explicit rounding control.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r =
            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
19678
    // abs_ph: per-element absolute value of f16 lanes (sign bit cleared);
    // const fn, so also exercised at compile time.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_abs_ph() {
        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
        let r = _mm_abs_ph(a);
        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
        assert_eq_m128h(r, e);
    }
19686
    // 256-bit abs_ph: per-element absolute value of f16 lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_abs_ph() {
        let a = _mm256_set_ph(
            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
            -14.0,
        );
        let r = _mm256_abs_ph(a);
        let e = _mm256_set_ph(
            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
        );
        assert_eq_m256h(r, e);
    }
19699
    // 512-bit abs_ph: per-element absolute value of f16 lanes.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_abs_ph() {
        let a = _mm512_set_ph(
            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
            27.0, -28.0, 29.0, -30.0,
        );
        let r = _mm512_abs_ph(a);
        let e = _mm512_set_ph(
            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
            29.0, 30.0,
        );
        assert_eq_m512h(r, e);
    }
19715
    // conj_pch: complex conjugate — negates the imaginary (odd-index) element
    // of each complex pair; 0+1i -> 0-1i.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let r = _mm_conj_pch(a);
        let e = _mm_set1_pch(0.0, -1.0);
        assert_eq_m128h(r, e);
    }
19723
    // mask_conj_pch: merge-masked conjugate; unselected complex lanes keep src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_conj_pch(src, 0b0101, a);
        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }
19732
    // maskz_conj_pch: zero-masked conjugate; unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_maskz_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_conj_pch(0b0101, a);
        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
19740
    // 256-bit conj_pch: negates the imaginary element of each complex pair.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_conj_pch(a);
        let e = _mm256_set1_pch(0.0, -1.0);
        assert_eq_m256h(r, e);
    }
19748
    // 256-bit merge-masked conj_pch: unselected complex lanes keep src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
        let e = _mm256_setr_ph(
            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }
19761
    // 256-bit zero-masked conj_pch: unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_maskz_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_conj_pch(0b01010101, a);
        let e = _mm256_setr_ph(
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19771
    // 512-bit conj_pch: negates the imaginary element of each complex pair.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_conj_pch(a);
        let e = _mm512_set1_pch(0.0, -1.0);
        assert_eq_m512h(r, e);
    }
19779
    // 512-bit merge-masked conj_pch: unselected complex lanes keep src.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
        let e = _mm512_setr_ph(
            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }
19796
    // 512-bit zero-masked conj_pch: unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_maskz_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
        let e = _mm512_setr_ph(
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19807
    // fmadd_pch: complex fused multiply-add a*b + c per 2×f16 lane;
    // (0+1i)(0+2i) + (0+3i) = -2+3i.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_fmadd_pch(a, b, c);
        let e = _mm_set1_pch(-2.0, 3.0);
        assert_eq_m128h(r, e);
    }
19817
    // mask_fmadd_pch: merge-masked a*b + c; unselected complex lanes keep a
    // (the first operand acts as the pass-through source).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }
19827
    // mask3_fmadd_pch: merge-masked a*b + c; unselected complex lanes keep c
    // (the third operand acts as the pass-through source).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
        assert_eq_m128h(r, e);
    }
19837
    // maskz_fmadd_pch: zero-masked a*b + c; unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
19847
    // 256-bit fmadd_pch: complex a*b + c per lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_fmadd_pch(a, b, c);
        let e = _mm256_set1_pch(-2.0, 3.0);
        assert_eq_m256h(r, e);
    }
19857
    // 256-bit mask_fmadd_pch: unselected complex lanes keep a.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
        let e = _mm256_setr_ph(
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
19869
    // 256-bit mask3_fmadd_pch: unselected complex lanes keep c.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
        let e = _mm256_setr_ph(
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m256h(r, e);
    }
19881
    // 256-bit maskz_fmadd_pch: unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
        let e = _mm256_setr_ph(
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
19893
    // 512-bit fmadd_pch: complex a*b + c per lane.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_fmadd_pch(a, b, c);
        let e = _mm512_set1_pch(-2.0, 3.0);
        assert_eq_m512h(r, e);
    }
19903
    // 512-bit mask_fmadd_pch: unselected complex lanes keep a.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
19916
    // 512-bit mask3_fmadd_pch: unselected complex lanes keep c.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }
19929
    // 512-bit maskz_fmadd_pch: unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
19942
    // fmadd_round_pch: complex a*b + c with explicit rounding control.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r =
            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_pch(-2.0, 3.0);
        assert_eq_m512h(r, e);
    }
19953
    // mask_fmadd_round_pch: merge-masked complex a*b + c with rounding control;
    // unselected complex lanes keep a.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b0101010101010101,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
19971
    // mask3_fmadd_round_pch: merge-masked complex a*b + c with rounding control;
    // unselected complex lanes keep c.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b0101010101010101,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }
19989
    // maskz_fmadd_round_pch: zero-masked complex a*b + c with rounding control;
    // unselected complex lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
20007
    // fmadd_sch: scalar complex a*b + c on the low pair; upper elements copied
    // from a. (0+1i)(0+2i) + (0+3i) = -2+3i.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fmadd_sch(a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20017
    // mask_fmadd_sch: checks both mask states — bit 0 keeps the low pair from a,
    // bit 1 computes a*b + c; upper elements come from a in both cases.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fmadd_sch(a, 0, b, c);
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_sch(a, 1, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20030
    // mask3_fmadd_sch: checks both mask states — bit 0 keeps the low pair from c,
    // bit 1 computes a*b + c; upper elements come from c in both cases.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }
20043
    // maskz_fmadd_sch: checks both mask states — bit 0 zeroes the low pair,
    // bit 1 computes a*b + c; upper elements come from a in both cases.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fmadd_sch(0, a, b, c);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_sch(1, a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20056
    // fmadd_round_sch: scalar complex a*b + c with explicit rounding control.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20066
    // mask_fmadd_round_sch: both mask states with rounding control — bit 0
    // keeps the low pair from a, bit 1 computes a*b + c.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20083
20084    #[simd_test(enable = "avx512fp16,avx512vl")]
20085    unsafe fn test_mm_mask3_fmadd_round_sch() {
20086        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20087        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20088        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20089        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20090            a, b, c, 0,
20091        );
20092        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20093        assert_eq_m128h(r, e);
20094        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20095            a, b, c, 1,
20096        );
20097        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20098        assert_eq_m128h(r, e);
20099    }
20100
20101    #[simd_test(enable = "avx512fp16,avx512vl")]
20102    unsafe fn test_mm_maskz_fmadd_round_sch() {
20103        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20104        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20105        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20106        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20107            0, a, b, c,
20108        );
20109        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20110        assert_eq_m128h(r, e);
20111        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20112            1, a, b, c,
20113        );
20114        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20115        assert_eq_m128h(r, e);
20116    }
20117
    // Packed complex conjugate-FMA: every (re, im) pair computes a*conj(b)+c.
    // (0+1i)*(0-2i) + (0+3i) = 2+3i, so each pair becomes (2.0, 3.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_fcmadd_pch(a, b, c);
        let e = _mm_set1_pch(2.0, 3.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked: one mask bit per complex pair (4 pairs in 128 bits).
    // Mask 0b0101 -> pairs 0 and 2 computed (2.0, 3.0), pairs 1 and 3 kept
    // from `a` (0.0, 1.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    // mask3 merges unselected pairs from `c` (0.0, 3.0) instead of `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked: unselected pairs are zeroed rather than merged.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }
20157
    // 256-bit packed complex conjugate-FMA: a*conj(b)+c per (re, im) pair.
    // (0+1i)*(0-2i) + (0+3i) = 2+3i for all 8 pairs.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_fcmadd_pch(a, b, c);
        let e = _mm256_set1_pch(2.0, 3.0);
        assert_eq_m256h(r, e);
    }

    // Merge-masked: alternating mask 0b01010101 over the 8 complex pairs;
    // even pairs computed (2.0, 3.0), odd pairs kept from `a` (0.0, 1.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    // mask3 merges unselected pairs from `c` (0.0, 3.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m256h(r, e);
    }

    // Zero-masked: unselected pairs are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }
20203
    // 512-bit packed complex conjugate-FMA: a*conj(b)+c per (re, im) pair.
    // (0+1i)*(0-2i) + (0+3i) = 2+3i for all 16 pairs.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_fcmadd_pch(a, b, c);
        let e = _mm512_set1_pch(2.0, 3.0);
        assert_eq_m512h(r, e);
    }

    // Merge-masked: 16-bit mask, one bit per complex pair; even pairs computed
    // (2.0, 3.0), odd pairs kept from `a` (0.0, 1.0).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    // mask3 merges unselected pairs from `c` (0.0, 3.0).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }

    // Zero-masked: unselected pairs are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
20252
    // Same arithmetic as the _mm512_fcmadd_pch tests, but through the
    // rounding-control variants (nearest, exceptions suppressed). Inputs are
    // exact, so the rounding mode does not change the results.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r =
            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_pch(2.0, 3.0);
        assert_eq_m512h(r, e);
    }

    // Merge-masked with rounding: even pairs computed, odd pairs from `a`.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b0101010101010101,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    // mask3 with rounding: unselected pairs come from `c` (0.0, 3.0).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b0101010101010101,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }

    // Zero-masked with rounding: unselected pairs are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }
20317
    // Scalar complex conjugate-FMA on the low (re, im) pair:
    // (0+1i)*(0-2i) + (0+3i) = 2+3i -> low pair (2.0, 3.0); elements 2..7
    // copy from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_sch(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked: mask 0 keeps the low pair from `a` (result == a);
    // mask 1 writes the computed (2.0, 3.0) pair.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // mask3 merges into `c`: mask 0 -> result == c; mask 1 -> computed low
    // pair with the upper elements still taken from `c`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked: mask 0 zeroes the low pair (upper elements from `a`).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20366
    // Rounding-control versions of the fcmadd_sch tests above (nearest,
    // exceptions suppressed); same exact-input arithmetic, same expectations.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked with rounding: mask 0 -> result == a; mask 1 -> computed
    // low pair (2.0, 3.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    // mask3 with rounding: mask 0 -> result == c; mask 1 -> computed low pair
    // with upper elements from `c`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked with rounding: mask 0 zeroes the low pair.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }
20427
    // Packed real f16 FMA, element-wise: 1.0 * 2.0 + 3.0 = 5.0 in every lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmadd_ph(a, b, c);
        let e = _mm_set1_ph(5.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked: one mask bit per f16 lane. With 0b01010101, selected lanes
    // hold 5.0 and unselected lanes keep 1.0 from `a` (note `_mm_set_ph`
    // lists elements high-to-low, so the literal order is reversed).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
        assert_eq_m128h(r, e);
    }

    // mask3 merges unselected lanes from `c` (3.0) instead of `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask3_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_maskz_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
        assert_eq_m128h(r, e);
    }
20467
    // 256-bit packed real f16 FMA: 1.0 * 2.0 + 3.0 = 5.0 in all 16 lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmadd_ph(a, b, c);
        let e = _mm256_set1_ph(5.0);
        assert_eq_m256h(r, e);
    }

    // Merge-masked over 16 lanes: selected lanes 5.0, unselected keep 1.0
    // from `a` (`set_ph` literal order is high-to-low).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    // mask3 merges unselected lanes from `c` (3.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask3_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    // Zero-masked: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_maskz_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }
20513
    // 512-bit packed real f16 FMA: 1.0 * 2.0 + 3.0 = 5.0 in all 32 lanes.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmadd_ph(a, b, c);
        let e = _mm512_set1_ph(5.0);
        assert_eq_m512h(r, e);
    }

    // Merge-masked over 32 lanes with an alternating mask: selected lanes
    // 5.0, unselected keep 1.0 from `a`.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    // mask3 merges unselected lanes from `c` (3.0).
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask3_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    // Zero-masked: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_maskz_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20562
    // Rounding-control versions of the _mm512_fmadd_ph tests (nearest,
    // exceptions suppressed). Inputs are exact, so results match the
    // non-rounding variants: 1.0 * 2.0 + 3.0 = 5.0 per lane.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(5.0);
        assert_eq_m512h(r, e);
    }

    // Merge-masked with rounding: selected lanes 5.0, unselected keep 1.0.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    // mask3 with rounding: unselected lanes come from `c` (3.0).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    // Zero-masked with rounding: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }
20626
    // Scalar real f16 FMA on element 0 only: 1.0 * 2.0 + 3.0 = 5.0;
    // elements 1..7 are copied from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_sh(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // Merge-masked: mask 0 keeps element 0 from `a` (result == a);
    // mask 1 writes the computed 5.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmadd_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_sh(a, 1, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // mask3 merges into `c`: mask 0 -> result == c; mask 1 -> 5.0 in element 0
    // with elements 1..7 still from `c`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask3_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    // Zero-masked: mask 0 zeroes element 0 (elements 1..7 from `a`).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_maskz_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmadd_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_sh(1, a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20675
    // Rounding-control versions of the fmadd_sh tests (nearest, exceptions
    // suppressed); inputs are exact, so expectations are unchanged.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // Merge-masked with rounding: mask 0 -> result == a; mask 1 -> 5.0 in
    // element 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    // mask3 with rounding: mask 0 -> result == c; mask 1 -> 5.0 in element 0
    // with elements 1..7 from `c`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    // Zero-masked with rounding: mask 0 zeroes element 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
20736
    // Packed real f16 fused multiply-subtract: 1.0 * 2.0 - 3.0 = -1.0 per lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmsub_ph(a, b, c);
        let e = _mm_set1_ph(-1.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masked: selected lanes -1.0, unselected keep 1.0 from `a`
    // (`set_ph` lists elements high-to-low).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
        assert_eq_m128h(r, e);
    }

    // mask3 merges unselected lanes from `c` (3.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask3_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
        assert_eq_m128h(r, e);
    }

    // Zero-masked: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_maskz_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
        assert_eq_m128h(r, e);
    }
20776
    // 256-bit packed real f16 FMS: 1.0 * 2.0 - 3.0 = -1.0 in all 16 lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmsub_ph(a, b, c);
        let e = _mm256_set1_ph(-1.0);
        assert_eq_m256h(r, e);
    }

    // Merge-masked over 16 lanes: selected lanes -1.0, unselected keep 1.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    // mask3 merges unselected lanes from `c` (3.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask3_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    // Zero-masked: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_maskz_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }
20822
20823    #[simd_test(enable = "avx512fp16")]
20824    const unsafe fn test_mm512_fmsub_ph() {
20825        let a = _mm512_set1_ph(1.0);
20826        let b = _mm512_set1_ph(2.0);
20827        let c = _mm512_set1_ph(3.0);
20828        let r = _mm512_fmsub_ph(a, b, c);
20829        let e = _mm512_set1_ph(-1.0);
20830        assert_eq_m512h(r, e);
20831    }
20832
20833    #[simd_test(enable = "avx512fp16")]
20834    const unsafe fn test_mm512_mask_fmsub_ph() {
20835        let a = _mm512_set1_ph(1.0);
20836        let b = _mm512_set1_ph(2.0);
20837        let c = _mm512_set1_ph(3.0);
20838        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
20839        let e = _mm512_set_ph(
20840            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20841            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20842        );
20843        assert_eq_m512h(r, e);
20844    }
20845
20846    #[simd_test(enable = "avx512fp16")]
20847    const unsafe fn test_mm512_mask3_fmsub_ph() {
20848        let a = _mm512_set1_ph(1.0);
20849        let b = _mm512_set1_ph(2.0);
20850        let c = _mm512_set1_ph(3.0);
20851        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
20852        let e = _mm512_set_ph(
20853            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20854            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20855        );
20856        assert_eq_m512h(r, e);
20857    }
20858
20859    #[simd_test(enable = "avx512fp16")]
20860    const unsafe fn test_mm512_maskz_fmsub_ph() {
20861        let a = _mm512_set1_ph(1.0);
20862        let b = _mm512_set1_ph(2.0);
20863        let c = _mm512_set1_ph(3.0);
20864        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
20865        let e = _mm512_set_ph(
20866            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20867            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20868        );
20869        assert_eq_m512h(r, e);
20870    }
20871
20872    #[simd_test(enable = "avx512fp16")]
20873    unsafe fn test_mm512_fmsub_round_ph() {
20874        let a = _mm512_set1_ph(1.0);
20875        let b = _mm512_set1_ph(2.0);
20876        let c = _mm512_set1_ph(3.0);
20877        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20878        let e = _mm512_set1_ph(-1.0);
20879        assert_eq_m512h(r, e);
20880    }
20881
20882    #[simd_test(enable = "avx512fp16")]
20883    unsafe fn test_mm512_mask_fmsub_round_ph() {
20884        let a = _mm512_set1_ph(1.0);
20885        let b = _mm512_set1_ph(2.0);
20886        let c = _mm512_set1_ph(3.0);
20887        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20888            a,
20889            0b01010101010101010101010101010101,
20890            b,
20891            c,
20892        );
20893        let e = _mm512_set_ph(
20894            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20895            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20896        );
20897        assert_eq_m512h(r, e);
20898    }
20899
20900    #[simd_test(enable = "avx512fp16")]
20901    unsafe fn test_mm512_mask3_fmsub_round_ph() {
20902        let a = _mm512_set1_ph(1.0);
20903        let b = _mm512_set1_ph(2.0);
20904        let c = _mm512_set1_ph(3.0);
20905        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20906            a,
20907            b,
20908            c,
20909            0b01010101010101010101010101010101,
20910        );
20911        let e = _mm512_set_ph(
20912            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20913            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20914        );
20915        assert_eq_m512h(r, e);
20916    }
20917
20918    #[simd_test(enable = "avx512fp16")]
20919    unsafe fn test_mm512_maskz_fmsub_round_ph() {
20920        let a = _mm512_set1_ph(1.0);
20921        let b = _mm512_set1_ph(2.0);
20922        let c = _mm512_set1_ph(3.0);
20923        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20924            0b01010101010101010101010101010101,
20925            a,
20926            b,
20927            c,
20928        );
20929        let e = _mm512_set_ph(
20930            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20931            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20932        );
20933        assert_eq_m512h(r, e);
20934    }
20935
20936    #[simd_test(enable = "avx512fp16,avx512vl")]
20937    const unsafe fn test_mm_fmsub_sh() {
20938        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20939        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20940        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20941        let r = _mm_fmsub_sh(a, b, c);
20942        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20943        assert_eq_m128h(r, e);
20944    }
20945
20946    #[simd_test(enable = "avx512fp16,avx512vl")]
20947    const unsafe fn test_mm_mask_fmsub_sh() {
20948        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20949        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20950        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20951        let r = _mm_mask_fmsub_sh(a, 0, b, c);
20952        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20953        assert_eq_m128h(r, e);
20954        let r = _mm_mask_fmsub_sh(a, 1, b, c);
20955        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20956        assert_eq_m128h(r, e);
20957    }
20958
20959    #[simd_test(enable = "avx512fp16,avx512vl")]
20960    const unsafe fn test_mm_mask3_fmsub_sh() {
20961        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20962        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20963        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20964        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
20965        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20966        assert_eq_m128h(r, e);
20967        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
20968        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20969        assert_eq_m128h(r, e);
20970    }
20971
20972    #[simd_test(enable = "avx512fp16,avx512vl")]
20973    const unsafe fn test_mm_maskz_fmsub_sh() {
20974        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20975        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20976        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20977        let r = _mm_maskz_fmsub_sh(0, a, b, c);
20978        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20979        assert_eq_m128h(r, e);
20980        let r = _mm_maskz_fmsub_sh(1, a, b, c);
20981        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20982        assert_eq_m128h(r, e);
20983    }
20984
20985    #[simd_test(enable = "avx512fp16,avx512vl")]
20986    unsafe fn test_mm_fmsub_round_sh() {
20987        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20988        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20989        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20990        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20991        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20992        assert_eq_m128h(r, e);
20993    }
20994
20995    #[simd_test(enable = "avx512fp16,avx512vl")]
20996    unsafe fn test_mm_mask_fmsub_round_sh() {
20997        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20998        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20999        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21000        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21001            a, 0, b, c,
21002        );
21003        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21004        assert_eq_m128h(r, e);
21005        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21006            a, 1, b, c,
21007        );
21008        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
21009        assert_eq_m128h(r, e);
21010    }
21011
21012    #[simd_test(enable = "avx512fp16,avx512vl")]
21013    unsafe fn test_mm_mask3_fmsub_round_sh() {
21014        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21015        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21016        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21017        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21018            a, b, c, 0,
21019        );
21020        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21021        assert_eq_m128h(r, e);
21022        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21023            a, b, c, 1,
21024        );
21025        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
21026        assert_eq_m128h(r, e);
21027    }
21028
21029    #[simd_test(enable = "avx512fp16,avx512vl")]
21030    unsafe fn test_mm_maskz_fmsub_round_sh() {
21031        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21032        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21033        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21034        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21035            0, a, b, c,
21036        );
21037        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21038        assert_eq_m128h(r, e);
21039        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21040            1, a, b, c,
21041        );
21042        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
21043        assert_eq_m128h(r, e);
21044    }
21045
21046    #[simd_test(enable = "avx512fp16,avx512vl")]
21047    const unsafe fn test_mm_fnmadd_ph() {
21048        let a = _mm_set1_ph(1.0);
21049        let b = _mm_set1_ph(2.0);
21050        let c = _mm_set1_ph(3.0);
21051        let r = _mm_fnmadd_ph(a, b, c);
21052        let e = _mm_set1_ph(1.0);
21053        assert_eq_m128h(r, e);
21054    }
21055
21056    #[simd_test(enable = "avx512fp16,avx512vl")]
21057    const unsafe fn test_mm_mask_fnmadd_ph() {
21058        let a = _mm_set1_ph(1.0);
21059        let b = _mm_set1_ph(2.0);
21060        let c = _mm_set1_ph(3.0);
21061        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
21062        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
21063        assert_eq_m128h(r, e);
21064    }
21065
21066    #[simd_test(enable = "avx512fp16,avx512vl")]
21067    const unsafe fn test_mm_mask3_fnmadd_ph() {
21068        let a = _mm_set1_ph(1.0);
21069        let b = _mm_set1_ph(2.0);
21070        let c = _mm_set1_ph(3.0);
21071        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
21072        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
21073        assert_eq_m128h(r, e);
21074    }
21075
21076    #[simd_test(enable = "avx512fp16,avx512vl")]
21077    const unsafe fn test_mm_maskz_fnmadd_ph() {
21078        let a = _mm_set1_ph(1.0);
21079        let b = _mm_set1_ph(2.0);
21080        let c = _mm_set1_ph(3.0);
21081        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
21082        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
21083        assert_eq_m128h(r, e);
21084    }
21085
21086    #[simd_test(enable = "avx512fp16,avx512vl")]
21087    const unsafe fn test_mm256_fnmadd_ph() {
21088        let a = _mm256_set1_ph(1.0);
21089        let b = _mm256_set1_ph(2.0);
21090        let c = _mm256_set1_ph(3.0);
21091        let r = _mm256_fnmadd_ph(a, b, c);
21092        let e = _mm256_set1_ph(1.0);
21093        assert_eq_m256h(r, e);
21094    }
21095
21096    #[simd_test(enable = "avx512fp16,avx512vl")]
21097    const unsafe fn test_mm256_mask_fnmadd_ph() {
21098        let a = _mm256_set1_ph(1.0);
21099        let b = _mm256_set1_ph(2.0);
21100        let c = _mm256_set1_ph(3.0);
21101        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
21102        let e = _mm256_set_ph(
21103            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21104        );
21105        assert_eq_m256h(r, e);
21106    }
21107
21108    #[simd_test(enable = "avx512fp16,avx512vl")]
21109    const unsafe fn test_mm256_mask3_fnmadd_ph() {
21110        let a = _mm256_set1_ph(1.0);
21111        let b = _mm256_set1_ph(2.0);
21112        let c = _mm256_set1_ph(3.0);
21113        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
21114        let e = _mm256_set_ph(
21115            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
21116        );
21117        assert_eq_m256h(r, e);
21118    }
21119
21120    #[simd_test(enable = "avx512fp16,avx512vl")]
21121    const unsafe fn test_mm256_maskz_fnmadd_ph() {
21122        let a = _mm256_set1_ph(1.0);
21123        let b = _mm256_set1_ph(2.0);
21124        let c = _mm256_set1_ph(3.0);
21125        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
21126        let e = _mm256_set_ph(
21127            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
21128        );
21129        assert_eq_m256h(r, e);
21130    }
21131
21132    #[simd_test(enable = "avx512fp16")]
21133    const unsafe fn test_mm512_fnmadd_ph() {
21134        let a = _mm512_set1_ph(1.0);
21135        let b = _mm512_set1_ph(2.0);
21136        let c = _mm512_set1_ph(3.0);
21137        let r = _mm512_fnmadd_ph(a, b, c);
21138        let e = _mm512_set1_ph(1.0);
21139        assert_eq_m512h(r, e);
21140    }
21141
21142    #[simd_test(enable = "avx512fp16")]
21143    const unsafe fn test_mm512_mask_fnmadd_ph() {
21144        let a = _mm512_set1_ph(1.0);
21145        let b = _mm512_set1_ph(2.0);
21146        let c = _mm512_set1_ph(3.0);
21147        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
21148        let e = _mm512_set_ph(
21149            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21150            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21151        );
21152        assert_eq_m512h(r, e);
21153    }
21154
21155    #[simd_test(enable = "avx512fp16")]
21156    const unsafe fn test_mm512_mask3_fnmadd_ph() {
21157        let a = _mm512_set1_ph(1.0);
21158        let b = _mm512_set1_ph(2.0);
21159        let c = _mm512_set1_ph(3.0);
21160        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
21161        let e = _mm512_set_ph(
21162            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
21163            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
21164        );
21165        assert_eq_m512h(r, e);
21166    }
21167
21168    #[simd_test(enable = "avx512fp16")]
21169    const unsafe fn test_mm512_maskz_fnmadd_ph() {
21170        let a = _mm512_set1_ph(1.0);
21171        let b = _mm512_set1_ph(2.0);
21172        let c = _mm512_set1_ph(3.0);
21173        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
21174        let e = _mm512_set_ph(
21175            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
21176            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
21177        );
21178        assert_eq_m512h(r, e);
21179    }
21180
21181    #[simd_test(enable = "avx512fp16")]
21182    unsafe fn test_mm512_fnmadd_round_ph() {
21183        let a = _mm512_set1_ph(1.0);
21184        let b = _mm512_set1_ph(2.0);
21185        let c = _mm512_set1_ph(3.0);
21186        let r =
21187            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21188        let e = _mm512_set1_ph(1.0);
21189        assert_eq_m512h(r, e);
21190    }
21191
21192    #[simd_test(enable = "avx512fp16")]
21193    unsafe fn test_mm512_mask_fnmadd_round_ph() {
21194        let a = _mm512_set1_ph(1.0);
21195        let b = _mm512_set1_ph(2.0);
21196        let c = _mm512_set1_ph(3.0);
21197        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21198            a,
21199            0b01010101010101010101010101010101,
21200            b,
21201            c,
21202        );
21203        let e = _mm512_set_ph(
21204            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21205            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
21206        );
21207        assert_eq_m512h(r, e);
21208    }
21209
21210    #[simd_test(enable = "avx512fp16")]
21211    unsafe fn test_mm512_mask3_fnmadd_round_ph() {
21212        let a = _mm512_set1_ph(1.0);
21213        let b = _mm512_set1_ph(2.0);
21214        let c = _mm512_set1_ph(3.0);
21215        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21216            a,
21217            b,
21218            c,
21219            0b01010101010101010101010101010101,
21220        );
21221        let e = _mm512_set_ph(
21222            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
21223            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
21224        );
21225        assert_eq_m512h(r, e);
21226    }
21227
21228    #[simd_test(enable = "avx512fp16")]
21229    unsafe fn test_mm512_maskz_fnmadd_round_ph() {
21230        let a = _mm512_set1_ph(1.0);
21231        let b = _mm512_set1_ph(2.0);
21232        let c = _mm512_set1_ph(3.0);
21233        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21234            0b01010101010101010101010101010101,
21235            a,
21236            b,
21237            c,
21238        );
21239        let e = _mm512_set_ph(
21240            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
21241            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
21242        );
21243        assert_eq_m512h(r, e);
21244    }
21245
21246    #[simd_test(enable = "avx512fp16,avx512vl")]
21247    const unsafe fn test_mm_fnmadd_sh() {
21248        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21249        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21250        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21251        let r = _mm_fnmadd_sh(a, b, c);
21252        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21253        assert_eq_m128h(r, e);
21254    }
21255
21256    #[simd_test(enable = "avx512fp16,avx512vl")]
21257    const unsafe fn test_mm_mask_fnmadd_sh() {
21258        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21259        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21260        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21261        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
21262        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21263        assert_eq_m128h(r, e);
21264        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
21265        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21266        assert_eq_m128h(r, e);
21267    }
21268
21269    #[simd_test(enable = "avx512fp16,avx512vl")]
21270    const unsafe fn test_mm_mask3_fnmadd_sh() {
21271        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21272        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21273        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21274        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
21275        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21276        assert_eq_m128h(r, e);
21277        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
21278        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21279        assert_eq_m128h(r, e);
21280    }
21281
21282    #[simd_test(enable = "avx512fp16,avx512vl")]
21283    const unsafe fn test_mm_maskz_fnmadd_sh() {
21284        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21285        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21286        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21287        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
21288        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21289        assert_eq_m128h(r, e);
21290        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
21291        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21292        assert_eq_m128h(r, e);
21293    }
21294
21295    #[simd_test(enable = "avx512fp16,avx512vl")]
21296    unsafe fn test_mm_fnmadd_round_sh() {
21297        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21298        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21299        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21300        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21301        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21302        assert_eq_m128h(r, e);
21303    }
21304
21305    #[simd_test(enable = "avx512fp16,avx512vl")]
21306    unsafe fn test_mm_mask_fnmadd_round_sh() {
21307        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21308        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21309        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21310        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21311            a, 0, b, c,
21312        );
21313        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21314        assert_eq_m128h(r, e);
21315        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21316            a, 1, b, c,
21317        );
21318        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21319        assert_eq_m128h(r, e);
21320    }
21321
21322    #[simd_test(enable = "avx512fp16,avx512vl")]
21323    unsafe fn test_mm_mask3_fnmadd_round_sh() {
21324        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21325        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21326        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21327        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21328            a, b, c, 0,
21329        );
21330        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21331        assert_eq_m128h(r, e);
21332        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21333            a, b, c, 1,
21334        );
21335        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21336        assert_eq_m128h(r, e);
21337    }
21338
21339    #[simd_test(enable = "avx512fp16,avx512vl")]
21340    unsafe fn test_mm_maskz_fnmadd_round_sh() {
21341        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21342        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21343        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21344        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21345            0, a, b, c,
21346        );
21347        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21348        assert_eq_m128h(r, e);
21349        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21350            1, a, b, c,
21351        );
21352        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21353        assert_eq_m128h(r, e);
21354    }
21355
21356    #[simd_test(enable = "avx512fp16,avx512vl")]
21357    const unsafe fn test_mm_fnmsub_ph() {
21358        let a = _mm_set1_ph(1.0);
21359        let b = _mm_set1_ph(2.0);
21360        let c = _mm_set1_ph(3.0);
21361        let r = _mm_fnmsub_ph(a, b, c);
21362        let e = _mm_set1_ph(-5.0);
21363        assert_eq_m128h(r, e);
21364    }
21365
21366    #[simd_test(enable = "avx512fp16,avx512vl")]
21367    const unsafe fn test_mm_mask_fnmsub_ph() {
21368        let a = _mm_set1_ph(1.0);
21369        let b = _mm_set1_ph(2.0);
21370        let c = _mm_set1_ph(3.0);
21371        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
21372        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
21373        assert_eq_m128h(r, e);
21374    }
21375
21376    #[simd_test(enable = "avx512fp16,avx512vl")]
21377    const unsafe fn test_mm_mask3_fnmsub_ph() {
21378        let a = _mm_set1_ph(1.0);
21379        let b = _mm_set1_ph(2.0);
21380        let c = _mm_set1_ph(3.0);
21381        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
21382        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
21383        assert_eq_m128h(r, e);
21384    }
21385
21386    #[simd_test(enable = "avx512fp16,avx512vl")]
21387    const unsafe fn test_mm_maskz_fnmsub_ph() {
21388        let a = _mm_set1_ph(1.0);
21389        let b = _mm_set1_ph(2.0);
21390        let c = _mm_set1_ph(3.0);
21391        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
21392        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
21393        assert_eq_m128h(r, e);
21394    }
21395
21396    #[simd_test(enable = "avx512fp16,avx512vl")]
21397    const unsafe fn test_mm256_fnmsub_ph() {
21398        let a = _mm256_set1_ph(1.0);
21399        let b = _mm256_set1_ph(2.0);
21400        let c = _mm256_set1_ph(3.0);
21401        let r = _mm256_fnmsub_ph(a, b, c);
21402        let e = _mm256_set1_ph(-5.0);
21403        assert_eq_m256h(r, e);
21404    }
21405
21406    #[simd_test(enable = "avx512fp16,avx512vl")]
21407    const unsafe fn test_mm256_mask_fnmsub_ph() {
21408        let a = _mm256_set1_ph(1.0);
21409        let b = _mm256_set1_ph(2.0);
21410        let c = _mm256_set1_ph(3.0);
21411        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
21412        let e = _mm256_set_ph(
21413            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21414        );
21415        assert_eq_m256h(r, e);
21416    }
21417
21418    #[simd_test(enable = "avx512fp16,avx512vl")]
21419    const unsafe fn test_mm256_mask3_fnmsub_ph() {
21420        let a = _mm256_set1_ph(1.0);
21421        let b = _mm256_set1_ph(2.0);
21422        let c = _mm256_set1_ph(3.0);
21423        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
21424        let e = _mm256_set_ph(
21425            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21426        );
21427        assert_eq_m256h(r, e);
21428    }
21429
21430    #[simd_test(enable = "avx512fp16,avx512vl")]
21431    const unsafe fn test_mm256_maskz_fnmsub_ph() {
21432        let a = _mm256_set1_ph(1.0);
21433        let b = _mm256_set1_ph(2.0);
21434        let c = _mm256_set1_ph(3.0);
21435        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
21436        let e = _mm256_set_ph(
21437            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21438        );
21439        assert_eq_m256h(r, e);
21440    }
21441
21442    #[simd_test(enable = "avx512fp16")]
21443    const unsafe fn test_mm512_fnmsub_ph() {
21444        let a = _mm512_set1_ph(1.0);
21445        let b = _mm512_set1_ph(2.0);
21446        let c = _mm512_set1_ph(3.0);
21447        let r = _mm512_fnmsub_ph(a, b, c);
21448        let e = _mm512_set1_ph(-5.0);
21449        assert_eq_m512h(r, e);
21450    }
21451
21452    #[simd_test(enable = "avx512fp16")]
21453    const unsafe fn test_mm512_mask_fnmsub_ph() {
21454        let a = _mm512_set1_ph(1.0);
21455        let b = _mm512_set1_ph(2.0);
21456        let c = _mm512_set1_ph(3.0);
21457        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
21458        let e = _mm512_set_ph(
21459            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21460            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21461        );
21462        assert_eq_m512h(r, e);
21463    }
21464
21465    #[simd_test(enable = "avx512fp16")]
21466    const unsafe fn test_mm512_mask3_fnmsub_ph() {
21467        let a = _mm512_set1_ph(1.0);
21468        let b = _mm512_set1_ph(2.0);
21469        let c = _mm512_set1_ph(3.0);
21470        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
21471        let e = _mm512_set_ph(
21472            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21473            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21474        );
21475        assert_eq_m512h(r, e);
21476    }
21477
21478    #[simd_test(enable = "avx512fp16")]
21479    const unsafe fn test_mm512_maskz_fnmsub_ph() {
21480        let a = _mm512_set1_ph(1.0);
21481        let b = _mm512_set1_ph(2.0);
21482        let c = _mm512_set1_ph(3.0);
21483        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
21484        let e = _mm512_set_ph(
21485            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21486            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21487        );
21488        assert_eq_m512h(r, e);
21489    }
21490
21491    #[simd_test(enable = "avx512fp16")]
21492    unsafe fn test_mm512_fnmsub_round_ph() {
21493        let a = _mm512_set1_ph(1.0);
21494        let b = _mm512_set1_ph(2.0);
21495        let c = _mm512_set1_ph(3.0);
21496        let r =
21497            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21498        let e = _mm512_set1_ph(-5.0);
21499        assert_eq_m512h(r, e);
21500    }
21501
21502    #[simd_test(enable = "avx512fp16")]
21503    unsafe fn test_mm512_mask_fnmsub_round_ph() {
21504        let a = _mm512_set1_ph(1.0);
21505        let b = _mm512_set1_ph(2.0);
21506        let c = _mm512_set1_ph(3.0);
21507        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21508            a,
21509            0b01010101010101010101010101010101,
21510            b,
21511            c,
21512        );
21513        let e = _mm512_set_ph(
21514            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21515            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21516        );
21517        assert_eq_m512h(r, e);
21518    }
21519
21520    #[simd_test(enable = "avx512fp16")]
21521    unsafe fn test_mm512_mask3_fnmsub_round_ph() {
21522        let a = _mm512_set1_ph(1.0);
21523        let b = _mm512_set1_ph(2.0);
21524        let c = _mm512_set1_ph(3.0);
21525        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21526            a,
21527            b,
21528            c,
21529            0b01010101010101010101010101010101,
21530        );
21531        let e = _mm512_set_ph(
21532            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21533            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21534        );
21535        assert_eq_m512h(r, e);
21536    }
21537
21538    #[simd_test(enable = "avx512fp16")]
21539    unsafe fn test_mm512_maskz_fnmsub_round_ph() {
21540        let a = _mm512_set1_ph(1.0);
21541        let b = _mm512_set1_ph(2.0);
21542        let c = _mm512_set1_ph(3.0);
21543        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21544            0b01010101010101010101010101010101,
21545            a,
21546            b,
21547            c,
21548        );
21549        let e = _mm512_set_ph(
21550            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21551            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21552        );
21553        assert_eq_m512h(r, e);
21554    }
21555
21556    #[simd_test(enable = "avx512fp16,avx512vl")]
21557    const unsafe fn test_mm_fnmsub_sh() {
21558        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21559        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21560        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21561        let r = _mm_fnmsub_sh(a, b, c);
21562        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21563        assert_eq_m128h(r, e);
21564    }
21565
21566    #[simd_test(enable = "avx512fp16,avx512vl")]
21567    const unsafe fn test_mm_mask_fnmsub_sh() {
21568        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21569        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21570        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21571        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
21572        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21573        assert_eq_m128h(r, e);
21574        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
21575        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21576        assert_eq_m128h(r, e);
21577    }
21578
21579    #[simd_test(enable = "avx512fp16,avx512vl")]
21580    const unsafe fn test_mm_mask3_fnmsub_sh() {
21581        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21582        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21583        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21584        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
21585        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21586        assert_eq_m128h(r, e);
21587        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
21588        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21589        assert_eq_m128h(r, e);
21590    }
21591
21592    #[simd_test(enable = "avx512fp16,avx512vl")]
21593    const unsafe fn test_mm_maskz_fnmsub_sh() {
21594        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21595        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21596        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21597        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
21598        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21599        assert_eq_m128h(r, e);
21600        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
21601        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21602        assert_eq_m128h(r, e);
21603    }
21604
21605    #[simd_test(enable = "avx512fp16,avx512vl")]
21606    unsafe fn test_mm_fnmsub_round_sh() {
21607        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21608        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21609        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21610        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21611        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21612        assert_eq_m128h(r, e);
21613    }
21614
21615    #[simd_test(enable = "avx512fp16,avx512vl")]
21616    unsafe fn test_mm_mask_fnmsub_round_sh() {
21617        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21618        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21619        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21620        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21621            a, 0, b, c,
21622        );
21623        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21624        assert_eq_m128h(r, e);
21625        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21626            a, 1, b, c,
21627        );
21628        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21629        assert_eq_m128h(r, e);
21630    }
21631
21632    #[simd_test(enable = "avx512fp16,avx512vl")]
21633    unsafe fn test_mm_mask3_fnmsub_round_sh() {
21634        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21635        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21636        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21637        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21638            a, b, c, 0,
21639        );
21640        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21641        assert_eq_m128h(r, e);
21642        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21643            a, b, c, 1,
21644        );
21645        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21646        assert_eq_m128h(r, e);
21647    }
21648
21649    #[simd_test(enable = "avx512fp16,avx512vl")]
21650    unsafe fn test_mm_maskz_fnmsub_round_sh() {
21651        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21652        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21653        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21654        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21655            0, a, b, c,
21656        );
21657        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21658        assert_eq_m128h(r, e);
21659        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21660            1, a, b, c,
21661        );
21662        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21663        assert_eq_m128h(r, e);
21664    }
21665
21666    #[simd_test(enable = "avx512fp16,avx512vl")]
21667    const unsafe fn test_mm_fmaddsub_ph() {
21668        let a = _mm_set1_ph(1.0);
21669        let b = _mm_set1_ph(2.0);
21670        let c = _mm_set1_ph(3.0);
21671        let r = _mm_fmaddsub_ph(a, b, c);
21672        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
21673        assert_eq_m128h(r, e);
21674    }
21675
21676    #[simd_test(enable = "avx512fp16,avx512vl")]
21677    const unsafe fn test_mm_mask_fmaddsub_ph() {
21678        let a = _mm_set1_ph(1.0);
21679        let b = _mm_set1_ph(2.0);
21680        let c = _mm_set1_ph(3.0);
21681        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
21682        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
21683        assert_eq_m128h(r, e);
21684    }
21685
21686    #[simd_test(enable = "avx512fp16,avx512vl")]
21687    const unsafe fn test_mm_mask3_fmaddsub_ph() {
21688        let a = _mm_set1_ph(1.0);
21689        let b = _mm_set1_ph(2.0);
21690        let c = _mm_set1_ph(3.0);
21691        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
21692        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
21693        assert_eq_m128h(r, e);
21694    }
21695
21696    #[simd_test(enable = "avx512fp16,avx512vl")]
21697    const unsafe fn test_mm_maskz_fmaddsub_ph() {
21698        let a = _mm_set1_ph(1.0);
21699        let b = _mm_set1_ph(2.0);
21700        let c = _mm_set1_ph(3.0);
21701        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
21702        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
21703        assert_eq_m128h(r, e);
21704    }
21705
21706    #[simd_test(enable = "avx512fp16,avx512vl")]
21707    const unsafe fn test_mm256_fmaddsub_ph() {
21708        let a = _mm256_set1_ph(1.0);
21709        let b = _mm256_set1_ph(2.0);
21710        let c = _mm256_set1_ph(3.0);
21711        let r = _mm256_fmaddsub_ph(a, b, c);
21712        let e = _mm256_set_ph(
21713            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21714        );
21715        assert_eq_m256h(r, e);
21716    }
21717
21718    #[simd_test(enable = "avx512fp16,avx512vl")]
21719    const unsafe fn test_mm256_mask_fmaddsub_ph() {
21720        let a = _mm256_set1_ph(1.0);
21721        let b = _mm256_set1_ph(2.0);
21722        let c = _mm256_set1_ph(3.0);
21723        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
21724        let e = _mm256_set_ph(
21725            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21726        );
21727        assert_eq_m256h(r, e);
21728    }
21729
21730    #[simd_test(enable = "avx512fp16,avx512vl")]
21731    const unsafe fn test_mm256_mask3_fmaddsub_ph() {
21732        let a = _mm256_set1_ph(1.0);
21733        let b = _mm256_set1_ph(2.0);
21734        let c = _mm256_set1_ph(3.0);
21735        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
21736        let e = _mm256_set_ph(
21737            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21738        );
21739        assert_eq_m256h(r, e);
21740    }
21741
21742    #[simd_test(enable = "avx512fp16,avx512vl")]
21743    const unsafe fn test_mm256_maskz_fmaddsub_ph() {
21744        let a = _mm256_set1_ph(1.0);
21745        let b = _mm256_set1_ph(2.0);
21746        let c = _mm256_set1_ph(3.0);
21747        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
21748        let e = _mm256_set_ph(
21749            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21750        );
21751        assert_eq_m256h(r, e);
21752    }
21753
21754    #[simd_test(enable = "avx512fp16")]
21755    const unsafe fn test_mm512_fmaddsub_ph() {
21756        let a = _mm512_set1_ph(1.0);
21757        let b = _mm512_set1_ph(2.0);
21758        let c = _mm512_set1_ph(3.0);
21759        let r = _mm512_fmaddsub_ph(a, b, c);
21760        let e = _mm512_set_ph(
21761            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21762            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21763        );
21764        assert_eq_m512h(r, e);
21765    }
21766
21767    #[simd_test(enable = "avx512fp16")]
21768    const unsafe fn test_mm512_mask_fmaddsub_ph() {
21769        let a = _mm512_set1_ph(1.0);
21770        let b = _mm512_set1_ph(2.0);
21771        let c = _mm512_set1_ph(3.0);
21772        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
21773        let e = _mm512_set_ph(
21774            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21775            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21776        );
21777        assert_eq_m512h(r, e);
21778    }
21779
21780    #[simd_test(enable = "avx512fp16")]
21781    const unsafe fn test_mm512_mask3_fmaddsub_ph() {
21782        let a = _mm512_set1_ph(1.0);
21783        let b = _mm512_set1_ph(2.0);
21784        let c = _mm512_set1_ph(3.0);
21785        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
21786        let e = _mm512_set_ph(
21787            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21788            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21789        );
21790        assert_eq_m512h(r, e);
21791    }
21792
21793    #[simd_test(enable = "avx512fp16")]
21794    const unsafe fn test_mm512_maskz_fmaddsub_ph() {
21795        let a = _mm512_set1_ph(1.0);
21796        let b = _mm512_set1_ph(2.0);
21797        let c = _mm512_set1_ph(3.0);
21798        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
21799        let e = _mm512_set_ph(
21800            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21801            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21802        );
21803        assert_eq_m512h(r, e);
21804    }
21805
21806    #[simd_test(enable = "avx512fp16")]
21807    unsafe fn test_mm512_fmaddsub_round_ph() {
21808        let a = _mm512_set1_ph(1.0);
21809        let b = _mm512_set1_ph(2.0);
21810        let c = _mm512_set1_ph(3.0);
21811        let r =
21812            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21813        let e = _mm512_set_ph(
21814            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21815            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21816        );
21817        assert_eq_m512h(r, e);
21818    }
21819
21820    #[simd_test(enable = "avx512fp16")]
21821    unsafe fn test_mm512_mask_fmaddsub_round_ph() {
21822        let a = _mm512_set1_ph(1.0);
21823        let b = _mm512_set1_ph(2.0);
21824        let c = _mm512_set1_ph(3.0);
21825        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21826            a,
21827            0b00110011001100110011001100110011,
21828            b,
21829            c,
21830        );
21831        let e = _mm512_set_ph(
21832            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21833            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21834        );
21835        assert_eq_m512h(r, e);
21836    }
21837
21838    #[simd_test(enable = "avx512fp16")]
21839    unsafe fn test_mm512_mask3_fmaddsub_round_ph() {
21840        let a = _mm512_set1_ph(1.0);
21841        let b = _mm512_set1_ph(2.0);
21842        let c = _mm512_set1_ph(3.0);
21843        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21844            a,
21845            b,
21846            c,
21847            0b00110011001100110011001100110011,
21848        );
21849        let e = _mm512_set_ph(
21850            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21851            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21852        );
21853        assert_eq_m512h(r, e);
21854    }
21855
21856    #[simd_test(enable = "avx512fp16")]
21857    unsafe fn test_mm512_maskz_fmaddsub_round_ph() {
21858        let a = _mm512_set1_ph(1.0);
21859        let b = _mm512_set1_ph(2.0);
21860        let c = _mm512_set1_ph(3.0);
21861        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21862            0b00110011001100110011001100110011,
21863            a,
21864            b,
21865            c,
21866        );
21867        let e = _mm512_set_ph(
21868            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21869            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21870        );
21871        assert_eq_m512h(r, e);
21872    }
21873
21874    #[simd_test(enable = "avx512fp16,avx512vl")]
21875    const unsafe fn test_mm_fmsubadd_ph() {
21876        let a = _mm_set1_ph(1.0);
21877        let b = _mm_set1_ph(2.0);
21878        let c = _mm_set1_ph(3.0);
21879        let r = _mm_fmsubadd_ph(a, b, c);
21880        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
21881        assert_eq_m128h(r, e);
21882    }
21883
21884    #[simd_test(enable = "avx512fp16,avx512vl")]
21885    const unsafe fn test_mm_mask_fmsubadd_ph() {
21886        let a = _mm_set1_ph(1.0);
21887        let b = _mm_set1_ph(2.0);
21888        let c = _mm_set1_ph(3.0);
21889        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
21890        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
21891        assert_eq_m128h(r, e);
21892    }
21893
21894    #[simd_test(enable = "avx512fp16,avx512vl")]
21895    const unsafe fn test_mm_mask3_fmsubadd_ph() {
21896        let a = _mm_set1_ph(1.0);
21897        let b = _mm_set1_ph(2.0);
21898        let c = _mm_set1_ph(3.0);
21899        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
21900        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
21901        assert_eq_m128h(r, e);
21902    }
21903
21904    #[simd_test(enable = "avx512fp16,avx512vl")]
21905    const unsafe fn test_mm_maskz_fmsubadd_ph() {
21906        let a = _mm_set1_ph(1.0);
21907        let b = _mm_set1_ph(2.0);
21908        let c = _mm_set1_ph(3.0);
21909        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
21910        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
21911        assert_eq_m128h(r, e);
21912    }
21913
21914    #[simd_test(enable = "avx512fp16,avx512vl")]
21915    const unsafe fn test_mm256_fmsubadd_ph() {
21916        let a = _mm256_set1_ph(1.0);
21917        let b = _mm256_set1_ph(2.0);
21918        let c = _mm256_set1_ph(3.0);
21919        let r = _mm256_fmsubadd_ph(a, b, c);
21920        let e = _mm256_set_ph(
21921            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21922        );
21923        assert_eq_m256h(r, e);
21924    }
21925
21926    #[simd_test(enable = "avx512fp16,avx512vl")]
21927    const unsafe fn test_mm256_mask_fmsubadd_ph() {
21928        let a = _mm256_set1_ph(1.0);
21929        let b = _mm256_set1_ph(2.0);
21930        let c = _mm256_set1_ph(3.0);
21931        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
21932        let e = _mm256_set_ph(
21933            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21934        );
21935        assert_eq_m256h(r, e);
21936    }
21937
21938    #[simd_test(enable = "avx512fp16,avx512vl")]
21939    const unsafe fn test_mm256_mask3_fmsubadd_ph() {
21940        let a = _mm256_set1_ph(1.0);
21941        let b = _mm256_set1_ph(2.0);
21942        let c = _mm256_set1_ph(3.0);
21943        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
21944        let e = _mm256_set_ph(
21945            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21946        );
21947        assert_eq_m256h(r, e);
21948    }
21949
21950    #[simd_test(enable = "avx512fp16,avx512vl")]
21951    const unsafe fn test_mm256_maskz_fmsubadd_ph() {
21952        let a = _mm256_set1_ph(1.0);
21953        let b = _mm256_set1_ph(2.0);
21954        let c = _mm256_set1_ph(3.0);
21955        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
21956        let e = _mm256_set_ph(
21957            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21958        );
21959        assert_eq_m256h(r, e);
21960    }
21961
21962    #[simd_test(enable = "avx512fp16")]
21963    const unsafe fn test_mm512_fmsubadd_ph() {
21964        let a = _mm512_set1_ph(1.0);
21965        let b = _mm512_set1_ph(2.0);
21966        let c = _mm512_set1_ph(3.0);
21967        let r = _mm512_fmsubadd_ph(a, b, c);
21968        let e = _mm512_set_ph(
21969            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21970            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21971        );
21972        assert_eq_m512h(r, e);
21973    }
21974
21975    #[simd_test(enable = "avx512fp16")]
21976    const unsafe fn test_mm512_mask_fmsubadd_ph() {
21977        let a = _mm512_set1_ph(1.0);
21978        let b = _mm512_set1_ph(2.0);
21979        let c = _mm512_set1_ph(3.0);
21980        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
21981        let e = _mm512_set_ph(
21982            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21983            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21984        );
21985        assert_eq_m512h(r, e);
21986    }
21987
21988    #[simd_test(enable = "avx512fp16")]
21989    const unsafe fn test_mm512_mask3_fmsubadd_ph() {
21990        let a = _mm512_set1_ph(1.0);
21991        let b = _mm512_set1_ph(2.0);
21992        let c = _mm512_set1_ph(3.0);
21993        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
21994        let e = _mm512_set_ph(
21995            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21996            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21997        );
21998        assert_eq_m512h(r, e);
21999    }
22000
22001    #[simd_test(enable = "avx512fp16")]
22002    const unsafe fn test_mm512_maskz_fmsubadd_ph() {
22003        let a = _mm512_set1_ph(1.0);
22004        let b = _mm512_set1_ph(2.0);
22005        let c = _mm512_set1_ph(3.0);
22006        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
22007        let e = _mm512_set_ph(
22008            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
22009            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
22010        );
22011        assert_eq_m512h(r, e);
22012    }
22013
22014    #[simd_test(enable = "avx512fp16")]
22015    unsafe fn test_mm512_fmsubadd_round_ph() {
22016        let a = _mm512_set1_ph(1.0);
22017        let b = _mm512_set1_ph(2.0);
22018        let c = _mm512_set1_ph(3.0);
22019        let r =
22020            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
22021        let e = _mm512_set_ph(
22022            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
22023            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
22024        );
22025        assert_eq_m512h(r, e);
22026    }
22027
22028    #[simd_test(enable = "avx512fp16")]
22029    unsafe fn test_mm512_mask_fmsubadd_round_ph() {
22030        let a = _mm512_set1_ph(1.0);
22031        let b = _mm512_set1_ph(2.0);
22032        let c = _mm512_set1_ph(3.0);
22033        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22034            a,
22035            0b00110011001100110011001100110011,
22036            b,
22037            c,
22038        );
22039        let e = _mm512_set_ph(
22040            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
22041            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
22042        );
22043        assert_eq_m512h(r, e);
22044    }
22045
22046    #[simd_test(enable = "avx512fp16")]
22047    unsafe fn test_mm512_mask3_fmsubadd_round_ph() {
22048        let a = _mm512_set1_ph(1.0);
22049        let b = _mm512_set1_ph(2.0);
22050        let c = _mm512_set1_ph(3.0);
22051        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22052            a,
22053            b,
22054            c,
22055            0b00110011001100110011001100110011,
22056        );
22057        let e = _mm512_set_ph(
22058            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
22059            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
22060        );
22061        assert_eq_m512h(r, e);
22062    }
22063
22064    #[simd_test(enable = "avx512fp16")]
22065    unsafe fn test_mm512_maskz_fmsubadd_round_ph() {
22066        let a = _mm512_set1_ph(1.0);
22067        let b = _mm512_set1_ph(2.0);
22068        let c = _mm512_set1_ph(3.0);
22069        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22070            0b00110011001100110011001100110011,
22071            a,
22072            b,
22073            c,
22074        );
22075        let e = _mm512_set_ph(
22076            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
22077            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
22078        );
22079        assert_eq_m512h(r, e);
22080    }
22081
22082    #[simd_test(enable = "avx512fp16,avx512vl")]
22083    unsafe fn test_mm_rcp_ph() {
22084        let a = _mm_set1_ph(2.0);
22085        let r = _mm_rcp_ph(a);
22086        let e = _mm_set1_ph(0.5);
22087        assert_eq_m128h(r, e);
22088    }
22089
22090    #[simd_test(enable = "avx512fp16,avx512vl")]
22091    unsafe fn test_mm_mask_rcp_ph() {
22092        let a = _mm_set1_ph(2.0);
22093        let src = _mm_set1_ph(1.0);
22094        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
22095        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
22096        assert_eq_m128h(r, e);
22097    }
22098
22099    #[simd_test(enable = "avx512fp16,avx512vl")]
22100    unsafe fn test_mm_maskz_rcp_ph() {
22101        let a = _mm_set1_ph(2.0);
22102        let r = _mm_maskz_rcp_ph(0b01010101, a);
22103        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
22104        assert_eq_m128h(r, e);
22105    }
22106
22107    #[simd_test(enable = "avx512fp16,avx512vl")]
22108    unsafe fn test_mm256_rcp_ph() {
22109        let a = _mm256_set1_ph(2.0);
22110        let r = _mm256_rcp_ph(a);
22111        let e = _mm256_set1_ph(0.5);
22112        assert_eq_m256h(r, e);
22113    }
22114
22115    #[simd_test(enable = "avx512fp16,avx512vl")]
22116    unsafe fn test_mm256_mask_rcp_ph() {
22117        let a = _mm256_set1_ph(2.0);
22118        let src = _mm256_set1_ph(1.0);
22119        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
22120        let e = _mm256_set_ph(
22121            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
22122        );
22123        assert_eq_m256h(r, e);
22124    }
22125
22126    #[simd_test(enable = "avx512fp16,avx512vl")]
22127    unsafe fn test_mm256_maskz_rcp_ph() {
22128        let a = _mm256_set1_ph(2.0);
22129        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
22130        let e = _mm256_set_ph(
22131            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
22132        );
22133        assert_eq_m256h(r, e);
22134    }
22135
22136    #[simd_test(enable = "avx512fp16")]
22137    unsafe fn test_mm512_rcp_ph() {
22138        let a = _mm512_set1_ph(2.0);
22139        let r = _mm512_rcp_ph(a);
22140        let e = _mm512_set1_ph(0.5);
22141        assert_eq_m512h(r, e);
22142    }
22143
22144    #[simd_test(enable = "avx512fp16")]
22145    unsafe fn test_mm512_mask_rcp_ph() {
22146        let a = _mm512_set1_ph(2.0);
22147        let src = _mm512_set1_ph(1.0);
22148        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
22149        let e = _mm512_set_ph(
22150            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
22151            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
22152        );
22153        assert_eq_m512h(r, e);
22154    }
22155
22156    #[simd_test(enable = "avx512fp16")]
22157    unsafe fn test_mm512_maskz_rcp_ph() {
22158        let a = _mm512_set1_ph(2.0);
22159        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
22160        let e = _mm512_set_ph(
22161            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
22162            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
22163        );
22164        assert_eq_m512h(r, e);
22165    }
22166
22167    #[simd_test(enable = "avx512fp16,avx512vl")]
22168    unsafe fn test_mm_rcp_sh() {
22169        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22170        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22171        let r = _mm_rcp_sh(a, b);
22172        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22173        assert_eq_m128h(r, e);
22174    }
22175
22176    #[simd_test(enable = "avx512fp16,avx512vl")]
22177    unsafe fn test_mm_mask_rcp_sh() {
22178        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22179        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22180        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22181        let r = _mm_mask_rcp_sh(src, 0, a, b);
22182        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22183        assert_eq_m128h(r, e);
22184        let r = _mm_mask_rcp_sh(src, 1, a, b);
22185        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22186        assert_eq_m128h(r, e);
22187    }
22188
22189    #[simd_test(enable = "avx512fp16,avx512vl")]
22190    unsafe fn test_mm_maskz_rcp_sh() {
22191        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22192        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22193        let r = _mm_maskz_rcp_sh(0, a, b);
22194        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22195        assert_eq_m128h(r, e);
22196        let r = _mm_maskz_rcp_sh(1, a, b);
22197        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22198        assert_eq_m128h(r, e);
22199    }
22200
    // `_mm_rsqrt_ph` / `_mm256_rsqrt_ph` / `_mm512_rsqrt_ph`: per-lane approximate
    // reciprocal square root. 1/sqrt(4.0) = 0.5 is exact in f16, so exact equality
    // is safe. Masked variants use alternating masks (0b0101...) to exercise both
    // the kept and the replaced lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let r = _mm_rsqrt_ph(a);
        let e = _mm_set1_ph(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let src = _mm_set1_ph(1.0);
        // Mask 0b01010101 selects the even lanes. `_mm_set_ph` lists elements
        // high-to-low (e7..e0), so the computed 0.5 values appear at the even
        // (rightmost-listed) positions and the odd lanes keep `src` (1.0).
        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_rsqrt_ph() {
        let a = _mm_set1_ph(4.0);
        // Zero-masking: unselected (odd) lanes become 0.0 instead of merging.
        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let r = _mm256_rsqrt_ph(a);
        let e = _mm256_set1_ph(0.5);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let src = _mm256_set1_ph(1.0);
        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_rsqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit variants require only avx512fp16 (no avx512vl).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_rsqrt_ph(a);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let src = _mm512_set1_ph(1.0);
        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_rsqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }
22285
    // `_mm_rsqrt_sh`: approximate 1/sqrt of lane 0 of `b` (1/sqrt(4.0) = 0.5, exact
    // in f16); lanes 1..=7 are copied from `a`. Masked variants check the src-merge
    // and zeroing behavior of lane 0 for mask = 0 and mask = 1.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_rsqrt_sh(a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        // Mask clear: lane 0 taken from `src`.
        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        // Mask set: lane 0 is the computed value.
        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_rsqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        // Mask clear: lane 0 zeroed.
        let r = _mm_maskz_rsqrt_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_rsqrt_sh(1, a, b);
        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22319
    // `_mm_sqrt_ph` / `_mm256_sqrt_ph` / `_mm512_sqrt_ph`: per-lane exact square
    // root (sqrt(4.0) = 2.0). Unlike rcp/rsqrt, VSQRTPH is correctly rounded, so
    // exact comparison needs no representability argument. Masked variants use
    // alternating masks as in the rsqrt tests above.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_sqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let r = _mm_sqrt_ph(a);
        let e = _mm_set1_ph(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_sqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let src = _mm_set1_ph(1.0);
        // Even lanes computed (2.0), odd lanes merged from src (1.0).
        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_sqrt_ph() {
        let a = _mm_set1_ph(4.0);
        let r = _mm_maskz_sqrt_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_sqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let r = _mm256_sqrt_ph(a);
        let e = _mm256_set1_ph(2.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_sqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let src = _mm256_set1_ph(1.0);
        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_sqrt_ph() {
        let a = _mm256_set1_ph(4.0);
        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_sqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_sqrt_ph(a);
        let e = _mm512_set1_ph(2.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_sqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let src = _mm512_set1_ph(1.0);
        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_sqrt_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }
22404
    // `_mm512_sqrt_round_ph` and masked variants: same checks as `_mm512_sqrt_ph`
    // but with an explicit rounding-mode const generic. Round-to-nearest plus
    // suppress-all-exceptions (SAE) must reproduce the default-rounding results.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_sqrt_round_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set1_ph(2.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_sqrt_round_ph() {
        let a = _mm512_set1_ph(4.0);
        let src = _mm512_set1_ph(1.0);
        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_sqrt_round_ph() {
        let a = _mm512_set1_ph(4.0);
        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }
22442
    // `_mm_sqrt_sh`: exact square root of lane 0 of `b` (sqrt(4.0) = 2.0);
    // lanes 1..=7 are copied from `a`. Masked variants check src-merge and
    // zeroing of lane 0 for mask = 0 and mask = 1.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_sqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_sqrt_sh(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_sqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_sqrt_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_sqrt_sh(src, 1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_sqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_maskz_sqrt_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_sqrt_sh(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22476
    // `_mm_sqrt_round_sh` and masked variants: scalar sqrt with an explicit
    // rounding-mode const generic; round-to-nearest + SAE must match the
    // default-rounding `_mm_sqrt_sh` results above.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_sqrt_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_sqrt_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_sqrt_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r =
            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22516
    // `_mm_max_ph` / `_mm256_max_ph` / `_mm512_max_ph`: per-lane maximum.
    // max(2.0, 1.0) = 2.0 in every lane; masked variants use alternating masks
    // (selected lanes -> 2.0, unselected -> src 3.0 or zero).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_max_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_max_ph(a, b);
        let e = _mm_set1_ph(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_max_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let src = _mm_set1_ph(3.0);
        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_max_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_maskz_max_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_max_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_max_ph(a, b);
        let e = _mm256_set1_ph(2.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_max_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let src = _mm256_set1_ph(3.0);
        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_max_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_max_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_max_ph(a, b);
        let e = _mm512_set1_ph(2.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_max_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_max_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }
22610
    // `_mm512_max_round_ph` and masked variants: same as `_mm512_max_ph` but with
    // an explicit SAE/rounding const generic; results must match the default form.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_max_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(2.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_max_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_max_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }
22653
    // `_mm_max_sh`: lane 0 = max(a[0], b[0]) = max(1.0, 2.0) = 2.0; lanes 1..=7
    // are copied from `a`. Masked variants check src-merge/zeroing of lane 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_max_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_max_sh(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_max_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_max_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_max_sh(src, 1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_max_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_maskz_max_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_max_sh(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22687
    // `_mm_max_round_sh` and masked variants: scalar max with an explicit
    // SAE/rounding const generic; results must match `_mm_max_sh` above.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_max_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_max_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_max_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r =
            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22727
    // `_mm_min_ph` / `_mm256_min_ph` / `_mm512_min_ph`: per-lane minimum — the
    // mirror of the max tests above (min(2.0, 1.0) = 1.0 in every lane).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_min_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_min_ph(a, b);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_min_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let src = _mm_set1_ph(3.0);
        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_min_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_maskz_min_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_min_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_min_ph(a, b);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_min_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let src = _mm256_set1_ph(3.0);
        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_min_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_min_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_min_ph(a, b);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_min_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_min_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
22821
    // `_mm512_min_round_ph` and masked variants: same as `_mm512_min_ph` but with
    // an explicit SAE/rounding const generic; results must match the default form.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_min_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_min_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_min_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
22864
    // `_mm_min_sh`: lane 0 = min(a[0], b[0]) = min(1.0, 2.0) = 1.0; lanes 1..=7
    // are copied from `a`. Masked variants check src-merge/zeroing of lane 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_min_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_min_sh(a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_min_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_min_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_min_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_min_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_maskz_min_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_min_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22898
22899    #[simd_test(enable = "avx512fp16,avx512vl")]
22900    unsafe fn test_mm_min_round_sh() {
22901        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22902        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22903        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22904        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22905        assert_eq_m128h(r, e);
22906    }
22907
    // Masked rounding variant: mask 0 keeps lane 0 from `src`; mask 1 writes
    // min(a[0], b[0]). Upper lanes always come from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_min_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22924
    // Zero-masked rounding variant: mask 0 zeroes lane 0; mask 1 writes the min.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_min_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r =
            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }
22938
22939    #[simd_test(enable = "avx512fp16,avx512vl")]
22940    unsafe fn test_mm_getexp_ph() {
22941        let a = _mm_set1_ph(3.0);
22942        let r = _mm_getexp_ph(a);
22943        let e = _mm_set1_ph(1.0);
22944        assert_eq_m128h(r, e);
22945    }
22946
    // Masked getexp: mask 0b01010101 selects even lanes for the result (1.0),
    // odd lanes fall back to `src` (4.0). `_mm_set_ph` lists high→low lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getexp_ph() {
        let a = _mm_set1_ph(3.0);
        let src = _mm_set1_ph(4.0);
        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
        assert_eq_m128h(r, e);
    }
22955
    // Zero-masked getexp: unselected (odd) lanes are zeroed instead of merged.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getexp_ph() {
        let a = _mm_set1_ph(3.0);
        let r = _mm_maskz_getexp_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }
22963
22964    #[simd_test(enable = "avx512fp16,avx512vl")]
22965    unsafe fn test_mm256_getexp_ph() {
22966        let a = _mm256_set1_ph(3.0);
22967        let r = _mm256_getexp_ph(a);
22968        let e = _mm256_set1_ph(1.0);
22969        assert_eq_m256h(r, e);
22970    }
22971
    // 256-bit masked getexp: alternating mask merges src (4.0) into odd lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_getexp_ph() {
        let a = _mm256_set1_ph(3.0);
        let src = _mm256_set1_ph(4.0);
        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
22982
    // 256-bit zero-masked getexp: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_getexp_ph() {
        let a = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
22992
22993    #[simd_test(enable = "avx512fp16")]
22994    unsafe fn test_mm512_getexp_ph() {
22995        let a = _mm512_set1_ph(3.0);
22996        let r = _mm512_getexp_ph(a);
22997        let e = _mm512_set1_ph(1.0);
22998        assert_eq_m512h(r, e);
22999    }
23000
    // 512-bit masked getexp over all 32 lanes with an alternating mask.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getexp_ph() {
        let a = _mm512_set1_ph(3.0);
        let src = _mm512_set1_ph(4.0);
        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23012
    // 512-bit zero-masked getexp: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getexp_ph() {
        let a = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23023
23024    #[simd_test(enable = "avx512fp16")]
23025    unsafe fn test_mm512_getexp_round_ph() {
23026        let a = _mm512_set1_ph(3.0);
23027        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
23028        let e = _mm512_set1_ph(1.0);
23029        assert_eq_m512h(r, e);
23030    }
23031
    // Masked getexp with SAE suppression; alternating mask merges src (4.0).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getexp_round_ph() {
        let a = _mm512_set1_ph(3.0);
        let src = _mm512_set1_ph(4.0);
        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23047
    // Zero-masked getexp with SAE suppression: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getexp_round_ph() {
        let a = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23061
    // Scalar getexp: lane 0 = exponent of b[0] (3.0 → 1.0); lanes 1..8 from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getexp_sh(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23070
    // Masked scalar getexp: mask 0 keeps lane 0 from `src`; mask 1 computes it.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getexp_sh(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getexp_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23083
    // Zero-masked scalar getexp: mask 0 zeroes lane 0; mask 1 computes it.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getexp_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getexp_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23095
    // Scalar getexp with SAE suppression; same result as the non-round form.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23104
    // Masked SAE variant: mask 0 → src lane; mask 1 → computed exponent.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23117
    // Zero-masked SAE variant: mask 0 → zero lane; mask 1 → computed exponent.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23129
23130    #[simd_test(enable = "avx512fp16,avx512vl")]
23131    unsafe fn test_mm_getmant_ph() {
23132        let a = _mm_set1_ph(10.0);
23133        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
23134        let e = _mm_set1_ph(1.25);
23135        assert_eq_m128h(r, e);
23136    }
23137
    // Masked getmant: even lanes get the 1.25 mantissa, odd lanes keep src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let src = _mm_set1_ph(20.0);
        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
        assert_eq_m128h(r, e);
    }
23146
    // Zero-masked getmant: unselected (odd) lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
        assert_eq_m128h(r, e);
    }
23154
23155    #[simd_test(enable = "avx512fp16,avx512vl")]
23156    unsafe fn test_mm256_getmant_ph() {
23157        let a = _mm256_set1_ph(10.0);
23158        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
23159        let e = _mm256_set1_ph(1.25);
23160        assert_eq_m256h(r, e);
23161    }
23162
    // 256-bit masked getmant with an alternating 16-bit mask.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let src = _mm256_set1_ph(20.0);
        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }
23178
    // 256-bit zero-masked getmant: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }
23191
    // 512-bit getmant: 10.0 = 1.25 * 2^3 → mantissa 1.25 in every lane.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
        let e = _mm512_set1_ph(1.25);
        assert_eq_m512h(r, e);
    }
23199
    // 512-bit masked getmant with an alternating 32-bit mask.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23216
    // 512-bit zero-masked getmant: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23230
    // getmant with SAE suppression: same 1.25 mantissa as the non-round form.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let r =
            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
                a,
            );
        let e = _mm512_set1_ph(1.25);
        assert_eq_m512h(r, e);
    }
23241
    // Masked getmant with SAE suppression; alternating mask merges src (20.0).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        let r = _mm512_mask_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23258
    // Zero-masked getmant with SAE suppression: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_maskz_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }
23273
    // Scalar getmant: lane 0 = mantissa of b[0] (10.0 → 1.25); rest from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23282
    // Masked scalar getmant: mask 0 → src lane 0; mask 1 → computed mantissa.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23295
    // Zero-masked scalar getmant: mask 0 → zero lane 0; mask 1 → mantissa.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23307
    // Scalar getmant with SAE suppression; same result as the non-round form.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
            a, b,
        );
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23318
    // Masked SAE scalar getmant: mask 0 → src lane; mask 1 → computed mantissa.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23339
    // Zero-masked SAE scalar getmant: mask 0 → zero lane; mask 1 → mantissa.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23359
23360    #[simd_test(enable = "avx512fp16,avx512vl")]
23361    unsafe fn test_mm_roundscale_ph() {
23362        let a = _mm_set1_ph(1.1);
23363        let r = _mm_roundscale_ph::<0>(a);
23364        let e = _mm_set1_ph(1.0);
23365        assert_eq_m128h(r, e);
23366    }
23367
    // Masked roundscale: even lanes rounded (1.1 → 1.0), odd lanes keep src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let src = _mm_set1_ph(2.0);
        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
        assert_eq_m128h(r, e);
    }
23376
    // Zero-masked roundscale: unselected (odd) lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }
23384
23385    #[simd_test(enable = "avx512fp16,avx512vl")]
23386    unsafe fn test_mm256_roundscale_ph() {
23387        let a = _mm256_set1_ph(1.1);
23388        let r = _mm256_roundscale_ph::<0>(a);
23389        let e = _mm256_set1_ph(1.0);
23390        assert_eq_m256h(r, e);
23391    }
23392
    // 256-bit masked roundscale with an alternating 16-bit mask.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let src = _mm256_set1_ph(2.0);
        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
23403
    // 256-bit zero-masked roundscale: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }
23413
23414    #[simd_test(enable = "avx512fp16")]
23415    unsafe fn test_mm512_roundscale_ph() {
23416        let a = _mm512_set1_ph(1.1);
23417        let r = _mm512_roundscale_ph::<0>(a);
23418        let e = _mm512_set1_ph(1.0);
23419        assert_eq_m512h(r, e);
23420    }
23421
    // 512-bit masked roundscale with an alternating 32-bit mask.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23433
    // 512-bit zero-masked roundscale: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23444
    // Roundscale with SAE suppression: same 1.1 → 1.0 result.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }
23452
    // Masked SAE roundscale; alternating mask merges src (2.0) into odd lanes.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23468
    // Zero-masked SAE roundscale: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }
23482
    // Scalar roundscale: lane 0 = round(b[0]) (1.1 → 1.0); lanes 1..8 from `a`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_roundscale_sh::<0>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23491
    // Masked scalar roundscale: mask 0 → src lane 0; mask 1 → rounded b[0].
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23504
    // Zero-masked scalar roundscale: mask 0 → zero lane 0; mask 1 → rounded.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23516
    // Scalar roundscale with SAE suppression; same 1.1 → 1.0 result.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23525
    // Masked SAE scalar roundscale: mask 0 → src lane; mask 1 → rounded b[0].
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23538
    // Zero-masked SAE scalar roundscale: mask 0 → zero lane; mask 1 → rounded.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23550
23551    #[simd_test(enable = "avx512fp16,avx512vl")]
23552    unsafe fn test_mm_scalef_ph() {
23553        let a = _mm_set1_ph(1.);
23554        let b = _mm_set1_ph(3.);
23555        let r = _mm_scalef_ph(a, b);
23556        let e = _mm_set1_ph(8.0);
23557        assert_eq_m128h(r, e);
23558    }
23559
    // Masked scalef: even lanes get 1 * 2^3 = 8, odd lanes keep src (2.0).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let src = _mm_set1_ph(2.);
        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
        assert_eq_m128h(r, e);
    }
23569
    // Zero-masked scalef: unselected (odd) lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }
23578
23579    #[simd_test(enable = "avx512fp16,avx512vl")]
23580    unsafe fn test_mm256_scalef_ph() {
23581        let a = _mm256_set1_ph(1.);
23582        let b = _mm256_set1_ph(3.);
23583        let r = _mm256_scalef_ph(a, b);
23584        let e = _mm256_set1_ph(8.0);
23585        assert_eq_m256h(r, e);
23586    }
23587
    // 256-bit masked scalef with an alternating 16-bit mask.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let src = _mm256_set1_ph(2.);
        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }
23599
    // 256-bit zero-masked scalef: unselected lanes are zeroed.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }
23610
23611    #[simd_test(enable = "avx512fp16")]
23612    unsafe fn test_mm512_scalef_ph() {
23613        let a = _mm512_set1_ph(1.);
23614        let b = _mm512_set1_ph(3.);
23615        let r = _mm512_scalef_ph(a, b);
23616        let e = _mm512_set1_ph(8.0);
23617        assert_eq_m512h(r, e);
23618    }
23619
    // 512-bit masked scalef with an alternating 32-bit mask.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let src = _mm512_set1_ph(2.);
        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }
23632
23633    #[simd_test(enable = "avx512fp16")]
23634    unsafe fn test_mm512_maskz_scalef_ph() {
23635        let a = _mm512_set1_ph(1.);
23636        let b = _mm512_set1_ph(3.);
23637        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
23638        let e = _mm512_set_ph(
23639            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23640            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23641        );
23642        assert_eq_m512h(r, e);
23643    }
23644
23645    #[simd_test(enable = "avx512fp16")]
23646    unsafe fn test_mm512_scalef_round_ph() {
23647        let a = _mm512_set1_ph(1.);
23648        let b = _mm512_set1_ph(3.);
23649        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23650        let e = _mm512_set1_ph(8.0);
23651        assert_eq_m512h(r, e);
23652    }
23653
23654    #[simd_test(enable = "avx512fp16")]
23655    unsafe fn test_mm512_mask_scalef_round_ph() {
23656        let a = _mm512_set1_ph(1.);
23657        let b = _mm512_set1_ph(3.);
23658        let src = _mm512_set1_ph(2.);
23659        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23660            src,
23661            0b01010101010101010101010101010101,
23662            a,
23663            b,
23664        );
23665        let e = _mm512_set_ph(
23666            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23667            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23668        );
23669        assert_eq_m512h(r, e);
23670    }
23671
23672    #[simd_test(enable = "avx512fp16")]
23673    unsafe fn test_mm512_maskz_scalef_round_ph() {
23674        let a = _mm512_set1_ph(1.);
23675        let b = _mm512_set1_ph(3.);
23676        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23677            0b01010101010101010101010101010101,
23678            a,
23679            b,
23680        );
23681        let e = _mm512_set_ph(
23682            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23683            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23684        );
23685        assert_eq_m512h(r, e);
23686    }
23687
    // Scalar `scalef_sh` tests: only lane 0 is computed (1.0 * 2^3 = 8.0);
    // lanes 1..8 of the result are copied from `a`, which the distinct filler
    // values (10..16 vs 20..26 vs 30..36) make observable.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_scalef_sh(a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit 0 clear: lane 0 comes from `src`.
        let r = _mm_mask_scalef_sh(src, 0, a, b);
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        // Mask bit 0 set: lane 0 is the computed scalef result.
        let r = _mm_mask_scalef_sh(src, 1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        // Zero-masking: lane 0 is zeroed when the mask bit is clear.
        let r = _mm_maskz_scalef_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_scalef_sh(1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r =
            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23761
    // `reduce_ph` (vreduceph) tests. The immediate `16 | _MM_FROUND_TO_ZERO`
    // sets imm8[7:4] = 1 (keep 1 fraction bit) with truncation, so 1.25 is
    // reduced to its remainder after truncating to a multiple of 0.5:
    // 1.25 - 1.0 = 0.25 in every computed lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm_set1_ph(0.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let src = _mm_set1_ph(2.0);
        // Alternating mask: inactive lanes keep `src` (2.0).
        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        // Zero-masking: inactive lanes become 0.0.
        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm256_set1_ph(0.25);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let src = _mm256_set1_ph(2.0);
        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm512_set1_ph(0.25);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }

    // `_round` variants add a second const generic for SAE (exception suppression).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
        let e = _mm512_set1_ph(0.25);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_reduce_round_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }
23891
    // Scalar `reduce_sh` tests: lane 0 of `b` (1.25) is reduced to 0.25 (same
    // immediate semantics as the `reduce_ph` tests above); lanes 1..8 of the
    // result come from `a`, verified by the distinct filler values.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        // Mask bit clear: lane 0 merges from `src`; mask bit set: computed result.
        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_reduce_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        // Zero-masking: lane 0 is zeroed when the mask bit is clear.
        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_reduce_round_sh() {
        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
        let r =
            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }
23965
    // Horizontal-reduction tests. add/mul use uniform powers of two so the f16
    // results are exact: 8 * 2 = 16, 16 * 2 = 32, 32 * 2 = 64; 2^8 = 256,
    // 2^16 = 65536, 2^32 overflows f16 but the expected 16777216.0 here is
    // 2^24, matching a tree-reduction product in wider precision.
    // NOTE(review): these are `const fn`s — presumably also exercised at
    // compile time by the const-eval test harness; confirm against the
    // declarations of `_mm*_reduce_add_ph` / `_mm*_reduce_mul_ph`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_reduce_add_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_reduce_add_ph(a);
        assert_eq!(r, 16.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_reduce_add_ph() {
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_reduce_add_ph(a);
        assert_eq!(r, 32.0);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_reduce_add_ph() {
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_reduce_add_ph(a);
        assert_eq!(r, 64.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_reduce_mul_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_reduce_mul_ph(a);
        assert_eq!(r, 256.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_reduce_mul_ph() {
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_reduce_mul_ph(a);
        assert_eq!(r, 65536.0);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_reduce_mul_ph() {
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_reduce_mul_ph(a);
        assert_eq!(r, 16777216.0);
    }

    // max/min use strictly increasing lane values so there is a unique extremum.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_max_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_reduce_max_ph(a);
        assert_eq!(r, 8.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_max_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_reduce_max_ph(a);
        assert_eq!(r, 16.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_max_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_reduce_max_ph(a);
        assert_eq!(r, 32.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_min_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_min_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_min_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }
24061
    // `fpclass` tests. The immediate 0x18 selects the infinity categories
    // (+inf and -inf), so only the INFINITY/NEG_INFINITY lanes set their mask
    // bit. With `set_ph` the first argument is the highest lane, putting the
    // two infinities at lanes 6 and 5 of each 8-lane group — hence the
    // repeating `011·····` pattern in the expected masks.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        // Input mask 0b01010101 keeps only even lanes, so of the two
        // infinities only lane 6 survives.
        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
        assert_eq!(r, 0b01000000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b0110000001100000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
        assert_eq!(r, 0b0100000001000000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000011000000110000001100000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
        assert_eq!(r, 0b01000000010000000100000001000000);
    }

    // Scalar variant: classifies only lane 0.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fpclass_sh_mask() {
        let a = _mm_set_sh(f16::INFINITY);
        let r = _mm_fpclass_sh_mask::<0x18>(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fpclass_sh_mask() {
        let a = _mm_set_sh(f16::INFINITY);
        // A cleared input mask forces a zero result even for a matching class.
        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
        assert_eq!(r, 0);
        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
        assert_eq!(r, 1);
    }
24237
    // `mask_blend` tests: each result lane is taken from `b` where the mask
    // bit is set and from `a` where it is clear. Using `a` positive and
    // `b = -a` makes the selected source visible in the sign of each lane.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm_mask_blend_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
        let r = _mm_mask_blend_ph(0b01010101, a, b);
        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    const unsafe fn test_mm256_mask_blend_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0,
        );
        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    const unsafe fn test_mm512_mask_blend_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
            29.0, -30.0, 31.0, -32.0,
        );
        assert_eq_m512h(r, e);
    }
24284
    // Permute tests. `permutex2var` selects from the concatenation of `a` and
    // `b` (indices >= lane-count reach into `b`); `permutexvar` shuffles a
    // single vector. Lane values equal index+1 so results are easy to trace.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_permutex2var_ph() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        // Even indices 0..14 pick every other element of the a|b concatenation.
        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
        let r = _mm_permutex2var_ph(a, idx, b);
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_permutex2var_ph() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_setr_ph(
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
        let r = _mm256_permutex2var_ph(a, idx, b);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_permutex2var_ph() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_setr_ph(
            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
            61.0, 62.0, 63.0, 64.0,
        );
        // Built with `set` (highest lane first), so lane i holds index 2*i.
        let idx = _mm512_set_epi16(
            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        let r = _mm512_permutex2var_ph(a, idx, b);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
            59.0, 61.0, 63.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_permutexvar_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
        let r = _mm_permutexvar_ph(idx, a);
        // Gathers odd-indexed then even-indexed lanes of `a` (a[i] = 8 - i).
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_permutexvar_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        let r = _mm256_permutexvar_ph(idx, a);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_permutexvar_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm512_set_epi16(
            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        let r = _mm512_permutexvar_ph(idx, a);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
            30.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
24379
    // `cvtepi16_ph` tests: signed 16-bit integers are converted lane-wise to
    // half-precision floats; the small values 1..16 are exactly representable
    // in f16, so the expected vectors are the same values as floats.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtepi16_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        // Alternating mask: inactive lanes keep the corresponding `src` value.
        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        // Zero-masking: inactive lanes are zeroed.
        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_cvtepi16_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }
24427
24428    #[simd_test(enable = "avx512fp16,avx512vl")]
24429    unsafe fn test_mm256_maskz_cvtepi16_ph() {
24430        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24431        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
24432        let e = _mm256_set_ph(
24433            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24434        );
24435        assert_eq_m256h(r, e);
24436    }
24437
24438    #[simd_test(enable = "avx512fp16")]
24439    unsafe fn test_mm512_cvtepi16_ph() {
24440        let a = _mm512_set_epi16(
24441            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24442            25, 26, 27, 28, 29, 30, 31, 32,
24443        );
24444        let r = _mm512_cvtepi16_ph(a);
24445        let e = _mm512_set_ph(
24446            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24447            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24448            31.0, 32.0,
24449        );
24450        assert_eq_m512h(r, e);
24451    }
24452
24453    #[simd_test(enable = "avx512fp16")]
24454    unsafe fn test_mm512_mask_cvtepi16_ph() {
24455        let a = _mm512_set_epi16(
24456            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24457            25, 26, 27, 28, 29, 30, 31, 32,
24458        );
24459        let src = _mm512_set_ph(
24460            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24461            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24462        );
24463        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
24464        let e = _mm512_set_ph(
24465            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24466            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24467        );
24468        assert_eq_m512h(r, e);
24469    }
24470
24471    #[simd_test(enable = "avx512fp16")]
24472    unsafe fn test_mm512_maskz_cvtepi16_ph() {
24473        let a = _mm512_set_epi16(
24474            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24475            25, 26, 27, 28, 29, 30, 31, 32,
24476        );
24477        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
24478        let e = _mm512_set_ph(
24479            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24480            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24481        );
24482        assert_eq_m512h(r, e);
24483    }
24484
24485    #[simd_test(enable = "avx512fp16")]
24486    unsafe fn test_mm512_cvt_roundepi16_ph() {
24487        let a = _mm512_set_epi16(
24488            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24489            25, 26, 27, 28, 29, 30, 31, 32,
24490        );
24491        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24492        let e = _mm512_set_ph(
24493            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24494            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24495            31.0, 32.0,
24496        );
24497        assert_eq_m512h(r, e);
24498    }
24499
24500    #[simd_test(enable = "avx512fp16")]
24501    unsafe fn test_mm512_mask_cvt_roundepi16_ph() {
24502        let a = _mm512_set_epi16(
24503            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24504            25, 26, 27, 28, 29, 30, 31, 32,
24505        );
24506        let src = _mm512_set_ph(
24507            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24508            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24509        );
24510        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24511            src,
24512            0b01010101010101010101010101010101,
24513            a,
24514        );
24515        let e = _mm512_set_ph(
24516            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24517            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24518        );
24519        assert_eq_m512h(r, e);
24520    }
24521
24522    #[simd_test(enable = "avx512fp16")]
24523    unsafe fn test_mm512_maskz_cvt_roundepi16_ph() {
24524        let a = _mm512_set_epi16(
24525            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24526            25, 26, 27, 28, 29, 30, 31, 32,
24527        );
24528        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24529            0b01010101010101010101010101010101,
24530            a,
24531        );
24532        let e = _mm512_set_ph(
24533            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24534            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24535        );
24536        assert_eq_m512h(r, e);
24537    }
24538
    // Unsigned u16 -> f16 conversion, unmasked. Inputs are small positives,
    // so expected values mirror the signed cvtepi16 tests.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtepu16_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masking: set mask bits take the converted value, the rest take src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    // Zero-masking: unselected lanes become 0.0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m128h(r, e);
    }

    // 256-bit unmasked u16 -> f16 conversion.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_cvtepu16_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit merge-masking with an alternating mask.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    // 256-bit zero-masking with an alternating mask.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit unmasked u16 -> f16 conversion over 32 lanes.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvtepu16_ph(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit merge-masking with an alternating 32-bit mask.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    // 512-bit zero-masking with an alternating 32-bit mask.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }

    // Explicit-rounding variant (round-to-nearest, suppress exceptions);
    // expected values match the default-rounding test.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    // Explicit rounding + merge-masking combined.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    // Explicit rounding + zero-masking combined.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
24697
    // i32 -> f16 narrows the element count: a 128-bit i32 input yields only
    // four f16 lanes, so the upper half of the 128-bit result is zero.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepi32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_cvtepi32_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masking over the four live lanes; the upper half stays zero.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepi32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
        assert_eq_m128h(r, e);
    }

    // Zero-masking over the four live lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
        assert_eq_m128h(r, e);
    }

    // Eight i32 lanes narrow into a full 128-bit f16 vector.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtepi32_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit source, merge-masked into a 128-bit f16 src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    // 256-bit source, zero-masked.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // Sixteen i32 lanes narrow into a 256-bit f16 vector.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvtepi32_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // NOTE(review): the masked/rounded _mm512_ tests below enable
    // "avx512fp16,avx512vl" while test_mm512_cvtepi32_ph above enables only
    // "avx512fp16" — the gating is inconsistent; confirm which feature set the
    // 512-bit intrinsics and the assert_eq_m256h helper actually require.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_mask_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit source, zero-masked into a 256-bit f16 result.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_maskz_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // Explicit-rounding variant; values match the default-rounding test.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // Explicit rounding + merge-masking combined.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_mask_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    // Explicit rounding + zero-masking combined.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_maskz_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
24820
    // Scalar i32 -> f16: the converted value replaces lane 0, the remaining
    // seven lanes pass through from a unchanged.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvti32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvti32_sh(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // Scalar conversion with an explicit rounding policy; same expected result.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvt_roundi32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }
24836
    // Unsigned u32 -> f16; like the signed tests, four converted lanes plus a
    // zeroed upper half in the 128-bit case.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepu32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_cvtepu32_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masking over the four live lanes; the upper half stays zero.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepu32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
        assert_eq_m128h(r, e);
    }

    // Zero-masking over the four live lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepu32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
        assert_eq_m128h(r, e);
    }

    // Eight u32 lanes narrow into a full 128-bit f16 vector.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtepu32_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit source, merge-masked into a 128-bit f16 src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    // 256-bit source, zero-masked.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // NOTE(review): every _mm512_ test in this epu32 family enables
    // "avx512fp16,avx512vl", while the corresponding epi16/epu16 512-bit tests
    // enable only "avx512fp16" — confirm whether the extra avx512vl gate is
    // actually required here or just inconsistent.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvtepu32_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit source, merge-masked into a 256-bit f16 src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_mask_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // 512-bit source, zero-masked into a 256-bit f16 result.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_maskz_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // Explicit-rounding variant; values match the default-rounding test.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    // Explicit rounding + merge-masking combined.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_mask_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m256h(r, e);
    }

    // Explicit rounding + zero-masking combined.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_maskz_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }
24960
    // Scalar u32 -> f16: converted value replaces lane 0, upper lanes kept.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtu32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtu32_sh(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // Scalar conversion with an explicit rounding policy; same expected result.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvt_roundu32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }
24976
    // i64 -> f16: a 128-bit i64 input yields only two f16 lanes; the six
    // upper lanes of the 128-bit result are zero.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepi64_ph() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm_cvtepi64_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        assert_eq_m128h(r, e);
    }

    // Merge-masking over the two live lanes; the rest stay zero.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepi64_ph() {
        let a = _mm_set_epi64x(1, 2);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
        assert_eq_m128h(r, e);
    }

    // Zero-masking over the two live lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi64_ph() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm_maskz_cvtepi64_ph(0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
        assert_eq_m128h(r, e);
    }

    // Four i64 lanes narrow to four f16 lanes; upper half zero.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepi64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_cvtepi64_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit source, merge-masked over the four live lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
        assert_eq_m128h(r, e);
    }

    // 256-bit source, zero-masked over the four live lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
        assert_eq_m128h(r, e);
    }

    // Eight i64 lanes narrow into a full 128-bit f16 vector.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_cvtepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvtepi64_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // 512-bit source, merge-masked into a 128-bit f16 src.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_mask_cvtepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    // 512-bit source, zero-masked.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_maskz_cvtepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    // Explicit-rounding variant; values match the default-rounding test.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    // NOTE(review): this test enables only "avx512fp16" while every sibling
    // in this epi64 family also enables "avx512vl" — the gating is
    // inconsistent; confirm which is correct for these 512-bit intrinsics and
    // the assert helpers.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    // Explicit rounding + zero-masking combined.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm512_maskz_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }
25080
25081    #[simd_test(enable = "avx512fp16,avx512vl")]
25082    unsafe fn test_mm_cvtepu64_ph() {
25083        let a = _mm_set_epi64x(1, 2);
25084        let r = _mm_cvtepu64_ph(a);
25085        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25086        assert_eq_m128h(r, e);
25087    }
25088
25089    #[simd_test(enable = "avx512fp16,avx512vl")]
25090    unsafe fn test_mm_mask_cvtepu64_ph() {
25091        let a = _mm_set_epi64x(1, 2);
25092        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25093        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
25094        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25095        assert_eq_m128h(r, e);
25096    }
25097
25098    #[simd_test(enable = "avx512fp16,avx512vl")]
25099    unsafe fn test_mm_maskz_cvtepu64_ph() {
25100        let a = _mm_set_epi64x(1, 2);
25101        let r = _mm_maskz_cvtepu64_ph(0b01, a);
25102        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25103        assert_eq_m128h(r, e);
25104    }
25105
25106    #[simd_test(enable = "avx512fp16,avx512vl")]
25107    unsafe fn test_mm256_cvtepu64_ph() {
25108        let a = _mm256_set_epi64x(1, 2, 3, 4);
25109        let r = _mm256_cvtepu64_ph(a);
25110        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25111        assert_eq_m128h(r, e);
25112    }
25113
25114    #[simd_test(enable = "avx512fp16,avx512vl")]
25115    unsafe fn test_mm256_mask_cvtepu64_ph() {
25116        let a = _mm256_set_epi64x(1, 2, 3, 4);
25117        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25118        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
25119        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25120        assert_eq_m128h(r, e);
25121    }
25122
25123    #[simd_test(enable = "avx512fp16,avx512vl")]
25124    unsafe fn test_mm256_maskz_cvtepu64_ph() {
25125        let a = _mm256_set_epi64x(1, 2, 3, 4);
25126        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
25127        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25128        assert_eq_m128h(r, e);
25129    }
25130
25131    #[simd_test(enable = "avx512fp16,avx512vl")]
25132    unsafe fn test_mm512_cvtepu64_ph() {
25133        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25134        let r = _mm512_cvtepu64_ph(a);
25135        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25136        assert_eq_m128h(r, e);
25137    }
25138
25139    #[simd_test(enable = "avx512fp16,avx512vl")]
25140    unsafe fn test_mm512_mask_cvtepu64_ph() {
25141        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25142        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25143        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
25144        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25145        assert_eq_m128h(r, e);
25146    }
25147
25148    #[simd_test(enable = "avx512fp16,avx512vl")]
25149    unsafe fn test_mm512_maskz_cvtepu64_ph() {
25150        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25151        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
25152        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25153        assert_eq_m128h(r, e);
25154    }
25155
25156    #[simd_test(enable = "avx512fp16,avx512vl")]
25157    unsafe fn test_mm512_cvt_roundepu64_ph() {
25158        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25159        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25160        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25161        assert_eq_m128h(r, e);
25162    }
25163
25164    #[simd_test(enable = "avx512fp16,avx512vl")]
25165    unsafe fn test_mm512_mask_cvt_roundepu64_ph() {
25166        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25167        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25168        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25169            src, 0b01010101, a,
25170        );
25171        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25172        assert_eq_m128h(r, e);
25173    }
25174
25175    #[simd_test(enable = "avx512fp16,avx512vl")]
25176    unsafe fn test_mm512_maskz_cvt_roundepu64_ph() {
25177        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
25178        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25179            0b01010101, a,
25180        );
25181        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25182        assert_eq_m128h(r, e);
25183    }
25184
25185    #[simd_test(enable = "avx512fp16,avx512vl")]
25186    unsafe fn test_mm_cvtxps_ph() {
25187        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25188        let r = _mm_cvtxps_ph(a);
25189        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25190        assert_eq_m128h(r, e);
25191    }
25192
25193    #[simd_test(enable = "avx512fp16,avx512vl")]
25194    unsafe fn test_mm_mask_cvtxps_ph() {
25195        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25196        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25197        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
25198        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
25199        assert_eq_m128h(r, e);
25200    }
25201
25202    #[simd_test(enable = "avx512fp16,avx512vl")]
25203    unsafe fn test_mm_maskz_cvtxps_ph() {
25204        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
25205        let r = _mm_maskz_cvtxps_ph(0b0101, a);
25206        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25207        assert_eq_m128h(r, e);
25208    }
25209
25210    #[simd_test(enable = "avx512fp16,avx512vl")]
25211    unsafe fn test_mm256_cvtxps_ph() {
25212        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25213        let r = _mm256_cvtxps_ph(a);
25214        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25215        assert_eq_m128h(r, e);
25216    }
25217
25218    #[simd_test(enable = "avx512fp16,avx512vl")]
25219    unsafe fn test_mm256_mask_cvtxps_ph() {
25220        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25221        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25222        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
25223        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25224        assert_eq_m128h(r, e);
25225    }
25226
25227    #[simd_test(enable = "avx512fp16,avx512vl")]
25228    unsafe fn test_mm256_maskz_cvtxps_ph() {
25229        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25230        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
25231        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
25232        assert_eq_m128h(r, e);
25233    }
25234
25235    #[simd_test(enable = "avx512fp16,avx512vl")]
25236    unsafe fn test_mm512_cvtxps_ph() {
25237        let a = _mm512_set_ps(
25238            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25239        );
25240        let r = _mm512_cvtxps_ph(a);
25241        let e = _mm256_set_ph(
25242            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25243        );
25244        assert_eq_m256h(r, e);
25245    }
25246
25247    #[simd_test(enable = "avx512fp16,avx512vl")]
25248    unsafe fn test_mm512_mask_cvtxps_ph() {
25249        let a = _mm512_set_ps(
25250            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25251        );
25252        let src = _mm256_set_ph(
25253            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
25254        );
25255        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
25256        let e = _mm256_set_ph(
25257            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
25258        );
25259        assert_eq_m256h(r, e);
25260    }
25261
25262    #[simd_test(enable = "avx512fp16,avx512vl")]
25263    unsafe fn test_mm512_maskz_cvtxps_ph() {
25264        let a = _mm512_set_ps(
25265            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25266        );
25267        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
25268        let e = _mm256_set_ph(
25269            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25270        );
25271        assert_eq_m256h(r, e);
25272    }
25273
25274    #[simd_test(enable = "avx512fp16,avx512vl")]
25275    unsafe fn test_mm512_cvtx_roundps_ph() {
25276        let a = _mm512_set_ps(
25277            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25278        );
25279        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25280        let e = _mm256_set_ph(
25281            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25282        );
25283        assert_eq_m256h(r, e);
25284    }
25285
25286    #[simd_test(enable = "avx512fp16,avx512vl")]
25287    unsafe fn test_mm512_mask_cvtx_roundps_ph() {
25288        let a = _mm512_set_ps(
25289            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25290        );
25291        let src = _mm256_set_ph(
25292            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
25293        );
25294        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25295            src,
25296            0b0101010101010101,
25297            a,
25298        );
25299        let e = _mm256_set_ph(
25300            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
25301            16.0,
25302        );
25303        assert_eq_m256h(r, e);
25304    }
25305
25306    #[simd_test(enable = "avx512fp16,avx512vl")]
25307    unsafe fn test_mm512_maskz_cvtx_roundps_ph() {
25308        let a = _mm512_set_ps(
25309            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25310        );
25311        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25312            0b0101010101010101,
25313            a,
25314        );
25315        let e = _mm256_set_ph(
25316            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25317        );
25318        assert_eq_m256h(r, e);
25319    }
25320
25321    #[simd_test(enable = "avx512fp16,avx512vl")]
25322    unsafe fn test_mm_cvtss_sh() {
25323        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25324        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25325        let r = _mm_cvtss_sh(a, b);
25326        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25327        assert_eq_m128h(r, e);
25328    }
25329
25330    #[simd_test(enable = "avx512fp16,avx512vl")]
25331    unsafe fn test_mm_mask_cvtss_sh() {
25332        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25333        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25334        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25335        let r = _mm_mask_cvtss_sh(src, 0, a, b);
25336        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25337        assert_eq_m128h(r, e);
25338        let r = _mm_mask_cvtss_sh(src, 1, a, b);
25339        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25340        assert_eq_m128h(r, e);
25341    }
25342
25343    #[simd_test(enable = "avx512fp16,avx512vl")]
25344    unsafe fn test_mm_maskz_cvtss_sh() {
25345        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25346        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25347        let r = _mm_maskz_cvtss_sh(0, a, b);
25348        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25349        assert_eq_m128h(r, e);
25350        let r = _mm_maskz_cvtss_sh(1, a, b);
25351        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25352        assert_eq_m128h(r, e);
25353    }
25354
25355    #[simd_test(enable = "avx512fp16,avx512vl")]
25356    unsafe fn test_mm_cvt_roundss_sh() {
25357        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25358        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25359        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25360        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25361        assert_eq_m128h(r, e);
25362    }
25363
25364    #[simd_test(enable = "avx512fp16,avx512vl")]
25365    unsafe fn test_mm_mask_cvt_roundss_sh() {
25366        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25367        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25368        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25369        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25370            src, 0, a, b,
25371        );
25372        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25373        assert_eq_m128h(r, e);
25374        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25375            src, 1, a, b,
25376        );
25377        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25378        assert_eq_m128h(r, e);
25379    }
25380
25381    #[simd_test(enable = "avx512fp16,avx512vl")]
25382    unsafe fn test_mm_maskz_cvt_roundss_sh() {
25383        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25384        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25385        let r =
25386            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25387        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25388        assert_eq_m128h(r, e);
25389        let r =
25390            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25391        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25392        assert_eq_m128h(r, e);
25393    }
25394
25395    #[simd_test(enable = "avx512fp16,avx512vl")]
25396    unsafe fn test_mm_cvtpd_ph() {
25397        let a = _mm_set_pd(1.0, 2.0);
25398        let r = _mm_cvtpd_ph(a);
25399        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25400        assert_eq_m128h(r, e);
25401    }
25402
25403    #[simd_test(enable = "avx512fp16,avx512vl")]
25404    unsafe fn test_mm_mask_cvtpd_ph() {
25405        let a = _mm_set_pd(1.0, 2.0);
25406        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25407        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
25408        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25409        assert_eq_m128h(r, e);
25410    }
25411
25412    #[simd_test(enable = "avx512fp16,avx512vl")]
25413    unsafe fn test_mm_maskz_cvtpd_ph() {
25414        let a = _mm_set_pd(1.0, 2.0);
25415        let r = _mm_maskz_cvtpd_ph(0b01, a);
25416        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25417        assert_eq_m128h(r, e);
25418    }
25419
25420    #[simd_test(enable = "avx512fp16,avx512vl")]
25421    unsafe fn test_mm256_cvtpd_ph() {
25422        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25423        let r = _mm256_cvtpd_ph(a);
25424        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25425        assert_eq_m128h(r, e);
25426    }
25427
25428    #[simd_test(enable = "avx512fp16,avx512vl")]
25429    unsafe fn test_mm256_mask_cvtpd_ph() {
25430        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25431        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25432        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
25433        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25434        assert_eq_m128h(r, e);
25435    }
25436
25437    #[simd_test(enable = "avx512fp16,avx512vl")]
25438    unsafe fn test_mm256_maskz_cvtpd_ph() {
25439        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25440        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
25441        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25442        assert_eq_m128h(r, e);
25443    }
25444
25445    #[simd_test(enable = "avx512fp16,avx512vl")]
25446    unsafe fn test_mm512_cvtpd_ph() {
25447        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25448        let r = _mm512_cvtpd_ph(a);
25449        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25450        assert_eq_m128h(r, e);
25451    }
25452
25453    #[simd_test(enable = "avx512fp16,avx512vl")]
25454    unsafe fn test_mm512_mask_cvtpd_ph() {
25455        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25456        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25457        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
25458        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25459        assert_eq_m128h(r, e);
25460    }
25461
25462    #[simd_test(enable = "avx512fp16,avx512vl")]
25463    unsafe fn test_mm512_maskz_cvtpd_ph() {
25464        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25465        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
25466        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25467        assert_eq_m128h(r, e);
25468    }
25469
25470    #[simd_test(enable = "avx512fp16,avx512vl")]
25471    unsafe fn test_mm512_cvt_roundpd_ph() {
25472        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25473        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25474        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25475        assert_eq_m128h(r, e);
25476    }
25477
25478    #[simd_test(enable = "avx512fp16,avx512vl")]
25479    unsafe fn test_mm512_mask_cvt_roundpd_ph() {
25480        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25481        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25482        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25483            src, 0b01010101, a,
25484        );
25485        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25486        assert_eq_m128h(r, e);
25487    }
25488
25489    #[simd_test(enable = "avx512fp16,avx512vl")]
25490    unsafe fn test_mm512_maskz_cvt_roundpd_ph() {
25491        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25492        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25493            0b01010101, a,
25494        );
25495        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25496        assert_eq_m128h(r, e);
25497    }
25498
25499    #[simd_test(enable = "avx512fp16,avx512vl")]
25500    unsafe fn test_mm_cvtsd_sh() {
25501        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25502        let b = _mm_setr_pd(1.0, 2.0);
25503        let r = _mm_cvtsd_sh(a, b);
25504        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25505        assert_eq_m128h(r, e);
25506    }
25507
25508    #[simd_test(enable = "avx512fp16,avx512vl")]
25509    unsafe fn test_mm_mask_cvtsd_sh() {
25510        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25511        let b = _mm_setr_pd(1.0, 2.0);
25512        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25513        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
25514        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25515        assert_eq_m128h(r, e);
25516        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
25517        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25518        assert_eq_m128h(r, e);
25519    }
25520
25521    #[simd_test(enable = "avx512fp16,avx512vl")]
25522    unsafe fn test_mm_maskz_cvtsd_sh() {
25523        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25524        let b = _mm_setr_pd(1.0, 2.0);
25525        let r = _mm_maskz_cvtsd_sh(0, a, b);
25526        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25527        assert_eq_m128h(r, e);
25528        let r = _mm_maskz_cvtsd_sh(1, a, b);
25529        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25530        assert_eq_m128h(r, e);
25531    }
25532
25533    #[simd_test(enable = "avx512fp16,avx512vl")]
25534    unsafe fn test_mm_cvt_roundsd_sh() {
25535        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25536        let b = _mm_setr_pd(1.0, 2.0);
25537        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25538        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25539        assert_eq_m128h(r, e);
25540    }
25541
25542    #[simd_test(enable = "avx512fp16,avx512vl")]
25543    unsafe fn test_mm_mask_cvt_roundsd_sh() {
25544        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25545        let b = _mm_setr_pd(1.0, 2.0);
25546        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25547        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25548            src, 0, a, b,
25549        );
25550        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25551        assert_eq_m128h(r, e);
25552        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25553            src, 1, a, b,
25554        );
25555        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25556        assert_eq_m128h(r, e);
25557    }
25558
25559    #[simd_test(enable = "avx512fp16,avx512vl")]
25560    unsafe fn test_mm_maskz_cvt_roundsd_sh() {
25561        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25562        let b = _mm_setr_pd(1.0, 2.0);
25563        let r =
25564            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25565        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25566        assert_eq_m128h(r, e);
25567        let r =
25568            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25569        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25570        assert_eq_m128h(r, e);
25571    }
25572
25573    #[simd_test(enable = "avx512fp16,avx512vl")]
25574    unsafe fn test_mm_cvtph_epi16() {
25575        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25576        let r = _mm_cvttph_epi16(a);
25577        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25578        assert_eq_m128i(r, e);
25579    }
25580
25581    #[simd_test(enable = "avx512fp16,avx512vl")]
25582    unsafe fn test_mm_mask_cvtph_epi16() {
25583        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25584        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25585        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25586        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25587        assert_eq_m128i(r, e);
25588    }
25589
25590    #[simd_test(enable = "avx512fp16,avx512vl")]
25591    unsafe fn test_mm_maskz_cvtph_epi16() {
25592        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25593        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25594        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25595        assert_eq_m128i(r, e);
25596    }
25597
25598    #[simd_test(enable = "avx512fp16,avx512vl")]
25599    unsafe fn test_mm256_cvtph_epi16() {
25600        let a = _mm256_set_ph(
25601            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25602        );
25603        let r = _mm256_cvttph_epi16(a);
25604        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25605        assert_eq_m256i(r, e);
25606    }
25607
25608    #[simd_test(enable = "avx512fp16,avx512vl")]
25609    unsafe fn test_mm256_mask_cvtph_epi16() {
25610        let a = _mm256_set_ph(
25611            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25612        );
25613        let src = _mm256_set_epi16(
25614            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25615        );
25616        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25617        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25618        assert_eq_m256i(r, e);
25619    }
25620
25621    #[simd_test(enable = "avx512fp16,avx512vl")]
25622    unsafe fn test_mm256_maskz_cvtph_epi16() {
25623        let a = _mm256_set_ph(
25624            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25625        );
25626        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25627        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25628        assert_eq_m256i(r, e);
25629    }
25630
25631    #[simd_test(enable = "avx512fp16")]
25632    unsafe fn test_mm512_cvtph_epi16() {
25633        let a = _mm512_set_ph(
25634            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25635            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25636            31.0, 32.0,
25637        );
25638        let r = _mm512_cvttph_epi16(a);
25639        let e = _mm512_set_epi16(
25640            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25641            25, 26, 27, 28, 29, 30, 31, 32,
25642        );
25643        assert_eq_m512i(r, e);
25644    }
25645
25646    #[simd_test(enable = "avx512fp16")]
25647    unsafe fn test_mm512_mask_cvtph_epi16() {
25648        let a = _mm512_set_ph(
25649            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25650            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25651            31.0, 32.0,
25652        );
25653        let src = _mm512_set_epi16(
25654            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25655            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25656        );
25657        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25658        let e = _mm512_set_epi16(
25659            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25660            24, 34, 26, 36, 28, 38, 30, 40, 32,
25661        );
25662        assert_eq_m512i(r, e);
25663    }
25664
25665    #[simd_test(enable = "avx512fp16")]
25666    unsafe fn test_mm512_maskz_cvtph_epi16() {
25667        let a = _mm512_set_ph(
25668            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25669            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25670            31.0, 32.0,
25671        );
25672        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25673        let e = _mm512_set_epi16(
25674            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25675            0, 28, 0, 30, 0, 32,
25676        );
25677        assert_eq_m512i(r, e);
25678    }
25679
25680    #[simd_test(enable = "avx512fp16")]
25681    unsafe fn test_mm512_cvt_roundph_epi16() {
25682        let a = _mm512_set_ph(
25683            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25684            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25685            31.0, 32.0,
25686        );
25687        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25688        let e = _mm512_set_epi16(
25689            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25690            25, 26, 27, 28, 29, 30, 31, 32,
25691        );
25692        assert_eq_m512i(r, e);
25693    }
25694
25695    #[simd_test(enable = "avx512fp16")]
25696    unsafe fn test_mm512_mask_cvt_roundph_epi16() {
25697        let a = _mm512_set_ph(
25698            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25699            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25700            31.0, 32.0,
25701        );
25702        let src = _mm512_set_epi16(
25703            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25704            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25705        );
25706        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25707            src,
25708            0b01010101010101010101010101010101,
25709            a,
25710        );
25711        let e = _mm512_set_epi16(
25712            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25713            24, 34, 26, 36, 28, 38, 30, 40, 32,
25714        );
25715        assert_eq_m512i(r, e);
25716    }
25717
25718    #[simd_test(enable = "avx512fp16")]
25719    unsafe fn test_mm512_maskz_cvt_roundph_epi16() {
25720        let a = _mm512_set_ph(
25721            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25722            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25723            31.0, 32.0,
25724        );
25725        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25726            0b01010101010101010101010101010101,
25727            a,
25728        );
25729        let e = _mm512_set_epi16(
25730            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25731            0, 28, 0, 30, 0, 32,
25732        );
25733        assert_eq_m512i(r, e);
25734    }
25735
25736    #[simd_test(enable = "avx512fp16,avx512vl")]
25737    unsafe fn test_mm_cvtph_epu16() {
25738        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25739        let r = _mm_cvttph_epu16(a);
25740        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25741        assert_eq_m128i(r, e);
25742    }
25743
25744    #[simd_test(enable = "avx512fp16,avx512vl")]
25745    unsafe fn test_mm_mask_cvtph_epu16() {
25746        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25747        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25748        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25749        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25750        assert_eq_m128i(r, e);
25751    }
25752
25753    #[simd_test(enable = "avx512fp16,avx512vl")]
25754    unsafe fn test_mm_maskz_cvtph_epu16() {
25755        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25756        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25757        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25758        assert_eq_m128i(r, e);
25759    }
25760
25761    #[simd_test(enable = "avx512fp16,avx512vl")]
25762    unsafe fn test_mm256_cvtph_epu16() {
25763        let a = _mm256_set_ph(
25764            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25765        );
25766        let r = _mm256_cvttph_epu16(a);
25767        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25768        assert_eq_m256i(r, e);
25769    }
25770
25771    #[simd_test(enable = "avx512fp16,avx512vl")]
25772    unsafe fn test_mm256_mask_cvtph_epu16() {
25773        let a = _mm256_set_ph(
25774            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25775        );
25776        let src = _mm256_set_epi16(
25777            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25778        );
25779        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25780        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25781        assert_eq_m256i(r, e);
25782    }
25783
25784    #[simd_test(enable = "avx512fp16,avx512vl")]
25785    unsafe fn test_mm256_maskz_cvtph_epu16() {
25786        let a = _mm256_set_ph(
25787            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25788        );
25789        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25790        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25791        assert_eq_m256i(r, e);
25792    }
25793
25794    #[simd_test(enable = "avx512fp16")]
25795    unsafe fn test_mm512_cvtph_epu16() {
25796        let a = _mm512_set_ph(
25797            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25798            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25799            31.0, 32.0,
25800        );
25801        let r = _mm512_cvttph_epu16(a);
25802        let e = _mm512_set_epi16(
25803            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25804            25, 26, 27, 28, 29, 30, 31, 32,
25805        );
25806        assert_eq_m512i(r, e);
25807    }
25808
25809    #[simd_test(enable = "avx512fp16")]
25810    unsafe fn test_mm512_mask_cvtph_epu16() {
25811        let a = _mm512_set_ph(
25812            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25813            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25814            31.0, 32.0,
25815        );
25816        let src = _mm512_set_epi16(
25817            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25818            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25819        );
25820        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25821        let e = _mm512_set_epi16(
25822            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25823            24, 34, 26, 36, 28, 38, 30, 40, 32,
25824        );
25825        assert_eq_m512i(r, e);
25826    }
25827
25828    #[simd_test(enable = "avx512fp16")]
25829    unsafe fn test_mm512_maskz_cvtph_epu16() {
25830        let a = _mm512_set_ph(
25831            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25832            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25833            31.0, 32.0,
25834        );
25835        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25836        let e = _mm512_set_epi16(
25837            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25838            0, 28, 0, 30, 0, 32,
25839        );
25840        assert_eq_m512i(r, e);
25841    }
25842
    // `_mm512_cvt_roundph_epu16` converts f16 -> u16 with an explicit rounding
    // mode; the inputs here are exact integers, so round-to-nearest with
    // suppressed exceptions reproduces them unchanged.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    // Merging-masked variant: even lanes (mask bits set) are converted, odd
    // lanes are taken from `src`.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    // Zero-masked variant: lanes with a cleared mask bit are zeroed.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }
25898
    // Truncating f16 -> i16 conversion, 128-bit: all eight lanes converted.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttph_epi16(a);
        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    // Merging-masked: even lanes converted, odd lanes taken from `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m128i(r, e);
    }

    // Zero-masked: unselected lanes become 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m128i(r, e);
    }

    // 256-bit truncating conversion: sixteen lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvttph_epi16(a);
        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m256i(r, e);
    }

    // 512-bit truncating conversion: thirty-two lanes.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvttph_epi16(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    // Truncation takes only the SAE flag (`_MM_FROUND_NO_EXC`) — there is no
    // rounding-mode choice, since the result is always truncated toward zero.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }
26061
    // Truncating f16 -> u16 conversion, 128-bit: all eight lanes converted.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttph_epu16(a);
        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    // Merging-masked: even lanes converted, odd lanes taken from `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m128i(r, e);
    }

    // Zero-masked: unselected lanes become 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m128i(r, e);
    }

    // 256-bit truncating conversion: sixteen lanes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvttph_epu16(a);
        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m256i(r, e);
    }

    // 512-bit truncating conversion: thirty-two lanes.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvttph_epu16(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    // Truncation takes only the SAE flag (`_MM_FROUND_NO_EXC`); the result is
    // always truncated toward zero, so no rounding mode is accepted.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }
26224
    // f16 -> i32 widens, so only the low half of the f16 source participates:
    // a 128-bit epi32 result consumes four f16 lanes (upper four are padding).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epi32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtph_epi32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    // Merging-masked: even lanes converted, odd lanes taken from `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epi32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }

    // Zero-masked: unselected lanes become 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epi32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtph_epi32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }

    // 256-bit epi32 result from a full 128-bit f16 source (8 lanes).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtph_epi32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }

    // 512-bit epi32 result from a full 256-bit f16 source (16 lanes).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtph_epi32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    // Explicit rounding-mode variants; integral inputs round to themselves.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26347
26348    #[simd_test(enable = "avx512fp16")]
26349    unsafe fn test_mm_cvtsh_i32() {
26350        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26351        let r = _mm_cvtsh_i32(a);
26352        assert_eq!(r, 1);
26353    }
26354
26355    #[simd_test(enable = "avx512fp16")]
26356    unsafe fn test_mm_cvt_roundsh_i32() {
26357        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26358        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26359        assert_eq!(r, 1);
26360    }
26361
    // f16 -> u32 widens, so only the low four f16 lanes feed a 128-bit result.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtph_epu32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    // Merging-masked: even lanes converted, odd lanes taken from `src`.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }

    // Zero-masked: unselected lanes become 0.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtph_epu32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }

    // 256-bit epu32 result from a full 128-bit f16 source (8 lanes).
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtph_epu32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }

    // 512-bit epu32 result from a full 256-bit f16 source (16 lanes).
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtph_epu32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    // Explicit rounding-mode variants; integral inputs round to themselves.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26484
26485    #[simd_test(enable = "avx512fp16")]
26486    unsafe fn test_mm_cvtsh_u32() {
26487        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26488        let r = _mm_cvtsh_u32(a);
26489        assert_eq!(r, 1);
26490    }
26491
26492    #[simd_test(enable = "avx512fp16")]
26493    unsafe fn test_mm_cvt_roundsh_u32() {
26494        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26495        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26496        assert_eq!(r, 1);
26497    }
26498
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epi32() {
        // Truncating f16 -> i32: only the low 4 f16 lanes are converted,
        // so the upper 4 are zero-padded in the input.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvttph_epi32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }
26506
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epi32() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }
26515
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epi32() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvttph_epi32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }
26523
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epi32() {
        // Truncating widen: 8 f16 lanes (__m128h) -> 8 i32 lanes (__m256i).
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvttph_epi32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }
26531
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epi32() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }
26540
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epi32() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }
26548
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epi32() {
        // Truncating widen: 16 f16 lanes (__m256h) -> 16 i32 lanes (__m512i).
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvttph_epi32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }
26558
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epi32() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }
26571
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epi32() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26581
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epi32() {
        // SAE-only variant of the truncating convert (truncation needs no
        // rounding mode, only exception suppression).
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }
26591
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epi32() {
        // Merge-masking SAE variant: cleared mask bits keep the `src` lane.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }
26604
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epi32() {
        // Zero-masking SAE variant: cleared mask bits zero the lane.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26614
26615    #[simd_test(enable = "avx512fp16")]
26616    unsafe fn test_mm_cvttsh_i32() {
26617        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26618        let r = _mm_cvttsh_i32(a);
26619        assert_eq!(r, 1);
26620    }
26621
26622    #[simd_test(enable = "avx512fp16")]
26623    unsafe fn test_mm_cvtt_roundsh_i32() {
26624        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26625        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
26626        assert_eq!(r, 1);
26627    }
26628
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epu32() {
        // Truncating f16 -> u32 on the low 4 lanes; upper input lanes unused.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvttph_epu32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }
26636
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epu32() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }
26645
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epu32() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvttph_epu32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }
26653
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epu32() {
        // Truncating widen: 8 f16 lanes -> 8 u32 lanes.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvttph_epu32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }
26661
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epu32() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }
26670
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epu32() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }
26678
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epu32() {
        // Truncating widen: 16 f16 lanes -> 16 u32 lanes.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvttph_epu32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }
26688
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epu32() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }
26701
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epu32() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26711
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epu32() {
        // SAE-only truncating convert (NO_EXC suppresses exceptions;
        // truncation itself has no rounding-mode choice).
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }
26721
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epu32() {
        // Merge-masking SAE variant: cleared mask bits keep the `src` lane.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }
26734
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epu32() {
        // Zero-masking SAE variant: cleared mask bits zero the lane.
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }
26744
26745    #[simd_test(enable = "avx512fp16")]
26746    unsafe fn test_mm_cvttsh_u32() {
26747        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26748        let r = _mm_cvttsh_u32(a);
26749        assert_eq!(r, 1);
26750    }
26751
26752    #[simd_test(enable = "avx512fp16")]
26753    unsafe fn test_mm_cvtt_roundsh_u32() {
26754        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26755        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
26756        assert_eq!(r, 1);
26757    }
26758
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epi64() {
        // f16 -> i64: only the low 2 f16 lanes are converted into the two
        // 64-bit lanes; the upper 6 input lanes are unused.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
26766
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epi64() {
        // Merge-masking: mask 0b01 converts lane 0 only; lane 1 comes from src.
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
26775
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epi64() {
        // Zero-masking: lane 1 (mask bit clear) is zeroed.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
26783
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epi64() {
        // f16 -> i64 widen: low 4 f16 lanes -> 4 i64 lanes.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
26791
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epi64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
26800
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epi64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
26808
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epi64() {
        // f16 -> i64 widen: 8 f16 lanes (__m128h) -> 8 i64 lanes (__m512i).
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26816
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epi64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26825
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epi64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26833
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epi64() {
        // Explicit rounding control; TO_NEAREST|NO_EXC matches the default
        // result for exactly-representable inputs.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26841
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epi64() {
        // Merge-masking round-variant: cleared mask bits keep the `src` lane.
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26852
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epi64() {
        // Zero-masking round-variant: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26862
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epu64() {
        // f16 -> u64: only the low 2 f16 lanes feed the two 64-bit lanes.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
26870
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epu64() {
        // Merge-masking: mask 0b01 converts lane 0 only; lane 1 comes from src.
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
26879
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epu64() {
        // Zero-masking: lane 1 (mask bit clear) is zeroed.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
26887
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epu64() {
        // f16 -> u64 widen: low 4 f16 lanes -> 4 u64 lanes.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
26895
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epu64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
26904
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epu64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
26912
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epu64() {
        // f16 -> u64 widen: 8 f16 lanes -> 8 u64 lanes.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26920
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epu64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26929
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epu64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26937
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epu64() {
        // Explicit rounding control; TO_NEAREST|NO_EXC matches the default result.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
26945
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epu64() {
        // Merge-masking round-variant: cleared mask bits keep the `src` lane.
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
26956
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epu64() {
        // Zero-masking round-variant: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
26966
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epi64() {
        // Truncating f16 -> i64 on the low 2 lanes; upper input lanes unused.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
26974
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epi64() {
        // Merge-masking: mask 0b01 converts lane 0 only; lane 1 comes from src.
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
26983
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epi64() {
        // Zero-masking: lane 1 (mask bit clear) is zeroed.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
26991
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epi64() {
        // Truncating widen: low 4 f16 lanes -> 4 i64 lanes.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
26999
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epi64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
27008
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epi64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
27016
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epi64() {
        // Truncating widen: 8 f16 lanes -> 8 i64 lanes.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27024
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epi64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27033
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epi64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27041
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epi64() {
        // SAE-only truncating convert (NO_EXC; truncation has no rounding choice).
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27049
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epi64() {
        // Merge-masking SAE variant: cleared mask bits keep the `src` lane.
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27058
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epi64() {
        // Zero-masking SAE variant: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27066
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epu64() {
        // Truncating f16 -> u64 on the low 2 lanes; upper input lanes unused.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }
27074
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epu64() {
        // Merge-masking: mask 0b01 converts lane 0 only; lane 1 comes from src.
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }
27083
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epu64() {
        // Zero-masking: lane 1 (mask bit clear) is zeroed.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }
27091
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epu64() {
        // Truncating widen: low 4 f16 lanes -> 4 u64 lanes.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }
27099
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epu64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }
27108
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epu64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }
27116
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epu64() {
        // Truncating widen: 8 f16 lanes -> 8 u64 lanes.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27124
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epu64() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27133
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epu64() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27141
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epu64() {
        // SAE-only truncating convert (NO_EXC; truncation has no rounding choice).
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }
27149
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epu64() {
        // Merge-masking SAE variant: cleared mask bits keep the `src` lane.
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }
27158
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epu64() {
        // Zero-masking SAE variant: cleared mask bits zero the lane.
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }
27166
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtxph_ps() {
        // Lossless widening f16 -> f32 on the low 4 lanes.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtxph_ps(a);
        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        assert_eq_m128(r, e);
    }
27174
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtxph_ps() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
        assert_eq_m128(r, e);
    }
27183
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtxph_ps() {
        // Zero-masking: cleared mask bits zero the lane.
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtxph_ps(0b0101, a);
        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
        assert_eq_m128(r, e);
    }
27191
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtxph_ps() {
        // Lossless widening: 8 f16 lanes (__m128h) -> 8 f32 lanes (__m256).
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtxph_ps(a);
        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m256(r, e);
    }
27199
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtxph_ps() {
        // Merge-masking: cleared mask bits keep the `src` lane (mask LSB = lane 0).
        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m256(r, e);
    }
27208
27209    #[simd_test(enable = "avx512fp16,avx512vl")]
27210    unsafe fn test_mm256_maskz_cvtxph_ps() {
27211        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27212        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
27213        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27214        assert_eq_m256(r, e);
27215    }
27216
27217    #[simd_test(enable = "avx512fp16")]
27218    unsafe fn test_mm512_cvtxph_ps() {
27219        let a = _mm256_set_ph(
27220            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27221        );
27222        let r = _mm512_cvtxph_ps(a);
27223        let e = _mm512_set_ps(
27224            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27225        );
27226        assert_eq_m512(r, e);
27227    }
27228
27229    #[simd_test(enable = "avx512fp16")]
27230    unsafe fn test_mm512_mask_cvtxph_ps() {
27231        let src = _mm512_set_ps(
27232            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
27233            24.0, 25.0,
27234        );
27235        let a = _mm256_set_ph(
27236            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27237        );
27238        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
27239        let e = _mm512_set_ps(
27240            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
27241            16.0,
27242        );
27243        assert_eq_m512(r, e);
27244    }
27245
27246    #[simd_test(enable = "avx512fp16")]
27247    unsafe fn test_mm512_maskz_cvtxph_ps() {
27248        let a = _mm256_set_ph(
27249            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27250        );
27251        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
27252        let e = _mm512_set_ps(
27253            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
27254        );
27255        assert_eq_m512(r, e);
27256    }
27257
27258    #[simd_test(enable = "avx512fp16")]
27259    unsafe fn test_mm512_cvtx_roundph_ps() {
27260        let a = _mm256_set_ph(
27261            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27262        );
27263        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
27264        let e = _mm512_set_ps(
27265            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27266        );
27267        assert_eq_m512(r, e);
27268    }
27269
27270    #[simd_test(enable = "avx512fp16")]
27271    unsafe fn test_mm512_mask_cvtx_roundph_ps() {
27272        let src = _mm512_set_ps(
27273            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
27274            24.0, 25.0,
27275        );
27276        let a = _mm256_set_ph(
27277            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27278        );
27279        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
27280        let e = _mm512_set_ps(
27281            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
27282            16.0,
27283        );
27284        assert_eq_m512(r, e);
27285    }
27286
27287    #[simd_test(enable = "avx512fp16")]
27288    unsafe fn test_mm512_maskz_cvtx_roundph_ps() {
27289        let a = _mm256_set_ph(
27290            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27291        );
27292        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
27293        let e = _mm512_set_ps(
27294            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
27295        );
27296        assert_eq_m512(r, e);
27297    }
27298
27299    #[simd_test(enable = "avx512fp16")]
27300    unsafe fn test_mm_cvtsh_ss() {
27301        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27302        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27303        let r = _mm_cvtsh_ss(a, b);
27304        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27305        assert_eq_m128(r, e);
27306    }
27307
27308    #[simd_test(enable = "avx512fp16")]
27309    unsafe fn test_mm_mask_cvtsh_ss() {
27310        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27311        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27312        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27313        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
27314        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27315        assert_eq_m128(r, e);
27316        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
27317        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27318        assert_eq_m128(r, e);
27319    }
27320
27321    #[simd_test(enable = "avx512fp16")]
27322    unsafe fn test_mm_maskz_cvtsh_ss() {
27323        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27324        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27325        let r = _mm_maskz_cvtsh_ss(0, a, b);
27326        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27327        assert_eq_m128(r, e);
27328        let r = _mm_maskz_cvtsh_ss(1, a, b);
27329        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27330        assert_eq_m128(r, e);
27331    }
27332
27333    #[simd_test(enable = "avx512fp16")]
27334    unsafe fn test_mm_cvt_roundsh_ss() {
27335        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27336        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27337        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
27338        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27339        assert_eq_m128(r, e);
27340    }
27341
27342    #[simd_test(enable = "avx512fp16")]
27343    unsafe fn test_mm_mask_cvt_roundsh_ss() {
27344        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27345        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27346        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27347        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27348        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27349        assert_eq_m128(r, e);
27350        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27351        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27352        assert_eq_m128(r, e);
27353    }
27354
27355    #[simd_test(enable = "avx512fp16")]
27356    unsafe fn test_mm_maskz_cvt_roundsh_ss() {
27357        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27358        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27359        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
27360        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27361        assert_eq_m128(r, e);
27362        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
27363        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27364        assert_eq_m128(r, e);
27365    }
27366
27367    #[simd_test(enable = "avx512fp16,avx512vl")]
27368    unsafe fn test_mm_cvtph_pd() {
27369        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27370        let r = _mm_cvtph_pd(a);
27371        let e = _mm_set_pd(1.0, 2.0);
27372        assert_eq_m128d(r, e);
27373    }
27374
27375    #[simd_test(enable = "avx512fp16,avx512vl")]
27376    unsafe fn test_mm_mask_cvtph_pd() {
27377        let src = _mm_set_pd(10.0, 11.0);
27378        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27379        let r = _mm_mask_cvtph_pd(src, 0b01, a);
27380        let e = _mm_set_pd(10.0, 2.0);
27381        assert_eq_m128d(r, e);
27382    }
27383
27384    #[simd_test(enable = "avx512fp16,avx512vl")]
27385    unsafe fn test_mm_maskz_cvtph_pd() {
27386        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27387        let r = _mm_maskz_cvtph_pd(0b01, a);
27388        let e = _mm_set_pd(0.0, 2.0);
27389        assert_eq_m128d(r, e);
27390    }
27391
27392    #[simd_test(enable = "avx512fp16,avx512vl")]
27393    unsafe fn test_mm256_cvtph_pd() {
27394        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27395        let r = _mm256_cvtph_pd(a);
27396        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
27397        assert_eq_m256d(r, e);
27398    }
27399
27400    #[simd_test(enable = "avx512fp16,avx512vl")]
27401    unsafe fn test_mm256_mask_cvtph_pd() {
27402        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
27403        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27404        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
27405        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
27406        assert_eq_m256d(r, e);
27407    }
27408
27409    #[simd_test(enable = "avx512fp16,avx512vl")]
27410    unsafe fn test_mm256_maskz_cvtph_pd() {
27411        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27412        let r = _mm256_maskz_cvtph_pd(0b0101, a);
27413        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
27414        assert_eq_m256d(r, e);
27415    }
27416
27417    #[simd_test(enable = "avx512fp16")]
27418    unsafe fn test_mm512_cvtph_pd() {
27419        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27420        let r = _mm512_cvtph_pd(a);
27421        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27422        assert_eq_m512d(r, e);
27423    }
27424
27425    #[simd_test(enable = "avx512fp16")]
27426    unsafe fn test_mm512_mask_cvtph_pd() {
27427        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27428        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27429        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
27430        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27431        assert_eq_m512d(r, e);
27432    }
27433
27434    #[simd_test(enable = "avx512fp16")]
27435    unsafe fn test_mm512_maskz_cvtph_pd() {
27436        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27437        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
27438        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27439        assert_eq_m512d(r, e);
27440    }
27441
27442    #[simd_test(enable = "avx512fp16")]
27443    unsafe fn test_mm512_cvt_roundph_pd() {
27444        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27445        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
27446        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27447        assert_eq_m512d(r, e);
27448    }
27449
27450    #[simd_test(enable = "avx512fp16")]
27451    unsafe fn test_mm512_mask_cvt_roundph_pd() {
27452        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27453        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27454        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
27455        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27456        assert_eq_m512d(r, e);
27457    }
27458
27459    #[simd_test(enable = "avx512fp16")]
27460    unsafe fn test_mm512_maskz_cvt_roundph_pd() {
27461        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27462        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
27463        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27464        assert_eq_m512d(r, e);
27465    }
27466
27467    #[simd_test(enable = "avx512fp16")]
27468    unsafe fn test_mm_cvtsh_sd() {
27469        let a = _mm_setr_pd(2.0, 20.0);
27470        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27471        let r = _mm_cvtsh_sd(a, b);
27472        let e = _mm_setr_pd(1.0, 20.0);
27473        assert_eq_m128d(r, e);
27474    }
27475
27476    #[simd_test(enable = "avx512fp16")]
27477    unsafe fn test_mm_mask_cvtsh_sd() {
27478        let src = _mm_setr_pd(3.0, 11.0);
27479        let a = _mm_setr_pd(2.0, 20.0);
27480        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27481        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
27482        let e = _mm_setr_pd(3.0, 20.0);
27483        assert_eq_m128d(r, e);
27484        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
27485        let e = _mm_setr_pd(1.0, 20.0);
27486        assert_eq_m128d(r, e);
27487    }
27488
27489    #[simd_test(enable = "avx512fp16")]
27490    unsafe fn test_mm_maskz_cvtsh_sd() {
27491        let a = _mm_setr_pd(2.0, 20.0);
27492        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27493        let r = _mm_maskz_cvtsh_sd(0, a, b);
27494        let e = _mm_setr_pd(0.0, 20.0);
27495        assert_eq_m128d(r, e);
27496        let r = _mm_maskz_cvtsh_sd(1, a, b);
27497        let e = _mm_setr_pd(1.0, 20.0);
27498        assert_eq_m128d(r, e);
27499    }
27500
27501    #[simd_test(enable = "avx512fp16")]
27502    unsafe fn test_mm_cvt_roundsh_sd() {
27503        let a = _mm_setr_pd(2.0, 20.0);
27504        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27505        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
27506        let e = _mm_setr_pd(1.0, 20.0);
27507        assert_eq_m128d(r, e);
27508    }
27509
27510    #[simd_test(enable = "avx512fp16")]
27511    unsafe fn test_mm_mask_cvt_roundsh_sd() {
27512        let src = _mm_setr_pd(3.0, 11.0);
27513        let a = _mm_setr_pd(2.0, 20.0);
27514        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27515        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27516        let e = _mm_setr_pd(3.0, 20.0);
27517        assert_eq_m128d(r, e);
27518        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27519        let e = _mm_setr_pd(1.0, 20.0);
27520        assert_eq_m128d(r, e);
27521    }
27522
27523    #[simd_test(enable = "avx512fp16")]
27524    unsafe fn test_mm_maskz_cvt_roundsh_sd() {
27525        let a = _mm_setr_pd(2.0, 20.0);
27526        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27527        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
27528        let e = _mm_setr_pd(0.0, 20.0);
27529        assert_eq_m128d(r, e);
27530        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
27531        let e = _mm_setr_pd(1.0, 20.0);
27532        assert_eq_m128d(r, e);
27533    }
27534
27535    #[simd_test(enable = "avx512fp16")]
27536    const unsafe fn test_mm_cvtsh_h() {
27537        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
27538        let r = _mm_cvtsh_h(a);
27539        assert_eq!(r, 1.0);
27540    }
27541
27542    #[simd_test(enable = "avx512fp16")]
27543    const unsafe fn test_mm256_cvtsh_h() {
27544        let a = _mm256_setr_ph(
27545            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27546        );
27547        let r = _mm256_cvtsh_h(a);
27548        assert_eq!(r, 1.0);
27549    }
27550
27551    #[simd_test(enable = "avx512fp16")]
27552    const unsafe fn test_mm512_cvtsh_h() {
27553        let a = _mm512_setr_ph(
27554            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27555            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
27556            31.0, 32.0,
27557        );
27558        let r = _mm512_cvtsh_h(a);
27559        assert_eq!(r, 1.0);
27560    }
27561
27562    #[simd_test(enable = "avx512fp16")]
27563    const unsafe fn test_mm_cvtsi128_si16() {
27564        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
27565        let r = _mm_cvtsi128_si16(a);
27566        assert_eq!(r, 1);
27567    }
27568
27569    #[simd_test(enable = "avx512fp16")]
27570    const unsafe fn test_mm_cvtsi16_si128() {
27571        let a = 1;
27572        let r = _mm_cvtsi16_si128(a);
27573        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
27574        assert_eq_m128i(r, e);
27575    }
27576}