1from peachpy import *
2from peachpy.x86_64 import *
3
4
def fp16_alt_xmm_to_fp32_xmm(xmm_half):
	"""Emit the instruction sequence widening packed 16-bit halves to 32-bit floats.

	The low four 16-bit lanes of ``xmm_half`` are expanded into four 32-bit
	lanes. Two candidate results are computed per lane, one for inputs with a
	non-zero exponent field and one for inputs with a zero exponent field
	(zero / subnormal), and the right one is selected with a blend before the
	sign bit is merged back in.

	The maximum exponent value is not special-cased anywhere in this sequence.
	NOTE(review): the ``alt`` in the name presumably refers to the "alternative"
	half-precision format without Inf/NaN encodings -- confirm against callers.

	:param xmm_half: XMM register whose low four 16-bit lanes hold the inputs.
	:return: a newly allocated XMM register holding the four converted values.
	"""
	xmm_zeros = XMMRegister()
	VPXOR(xmm_zeros, xmm_zeros, xmm_zeros)

	# Interleave with zeros so every 32-bit lane holds (half << 16).
	xmm_half_hi16 = XMMRegister()
	VPUNPCKLWD(xmm_half_hi16, xmm_zeros, xmm_half)

	# x + x is a 1-bit left shift: pushes the sign bit out of each 16-bit lane.
	xmm_half_x2 = XMMRegister()
	VPADDW(xmm_half_x2, xmm_half, xmm_half)

	# Same trick on the 32-bit lanes: sign bit dropped, exponent now at the top.
	xmm_magnitude_shl1 = XMMRegister()
	VPADDD(xmm_magnitude_shl1, xmm_half_hi16, xmm_half_hi16)

	# -0.0 has only the top bit set, so it serves as the sign-bit mask.
	sign_bit = Constant.float32x4(-0.0)

	xmm_sign_bit = XMMRegister()
	VANDPS(xmm_sign_bit, xmm_half_hi16, sign_bit)

	# Left 1 then right 4 is a net right shift by 3 with the sign cleared:
	# the 5-bit exponent field now starts at bit 23, the mantissa right below.
	xmm_magnitude_shr3 = XMMRegister()
	VPSRLD(xmm_magnitude_shr3, xmm_magnitude_shl1, 4)

	# 0x38000000 == 112 << 23: added to the exponent field to rebias it.
	exponent_rebias = Constant.uint32x4(0x38000000)

	xmm_normal = XMMRegister()
	VPADDD(xmm_normal, xmm_magnitude_shr3, exponent_rebias)

	# Zero-exponent path: build the bit pattern 0x3E80_0000 | (mantissa << 1),
	# i.e. 0.25 plus the mantissa in the lowest bits of the float ...
	magic_hi16 = Constant.uint16x8(0x3E80)
	xmm_subnormal = XMMRegister()
	VPUNPCKLWD(xmm_subnormal, xmm_half_x2, magic_hi16)

	# ... then subtract 0.25 so only the mantissa's contribution remains,
	# already normalized by the floating-point subtraction.
	magic_value = Constant.float32x4(0.25)
	VSUBPS(xmm_subnormal, xmm_subnormal, magic_value)

	# The compare below needs its first source in a register, so load the
	# threshold (lowest value with a non-zero exponent field) explicitly.
	xmm_min_normal = XMMRegister()
	min_normal_bits = Constant.uint32x4(0x00800000)
	VMOVDQA(xmm_min_normal, min_normal_bits)

	# All-ones in lanes whose exponent field is zero.
	xmm_is_subnormal = XMMRegister()
	VPCMPGTD(xmm_is_subnormal, xmm_min_normal, xmm_magnitude_shr3)

	# Take the zero-exponent result where the mask is set, the rebiased one elsewhere.
	xmm_magnitude = XMMRegister()
	VBLENDVPS(xmm_magnitude, xmm_normal, xmm_subnormal, xmm_is_subnormal)

	# Reattach the sign bit saved earlier.
	xmm_result = XMMRegister()
	VORPS(xmm_result, xmm_magnitude, xmm_sign_bit)

	return xmm_result
51