| from peachpy import * |
| from peachpy.x86_64 import * |
| |
| |
def fp16_alt_xmm_to_fp32_xmm(xmm_half):
    """Emit AVX instructions that widen packed 16-bit halves to 32-bit floats.

    Only the low four 16-bit lanes of ``xmm_half`` contribute to the result
    (both unpack instructions below consume the low 64 bits of their
    sources).  Each half is treated as 1 sign bit, 5 exponent bits and
    10 mantissa bits.  Lanes with a zero exponent field go through a
    magic-bias subtraction; every other exponent value, including all-ones,
    is simply rebiased -- there is no Inf/NaN special case.
    NOTE(review): presumably "alt" refers to the ARM alternative
    half-precision format, which has no Inf/NaN -- confirm.

    Args:
        xmm_half: XMM register holding the 16-bit inputs.  It is only read.

    Returns:
        A newly allocated XMM register holding the four converted values.
    """
    # NOTE(review): presumably must run while a PeachPy function is being
    # generated, since registers are allocated and instructions emitted here.

    # All-zero register: supplies the low 16 bits of every widened lane.
    zeros = XMMRegister()
    VPXOR(zeros, zeros, zeros)

    # Interleave so each 32-bit lane carries one half in its upper 16 bits.
    widened = XMMRegister()
    VPUNPCKLWD(widened, zeros, xmm_half)

    # x + x == x << 1 within each 16-bit lane: pushes the sign bit out.
    half_times_two = XMMRegister()
    VPADDW(half_times_two, xmm_half, xmm_half)

    # Same doubling on the 32-bit lanes, again discarding the sign bit.
    widened_times_two = XMMRegister()
    VPADDD(widened_times_two, widened, widened)

    # -0.0 has only the top bit set, so ANDing with it isolates the sign.
    sign_bit = Constant.float32x4(-0.0)

    sign_only = XMMRegister()
    VANDPS(sign_only, widened, sign_bit)

    # Net effect of <<1 then >>4: sign cleared, exponent/mantissa moved down
    # three bits so the mantissa's top bit lands on bit 22.
    magnitude = XMMRegister()
    VPSRLD(magnitude, widened_times_two, 4)

    # (127 - 15) << 23: converts the 5-bit exponent bias to the 8-bit one.
    rebias = Constant.uint32x4(0x38000000)

    normal_result = XMMRegister()
    VPADDD(normal_result, magnitude, rebias)

    # Zero-exponent path: put 0x3E80 (upper half of the bits of 0.25f) above
    # the doubled half, then subtract 0.25 so only the scaled mantissa is
    # left.  Lanes with a non-zero exponent compute a value that the blend
    # below discards.
    bias_high_word = Constant.uint16x8(0x3E80)
    denormal_result = XMMRegister()
    VPUNPCKLWD(denormal_result, half_times_two, bias_high_word)

    quarter = Constant.float32x4(0.25)
    VSUBPS(denormal_result, denormal_result, quarter)

    # 1 << 23: any magnitude below this has an all-zero exponent field.
    exponent_floor = XMMRegister()
    exponent_floor_bits = Constant.uint32x4(0x00800000)
    VMOVDQA(exponent_floor, exponent_floor_bits)

    # Signed compare is safe: the >>4 above leaves the top bits clear.
    zero_exponent_mask = XMMRegister()
    VPCMPGTD(zero_exponent_mask, exponent_floor, magnitude)

    # Take the magic-bias result where the mask is set, the rebiased one
    # everywhere else.
    unsigned_result = XMMRegister()
    VBLENDVPS(unsigned_result, normal_result, denormal_result, zero_exponent_mask)

    # Put the original sign back on top.
    converted = XMMRegister()
    VORPS(converted, unsigned_result, sign_only)

    return converted