/// <summary>
/// Expands a 16-bit half-precision bit pattern (1 sign bit, 5 exponent bits,
/// 10 significand bits) into the single-precision value with the same meaning.
/// </summary>
/// <param name="h">Raw half-precision bits.</param>
/// <returns>
/// The float whose bit pattern is assembled from <paramref name="h"/>. Signed zeros,
/// subnormals, infinities and NaNs each have a dedicated path below.
/// </returns>
private static float float16_to_float32(ushort h)
{
    UInt32 signBits = ((UInt32)h & 0x8000u) << 16;   // half sign bit moved to float bit 31
    UInt16 expField = (UInt16)(h & 0x7c00u);         // 5-bit exponent, still in place
    UInt16 fraction = (UInt16)(h & 0x03ffu);         // 10-bit significand

    UInt32 bits;
    if (expField == 0x7c00)
    {
        // Inf or NaN: all-ones float exponent plus the significand, shifted into the
        // top of the 23-bit float significand.
        bits = signBits + 0x7f800000u + (((UInt32)fraction) << 13);
    }
    else if (expField != 0x0000)
    {
        // Normalized: 0x1c000 is 112 << 10, i.e. the bias difference (127 - 15) added
        // directly to the exponent field; one shift then positions exponent and
        // significand together.
        bits = signBits + (((UInt32)(h & 0x7fffu) + 0x1c000u) << 13);
    }
    else if (fraction == 0)
    {
        // Signed zero: nothing but the sign bit.
        bits = signBits;
    }
    else
    {
        // Subnormal: shift the significand left until the implicit leading one
        // (bit 0x0400) appears. The first shift is covered by the starting exponent
        // 127 - 15; every additional shift lowers the exponent by one.
        int extraShifts = 0;
        fraction <<= 1;
        while ((fraction & 0x0400u) == 0)
        {
            fraction <<= 1;
            extraShifts++;
        }

        UInt32 expBits = ((UInt32)(127 - 15 - extraShifts)) << 23;
        UInt32 fracBits = ((UInt32)(fraction & 0x03ffu)) << 13;   // drop the now-explicit leading one
        bits = signBits + expBits + fracBits;
    }

    // fp32 reinterprets the assembled integer bits as a float.
    fp32 result = bits;
    return result.f;
}
/// <summary>
/// Converts a single-precision value to a 16-bit half-precision bit pattern
/// (1 sign bit, 5 exponent bits, 10 significand bits).
/// </summary>
/// <param name="value">The float to convert.</param>
/// <returns>
/// The half-precision bits. The sign is always carried over. Infinity yields 0x7C00,
/// any NaN yields 0x7FFF, and finite magnitudes beyond the half range are clamped
/// to infinity (0x7C00).
/// </returns>
private static ushort make_float16(float value)
{
    fp32 f32infty = 255U << 23;   // float bit pattern with an all-ones exponent
    fp32 f16infty = 31U << 23;    // half infinity (0x7C00) as it looks before the final >> 13
    fp32 magic = 15U << 23;       // as a float this is 2^(15 - 127); multiplying rebiases the exponent

    const uint sign_mask = 0x80000000U;
    const uint round_mask = ~0xFFFU;
    const uint round_bias = 0x1000U;   // half of the lowest bit kept after >> 13

    fp32 bits = 0;
    bits.f = value;

    // Work on the magnitude only; the sign is re-attached at the end.
    uint sign = bits.u & sign_mask;
    bits.u ^= sign;

    ushort half;
    if (bits.u >= f32infty.u)
    {
        // Exponent is all ones: a non-zero significand means NaN, otherwise infinity.
        half = (bits.u > f32infty.u) ? (ushort)0x7FFFU : (ushort)0x7C00U;
    }
    else
    {
        // Clear the 12 lowest significand bits, then scale so the float exponent
        // field lines up with the half exponent bias.
        bits.u &= round_mask;
        bits.f *= magic.f;

        // Add the rounding bias explicitly. The previous form subtracted round_mask
        // (0xFFFFF000) and relied on unsigned wraparound, which throws
        // OverflowException when the code is compiled in a checked context. The
        // scaled magnitude is at most 0x477FF000, so this addition cannot overflow.
        bits.u += round_bias;

        // Anything that reached or passed the half infinity pattern saturates to it.
        if (bits.u > f16infty.u)
        {
            bits.u = f16infty.u;
        }

        half = (ushort)(bits.u >> 13);
    }

    half |= (ushort)(sign >> 16);
    return half;
}