|
1 | 1 | //! Decodes a floating-point value into individual parts and error ranges. |
2 | 2 |
|
3 | | -use crate::num::FpCategory; |
4 | | -use crate::num::dec2flt::float::RawFloat; |
| 3 | +use crate::mem::size_of; |
5 | 4 |
|
6 | | -/// Decoded unsigned finite value, such that: |
| 5 | +/// Generic representation of finite floating-point values up to 64-bit wide. |
| 6 | +/// The absolute value equals `mant * 2^exp`. All real values `x` such that: |
7 | 7 | /// |
8 | | -/// - The original value equals to `mant * 2^exp`. |
| 8 | +/// lower < x < upper |
| 9 | +/// |
| 10 | +/// (or `lower < x ≤ upper` in the tie-to-even case) round to this value under |
| 11 | +/// IEEE 754 round-to-nearest rules, where: |
| 12 | +/// |
| 13 | +/// lower = (mant − minus) * 2^exp |
| 14 | +/// upper = (mant + plus) * 2^exp |
9 | 15 | /// |
10 | | -/// - Any number from `(mant - minus) * 2^exp` to `(mant + plus) * 2^exp` will |
11 | | -/// round to the original value. The range is inclusive only when |
12 | | -/// `inclusive` is `true`. |
13 | 16 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] |
14 | | -pub struct Decoded { |
15 | | - /// The scaled mantissa. |
| 17 | +pub struct Decoded64 { |
| 18 | + /// Scaled mantissa. The scaling is chosen such that the rounding boundaries |
| 19 | + /// are integral when expressed as `mant ± {minus, plus}`. |
16 | 20 | pub mant: u64, |
17 | | - /// The lower error range. |
| 21 | + |
| 22 | + /// Distance from `mant` to the lower rounding-boundary. |
18 | 23 | pub minus: u64, |
19 | | - /// The upper error range. |
| 24 | + /// Distance from `mant` to the upper rounding-boundary. |
20 | 25 | pub plus: u64, |
21 | | - /// The shared exponent in base 2. |
22 | | - pub exp: i16, |
23 | | - /// True when the error range is inclusive. |
24 | | - /// |
25 | | - /// In IEEE 754, this is true when the original mantissa was even. |
26 | | - pub inclusive: bool, |
27 | | -} |
28 | 26 |
|
29 | | -/// Decoded unsigned value. |
30 | | -#[derive(Copy, Clone, Debug, PartialEq, Eq)] |
31 | | -pub enum FullDecoded { |
32 | | - /// Not-a-number. |
33 | | - Nan, |
34 | | - /// Infinities, either positive or negative. |
35 | | - Infinite, |
36 | | - /// Zero, either positive or negative. |
37 | | - Zero, |
38 | | - /// Finite numbers with further decoded fields. |
39 | | - Finite(Decoded), |
40 | | -} |
| 27 | + /// Base-2 exponent for `mant`, `minus`, and `plus`. |
| 28 | + pub exp: isize, |
41 | 29 |
|
42 | | -/// A floating point type which can be `decode`d. |
43 | | -pub trait DecodableFloat: RawFloat + Copy { |
44 | | - /// The minimum positive normalized value. |
45 | | - fn min_pos_norm_value() -> Self; |
| 30 | + /// Indicates whether an exact tie at the upper rounding-boundary (i.e., the |
| 31 | + /// midpoint between this value and its next representable neighbor) rounds |
| 32 | + /// to this value. It applies only to the upper boundary; the lower boundary |
| 33 | + /// is always exclusive. This follows IEEE 754 round-ties-to-even semantics |
| 34 | + /// and is true iff the original significand was even (the decoders below report `true` for every subnormal input). |
| 35 | + pub tie_to_even: bool, |
46 | 36 | } |
47 | 37 |
|
48 | | -#[cfg(target_has_reliable_f16)] |
49 | | -impl DecodableFloat for f16 { |
50 | | - fn min_pos_norm_value() -> Self { |
51 | | - f16::MIN_POSITIVE |
52 | | - } |
53 | | -} |
| 38 | +macro_rules! floats { |
| 39 | + ($($T:ident)*) => { |
| 40 | + $( |
54 | 41 |
|
55 | | -impl DecodableFloat for f32 { |
56 | | - fn min_pos_norm_value() -> Self { |
57 | | - f32::MIN_POSITIVE |
58 | | - } |
59 | | -} |
| 42 | + /// Decodes a floating-point value into its integer components. The |
| 43 | + /// returned tuple contains the mantissa `m` and exponent `e`, such that the |
| 44 | + /// original value equals `m * 2^e`, ignoring the sign. |
| 45 | + /// |
| 46 | + /// For normal numbers: mantissa includes the implied leading 1. |
| 47 | + /// For denormal numbers: mantissa is shifted to maintain the equation. |
| 48 | + const fn ${concat(mant_and_exp_, $T)}(v: $T) -> (u64, isize) { |
| 49 | + const ENC_BITS: usize = size_of::<$T>() * 8; |
| 50 | + // The encoding of the sign resides in the most significant bit. |
| 51 | + const SIGN_ENC_BITS: usize = 1; |
| 52 | + // The encoding of the mantissa resides in the least-significant |
| 53 | + // bits. |
| 54 | + const MANT_ENC_BITS: usize = $T::MANTISSA_DIGITS as usize - 1; |
| 55 | + // The encoding of the exponent resides in the remaining bits, |
| 56 | + // in between the sign and the mantissa. |
| 57 | + const EXP_ENC_BITS: usize = ENC_BITS - (SIGN_ENC_BITS + MANT_ENC_BITS); |
60 | 58 |
|
61 | | -impl DecodableFloat for f64 { |
62 | | - fn min_pos_norm_value() -> Self { |
63 | | - f64::MIN_POSITIVE |
64 | | - } |
65 | | -} |
| 59 | + let enc = v.to_bits(); |
| 60 | + let exp_enc = (enc << SIGN_ENC_BITS) >> (SIGN_ENC_BITS + MANT_ENC_BITS); |
| 61 | + let mant_enc = enc & ((1 << MANT_ENC_BITS) - 1); |
| 62 | + |
| 63 | + const EXP_BIAS: isize = (1 << (EXP_ENC_BITS - 1)) - 1; |
| 64 | + let exp = exp_enc as isize - (EXP_BIAS + MANT_ENC_BITS as isize); |
66 | 65 |
|
67 | | -/// Returns a sign (true when negative) and `FullDecoded` value |
68 | | -/// from given floating point number. |
69 | | -pub fn decode<T: DecodableFloat>(v: T) -> (/*negative?*/ bool, FullDecoded) { |
70 | | - let (mant, exp, sign) = v.integer_decode(); |
71 | | - let even = (mant & 1) == 0; |
72 | | - let decoded = match v.classify() { |
73 | | - FpCategory::Nan => FullDecoded::Nan, |
74 | | - FpCategory::Infinite => FullDecoded::Infinite, |
75 | | - FpCategory::Zero => FullDecoded::Zero, |
76 | | - FpCategory::Subnormal => { |
77 | | - // neighbors: (mant - 2, exp) -- (mant, exp) -- (mant + 2, exp) |
78 | | - // Float::integer_decode always preserves the exponent, |
79 | | - // so the mantissa is scaled for subnormals. |
80 | | - FullDecoded::Finite(Decoded { mant, minus: 1, plus: 1, exp, inclusive: even }) |
| 66 | + let mant = if exp_enc != 0 { |
| 67 | + // Normal numbers have an implied leading 1 to the mantissa |
| 68 | + // bits. |
| 69 | + mant_enc | 1 << MANT_ENC_BITS |
| 70 | + } else { |
| 71 | + // Denormals share the exponent of the smallest normals (encoded as 1, not 0); doubling the mantissa compensates for that +1. |
| 72 | + mant_enc << 1 |
| 73 | + }; |
| 74 | + |
| 75 | + const _: () = assert!(ENC_BITS <= 64); |
| 76 | + (mant as u64, exp) |
81 | 77 | } |
82 | | - FpCategory::Normal => { |
83 | | - let minnorm = <T as DecodableFloat>::min_pos_norm_value().integer_decode(); |
84 | | - if mant == minnorm.0 { |
85 | | - // neighbors: (maxmant, exp - 1) -- (minnormmant, exp) -- (minnormmant + 1, exp) |
86 | | - // where maxmant = minnormmant * 2 - 1 |
87 | | - FullDecoded::Finite(Decoded { |
88 | | - mant: mant << 2, |
89 | | - minus: 1, |
90 | | - plus: 2, |
91 | | - exp: exp - 2, |
92 | | - inclusive: even, |
93 | | - }) |
| 78 | + |
| 79 | + /// Decodes a finite, non-zero floating-point value into a [`Decoded64`]. |
| 80 | + pub fn ${concat(decode_, $T)}(v: $T) -> Decoded64 { |
| 81 | + let (mant, exp) = ${concat(mant_and_exp_, $T)}(v); |
| 82 | + |
| 83 | + if v.is_subnormal() { |
| 84 | + // Subnormal mantissas are pre-shifted left by one bit, so neighboring floats sit at `mant ± 2`. |
| 85 | + // The rounding boundaries are therefore symmetric at `mant ± 1` in this scaled integer space. |
| 86 | + return Decoded64 { mant: mant, minus: 1, plus: 1, exp: exp, tie_to_even: true }; |
| 87 | + } |
| 88 | + debug_assert!(v.is_normal()); |
| 89 | + |
| 90 | + let is_even = (mant & 1) == 0; |
| 91 | + |
| 92 | + const MIN_POS_MANT: u64 = ${concat(mant_and_exp_, $T)}($T::MIN_POSITIVE).0; |
| 93 | + const _: () = assert!(MIN_POS_MANT != 0); |
| 94 | + if mant == MIN_POS_MANT { |
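| 95 | + // Only the implied leading bit is set, so `v` is a power of two (this matches every such normal float, not just `MIN_POSITIVE`). |
| 96 | + // The float below a power of two is generally half as far away as the float above, so the lower boundary is closer than the upper one. |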
| 97 | + Decoded64 { mant: mant << 2, minus: 1, plus: 2, exp: exp - 2, tie_to_even: is_even } |
94 | 98 | } else { |
95 | | - // neighbors: (mant - 1, exp) -- (mant, exp) -- (mant + 1, exp) |
96 | | - FullDecoded::Finite(Decoded { |
97 | | - mant: mant << 1, |
98 | | - minus: 1, |
99 | | - plus: 1, |
100 | | - exp: exp - 1, |
101 | | - inclusive: even, |
102 | | - }) |
| 99 | + Decoded64 { mant: mant << 1, minus: 1, plus: 1, exp: exp - 1, tie_to_even: is_even } |
103 | 100 | } |
104 | 101 | } |
| 102 | + |
| 103 | + )* |
105 | 104 | }; |
106 | | - (sign < 0, decoded) |
107 | 105 | } |
| 106 | + |
| 107 | +floats! { f16 f32 f64 } |
0 commit comments