示例#1
0
        internal static value128 compute_product_approximation(int bitPrecision, long q, ulong w)
        {
            int index = 2 * (int)(q - Constants.smallest_power_of_five);
            // For small values of q, e.g., q in [0,27], the answer is always exact because
            // The line value128 firstproduct = full_multiplication(w, power_of_five_128[index]);
            // gives the exact answer.
            value128 firstproduct = FullMultiplication(w, Constants.power_of_five_128[index]);
            //static_assert((bit_precision >= 0) && (bit_precision <= 64), " precision should  be in (0,64]");
            ulong precision_mask = (bitPrecision < 64) ? ((ulong)(0xFFFFFFFFFFFFFFFF) >> bitPrecision) : (ulong)(0xFFFFFFFFFFFFFFFF);

            if ((firstproduct.high & precision_mask) == precision_mask)
            { // could further guard with  (lower + w < lower)
              // regarding the second product, we only need secondproduct.high, but our expectation is that the compiler will optimize this extra work away if needed.
                value128 secondproduct = FullMultiplication(w, Constants.power_of_five_128[index + 1]);
                firstproduct.low += secondproduct.high;
                if (secondproduct.high > firstproduct.low)
                {
                    firstproduct.high++;
                }
            }
            return(firstproduct);
        }
示例#2
0
        /// <summary>
        ///
        /// </summary>
        /// <param name="q"></param>
        /// <param name="w"></param>
        ///
        /// <returns></returns>

        internal static AdjustedMantissa ComputeFloat(long q, ulong w)
        {
            var answer = new AdjustedMantissa();

            if ((w == 0) || (q < FloatBinaryConstants.smallest_power_of_ten))
            {
                answer.power2   = 0;
                answer.mantissa = 0;
                // result should be zero
                return(answer);
            }
            if (q > FloatBinaryConstants.largest_power_of_ten)
            {
                // we want to get infinity:
                answer.power2   = FloatBinaryConstants.infinite_power;
                answer.mantissa = 0;
                return(answer);
            }
            // At this point in time q is in [smallest_power_of_five, largest_power_of_five].

            // We want the most significant bit of i to be 1. Shift if needed.
            int lz = BitOperations.LeadingZeroCount(w);

            w <<= lz;

            // The required precision is mantissa_explicit_bits() + 3 because
            // 1. We need the implicit bit
            // 2. We need an extra bit for rounding purposes
            // 3. We might lose a bit due to the "upperbit" routine (result too small, requiring a shift)

            value128 product = Utils.compute_product_approximation(FloatBinaryConstants.mantissa_explicit_bits + 3, q, w);

            if (product.low == 0xFFFFFFFFFFFFFFFF)
            { //  could guard it further
              // In some very rare cases, this could happen, in which case we might need a more accurate
              // computation that what we can provide cheaply. This is very, very unlikely.
              //
                bool inside_safe_exponent = (q >= -27) && (q <= 55); // always good because 5**q <2**128 when q>=0,
                                                                     // and otherwise, for q<0, we have 5**-q<2**64 and the 128-bit reciprocal allows for exact computation.
                if (!inside_safe_exponent)
                {
                    answer.power2 = -1; // This (a negative value) indicates an error condition.
                    return(answer);
                }
            }
            // The "compute_product_approximation" function can be slightly slower than a branchless approach:
            // value128 product = compute_product(q, w);
            // but in practice, we can win big with the compute_product_approximation if its additional branch
            // is easily predicted. Which is best is data specific.
            int upperbit = (int)(product.high >> 63);

            answer.mantissa = product.high >> (upperbit + 64 - FloatBinaryConstants.mantissa_explicit_bits - 3);

            answer.power2 = (int)(Utils.power((int)(q)) + upperbit - lz - FloatBinaryConstants.minimum_exponent);
            if (answer.power2 <= 0)
            { // we have a subnormal?
              // Here have that answer.power2 <= 0 so -answer.power2 >= 0
                if (-answer.power2 + 1 >= 64)
                { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
                    answer.power2   = 0;
                    answer.mantissa = 0;
                    // result should be zero
                    return(answer);
                }
                // next line is safe because -answer.power2 + 1 < 64
                answer.mantissa >>= -answer.power2 + 1;
                // Thankfully, we can't have both "round-to-even" and subnormals because
                // "round-to-even" only occurs for powers close to 0.
                answer.mantissa  += (answer.mantissa & 1); // round up
                answer.mantissa >>= 1;
                // There is a weird scenario where we don't have a subnormal but just.
                // Suppose we start with 2.2250738585072013e-308, we end up
                // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal
                // whereas 0x40000000000000 x 2^-1023-53  is normal. Now, we need to round
                // up 0x3fffffffffffff x 2^-1023-53  and once we do, we are no longer
                // subnormal, but we can only know this after rounding.
                // So we only declare a subnormal if we are smaller than the threshold.
                answer.power2 = (answer.mantissa < ((ulong)(1) << FloatBinaryConstants.mantissa_explicit_bits)) ? 0 : 1;
                return(answer);
            }

            // usually, we round *up*, but if we fall right in between and and we have an
            // even basis, we need to round down
            // We are only concerned with the cases where 5**q fits in single 64-bit word.
            if ((product.low <= 1) && (q >= FloatBinaryConstants.min_exponent_round_to_even) && (q <= FloatBinaryConstants.max_exponent_round_to_even) &&
                ((answer.mantissa & 3) == 1))
            { // we may fall between two floats!
              // To be in-between two floats we need that in doing
              //   answer.mantissa = product.high >> (upperbit + 64 - mantissa_explicit_bits() - 3);
              // ... we dropped out only zeroes. But if this happened, then we can go back!!!
                if ((answer.mantissa << (upperbit + 64 - FloatBinaryConstants.mantissa_explicit_bits - 3)) == product.high)
                {
                    answer.mantissa &= ~(ulong)(1); // flip it so that we do not round up
                }
            }

            answer.mantissa  += (answer.mantissa & 1); // round up
            answer.mantissa >>= 1;
            if (answer.mantissa >= ((ulong)(2) << FloatBinaryConstants.mantissa_explicit_bits))
            {
                answer.mantissa = ((ulong)(1) << FloatBinaryConstants.mantissa_explicit_bits);
                answer.power2++; // undo previous addition
            }

            answer.mantissa &= ~((ulong)(1) << FloatBinaryConstants.mantissa_explicit_bits);
            if (answer.power2 >= FloatBinaryConstants.infinite_power)
            { // infinity
                answer.power2   = FloatBinaryConstants.infinite_power;
                answer.mantissa = 0;
            }
            return(answer);
        }