ctarbi.de - ieee754 · aka floats and doubles · qnans and snans

Table Of Contents

Basics

basics function from basics.nw:

/*
 * basics - print a tour of notable IEEE 754 double values.
 *
 * In order: the zeros, the constant labelled "Planck eV" (once as a
 * literal and once assembled field by field), +1 and -1, the
 * infinities, NaNs obtained from arithmetic, the first/last patterns of
 * the sNaN and qNaN ranges, the powers of two from 2^-7 up to 2^7, and
 * finally a handful of epsilon-related figures.
 *
 * Takes no arguments and returns nothing; everything is emitted through
 * the show_* helpers and printf.
 */
void
basics(void)
{
    /* 2^-7 .. 2^7, each built with an all-zero fraction. */
    static const struct {
        const char *label;
        int exponent;
    } pow2[] = {
        { "1 / 2^7", -7 },
        { "1 / 2^6", -6 },
        { "1 / 2^5", -5 },
        { "1 / 2^4", -4 },
        { "1 / 2^3", -3 },
        { "1 / 2^2", -2 },
        { "1 / 2^1", -1 },
        { "1 * 2^0", 0 },
        { "1 * 2^1", 1 },
        { "1 * 2^2", 2 },
        { "1 * 2^3", 3 },
        { "1 * 2^4", 4 },
        { "1 * 2^5", 5 },
        { "1 * 2^6", 6 },
        { "1 * 2^7", 7 },
    };
    unsigned int i;

    double unity = 1;
    double neg_unity = -1;
    double nought = 0.0;
    double pos_inf = unity / nought;
    double nan_sqrt = sqrt(-1);
    double nan_inf_mod = fmod(INFINITY, unity);
    double nan_mod_zero = fmod(unity, nought);
    double h_ev = 4.135667696e-15;
    double pos_zero = create_double(0, 0, 0, 0);
    double neg_zero = create_double(1, 0, 0, 0);

    /* Zero is special-cased: an all-zero biased exponent stands for 0. */
    show_double("0", nought);
    show_double("0", pos_zero);
    show_double("-0", neg_zero);

    /*
     * 2^-48 (~3.55e-15) <= 4.1357e-15 < 2^-47 (~7.11e-15), so the
     * unbiased exponent of this constant is -48.
     */
    show_double("Planck eV", h_ev);
    show_created_double("Planck eV", 0, DBLEXP(-48), 0x2a019, 0xa830a613);
    show_long_bits("Planck eV", h_ev);
    show_bytes("Planck eV", h_ev);

    /* +1 and -1: once as a plain value, once assembled from fields. */
    show_double("1", unity);
    show_created_double("1", 0, DBLEXP(0), 0, 0);

    show_double("-1", neg_unity);
    show_created_double("-1", 1, DBLEXP(0), 0, 0);

    /*
     * DBLEXP(1024) is the biased exponent 2047 = 0x7ff, i.e. all eleven
     * exponent bits set; with a zero fraction that encodes infinity.
     */
    show_created_double("-inf", 1, DBLEXP(1024), 0, 0);
    show_created_double("inf", 0, DBLEXP(1024), 0, 0);

    show_double("1 / 0", pos_inf);

    /* NaNs as they come out of ordinary arithmetic. */
    show_double("sqrt(-1)", nan_sqrt);
    show_double("inf % 1", nan_inf_mod);
    show_double("1 % 0", nan_mod_zero);

    /* Signaling NaNs: top fraction bit clear, fraction non-zero (any sign). */
    show_created_double("first +sNaN", 0, DBLEXP(1024), 0, 1);
    show_created_double("last +sNaN", 0, DBLEXP(1024), 0x7ffff, 0xffffffff);

    /* Quiet NaNs: top fraction bit set (any sign). */
    show_created_double("first +qNaN", 0, DBLEXP(1024), 0x80000, 0);
    show_created_double("last +qNaN", 0, DBLEXP(1024), 0xfffff, 0xffffffff);

    for (i = 0; i < sizeof pow2 / sizeof pow2[0]; i++) {
        show_created_double(pow2[i].label, 0, DBLEXP(pow2[i].exponent), 0, 0);
    }

    /* Epsilon figures; the first column is the expected value as text. */
    {
        double two_m53 = create_double(0, DBLEXP(-53), 0, 0);

        printf("%15.15s -> %e (formal machine epsilon, lapack)\n",
            "2^-53", two_m53);
    }
    {
        double two_m52 = create_double(0, DBLEXP(-52), 0, 0);

        printf("%15.15s -> %e (machine epsilon, iso c std, mathematica, matlab)\n",
            "2^-52", two_m52);
    }
    {
        double eps_at_one = machine_epsilon(1.0);

        printf("%15.15s -> %e (machine_epsilon(1.0))\n",
            "2^-52", eps_at_one);
    }
    {
        double eps_at_minus_one = machine_epsilon(-1.0);

        printf("%15.15s -> %e (machine_epsilon(-1.0))\n",
            "2^-52", eps_at_minus_one);
    }
    {
        double eps_at_2_51 = machine_epsilon(create_double(0, DBLEXP(51), 0, 0));

        printf("%15.15s -> %.2f (machine_epsilon(2^51))\n",
            "0.5", eps_at_2_51);
    }
    {
        double eps_at_2_52 = machine_epsilon(create_double(0, DBLEXP(52), 0, 0));

        printf("%15.15s -> %.2f (machine_epsilon(2^52))\n",
            "1.0", eps_at_2_52);
    }
    {
        double eps_at_2_53 = machine_epsilon(create_double(0, DBLEXP(53), 0, 0));

        printf("%15.15s -> %.2f (machine_epsilon(2^53), greater than unit, will skip)\n",
            "2.0", eps_at_2_53);
    }
    {
        double two_53 = create_double(0, DBLEXP(53), 0, 0);

        printf("%15.15s -> %.1f (maximum integer before skipping occurs)\n",
            "2^53", two_53);
    }
    {
        /* Same exponent with only the lowest fraction bit set. */
        double after_two_53 = create_double(0, DBLEXP(53), 0, 1);

        printf("%15.15s -> %.1f (smallest number greater than 2^53)\n",
            "...", after_two_53);
    }
}

basics output:

           0 ->        (special) 0.0 (+1.00000_00000000 * 2 ^ 0)
           0 ->        (special) 0.0 (+1.00000_00000000 * 2 ^ 0)
          -0 ->        (special) 0.0 (-1.00000_00000000 * 2 ^ 0)
   Planck eV ->    4.13566769600e-15 (+1.2a019_a830a613 * 2 ^ unbias(3cf))
   Planck eV ->    4.13566769600e-15 (+1.2a019_a830a613 * 2 ^ unbias(3cf))
   Planck eV ->   0x3cf2a019a830a613 (+1.2a019_a830a613 * 2 ^ unbias(3cf))
      Planck eV ->   13a630a819a0f23c (octets/bytes)
           1 ->                 1.00 (+1.00000_00000000 * 2 ^ unbias(3ff))
           1 ->                 1.00 (+1.00000_00000000 * 2 ^ unbias(3ff))
          -1 ->                -1.00 (-1.00000_00000000 * 2 ^ unbias(3ff))
          -1 ->                -1.00 (-1.00000_00000000 * 2 ^ unbias(3ff))
        -inf ->                 -inf (-1.00000_00000000 * 2 ^ unbias(7ff))
         inf ->                  inf (+1.00000_00000000 * 2 ^ unbias(7ff))
       1 / 0 ->                  inf (+1.00000_00000000 * 2 ^ unbias(7ff))
    sqrt(-1) ->                 -nan (-1.80000_00000000 * 2 ^ unbias(7ff))
     inf % 1 ->                 -nan (-1.80000_00000000 * 2 ^ unbias(7ff))
       1 % 0 ->                 -nan (-1.80000_00000000 * 2 ^ unbias(7ff))
 first +sNaN ->                  nan (+1.00000_00000001 * 2 ^ unbias(7ff))
  last +sNaN ->                  nan (+1.7ffff_ffffffff * 2 ^ unbias(7ff))
 first +qNaN ->                  nan (+1.80000_00000000 * 2 ^ unbias(7ff))
  last +qNaN ->                  nan (+1.fffff_ffffffff * 2 ^ unbias(7ff))
     1 / 2^7 ->            0.0078125 (+1.00000_00000000 * 2 ^ unbias(3f8))
     1 / 2^6 ->            0.0156250 (+1.00000_00000000 * 2 ^ unbias(3f9))
     1 / 2^5 ->            0.0312500 (+1.00000_00000000 * 2 ^ unbias(3fa))
     1 / 2^4 ->            0.0625000 (+1.00000_00000000 * 2 ^ unbias(3fb))
     1 / 2^3 ->            0.1250000 (+1.00000_00000000 * 2 ^ unbias(3fc))
     1 / 2^2 ->            0.2500000 (+1.00000_00000000 * 2 ^ unbias(3fd))
     1 / 2^1 ->            0.5000000 (+1.00000_00000000 * 2 ^ unbias(3fe))
     1 * 2^0 ->                 1.00 (+1.00000_00000000 * 2 ^ unbias(3ff))
     1 * 2^1 ->                 2.00 (+1.00000_00000000 * 2 ^ unbias(400))
     1 * 2^2 ->                 4.00 (+1.00000_00000000 * 2 ^ unbias(401))
     1 * 2^3 ->                 8.00 (+1.00000_00000000 * 2 ^ unbias(402))
     1 * 2^4 ->                16.00 (+1.00000_00000000 * 2 ^ unbias(403))
     1 * 2^5 ->                32.00 (+1.00000_00000000 * 2 ^ unbias(404))
     1 * 2^6 ->                64.00 (+1.00000_00000000 * 2 ^ unbias(405))
     1 * 2^7 ->               128.00 (+1.00000_00000000 * 2 ^ unbias(406))
          2^-53 -> 1.110223e-16 (formal machine epsilon, lapack)
          2^-52 -> 2.220446e-16 (machine epsilon, iso c std, mathematica, matlab)
          2^-52 -> 2.220446e-16 (machine_epsilon(1.0))
          2^-52 -> 2.220446e-16 (machine_epsilon(-1.0))
            0.5 -> 0.50 (machine_epsilon(2^51))
            1.0 -> 1.00 (machine_epsilon(2^52))
            2.0 -> 2.00 (machine_epsilon(2^53), greater than unit, will skip)
           2^53 -> 9007199254740992.0 (maximum integer before skipping occurs)
            ... -> 9007199254740994.0 (smallest number greater than 2^53)

Bit Patterns

bits function from bits.nw:

/*
 * bits - print the bit patterns of landmark doubles.
 *
 * Each table row holds the arguments handed to create_double (sign,
 * biased exponent, upper fraction part, lower fraction part) together
 * with the label passed to show_long_bits.  The rows run from +0
 * through the subnormals, normals, infinity and the NaN ranges,
 * followed by a few negative counterparts.
 */
void
bits(void)
{
    const struct {
        const char *label;
        int sign;
        int biased_exp;
        unsigned int frac_hi;
        unsigned int frac_lo;
    } rows[] = {
        { "zero",                0, 0,            0,       0 },
        { "first subnormal",     0, 0,            0,       1 },
        { "last subnormal",      0, 0,            0xfffff, 0xffffffff },
        /* a raw biased exponent of 1, not DBLEXP(1) */
        { "first normal",        0, 1,            0,       0 },
        { "2^(-53)",             0, DBLEXP(-53),  0,       0 },
        { "one",                 0, DBLEXP(0),    0,       0 },
        { "2^(+53)",             0, DBLEXP(+53),  0,       0 },
        { "last normal",         0, DBLEXP(1023), 0xfffff, 0xffffffff },
        { "infinite",            0, DBLEXP(1024), 0,       0 },
        { "first snan",          0, DBLEXP(1024), 0,       1 },
        { "last snan",           0, DBLEXP(1024), 0x7ffff, 0xffffffff },
        { "first qnan",          0, DBLEXP(1024), 0x80000, 0 },
        { "last qnan",           0, DBLEXP(1024), 0xfffff, 0xffffffff },
        { "(-) first subnormal", 1, 0,            0,       1 },
        { "(-) last subnormal",  1, 0,            0xfffff, 0xffffffff },
        { "(-) one",             1, DBLEXP(0),    0,       0 },
        { "(-) infinite",        1, DBLEXP(1024), 0,       0 },
        { "(-) last qnan",       1, DBLEXP(1024), 0xfffff, 0xffffffff },
    };
    unsigned int row;

    for (row = 0; row < sizeof rows / sizeof rows[0]; row++) {
        show_long_bits(rows[row].label,
            create_double(rows[row].sign, rows[row].biased_exp,
                rows[row].frac_hi, rows[row].frac_lo));
    }
}

bits output:

            zero  ->  0x0000000000000000  +1.00000_00000000  (special)           0.000000e+00
 first subnormal  ->  0x0000000000000001  +1.00000_00000001  (special)           4.940656e-324
  last subnormal  ->  0x000fffffffffffff  +1.fffff_ffffffff  (special)           2.225074e-308
    first normal  ->  0x0010000000000000  +1.00000_00000000  * 2 ^ unbias(  1)   2.225074e-308
         2^(-53)  ->  0x3ca0000000000000  +1.00000_00000000  * 2 ^ unbias(3ca)   1.110223e-16
             one  ->  0x3ff0000000000000  +1.00000_00000000  * 2 ^ unbias(3ff)   1.000000e+00
         2^(+53)  ->  0x4340000000000000  +1.00000_00000000  * 2 ^ unbias(434)   9.007199e+15
     last normal  ->  0x7fefffffffffffff  +1.fffff_ffffffff  * 2 ^ unbias(7fe)   1.797693e+308
        infinite  ->  0x7ff0000000000000  +1.00000_00000000  * 2 ^ unbias(7ff)   inf
      first snan  ->  0x7ff0000000000001  +1.00000_00000001  * 2 ^ unbias(7ff)   nan
       last snan  ->  0x7ff7ffffffffffff  +1.7ffff_ffffffff  * 2 ^ unbias(7ff)   nan
      first qnan  ->  0x7ff8000000000000  +1.80000_00000000  * 2 ^ unbias(7ff)   nan
       last qnan  ->  0x7fffffffffffffff  +1.fffff_ffffffff  * 2 ^ unbias(7ff)   nan
(-) first subnor  ->  0x8000000000000001  -1.00000_00000001  (special)           -4.940656e-324
(-) last subnorm  ->  0x800fffffffffffff  -1.fffff_ffffffff  (special)           -2.225074e-308
         (-) one  ->  0xbff0000000000000  -1.00000_00000000  * 2 ^ unbias(3ff)   -1.000000e+00
    (-) infinite  ->  0xfff0000000000000  -1.00000_00000000  * 2 ^ unbias(7ff)   -inf
   (-) last qnan  ->  0xffffffffffffffff  -1.fffff_ffffffff  * 2 ^ unbias(7ff)   -nan

Other sources

Notes

References

More details in the link below.

This page was last modified on March 18, 2024 at 14:56:59 UTC.