summaryrefslogtreecommitdiff
path: root/src/get_flt.c
blob: 58b7c43cbea4eb6eeadf7f26b7e01910931f7824 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/* mpfr_get_flt -- convert a mpfr_t to a machine single precision float

Copyright 2009-2021 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#include <float.h>     /* for FLT_MIN */

#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"

#include "ieee_floats.h"

/* Single-precision special values, each obtained by casting the
   corresponding double-precision macro to float.  The double-precision
   macros (DBL_NEG_ZERO, MPFR_DBL_INFM, MPFR_DBL_INFP) are not defined in
   this file; presumably they come from the MPFR headers included above. */

/* Returned by mpfr_get_flt when a tiny negative value rounds to zero. */
#define FLT_NEG_ZERO ((float) DBL_NEG_ZERO)
/* Returned by mpfr_get_flt when a huge negative value overflows. */
#define MPFR_FLT_INFM ((float) MPFR_DBL_INFM)
/* Returned by mpfr_get_flt when a huge positive value overflows. */
#define MPFR_FLT_INFP ((float) MPFR_DBL_INFP)

/* Convert SRC to a machine single-precision float, rounded in the direction
   given by RND_MODE.

   Singular inputs (NaN, infinities, zeros) are delegated to mpfr_get_d and
   the resulting double is cast to float.  For regular inputs, three ranges
   of the exponent e of SRC are distinguished:
     e < -148          : |SRC| is below the smallest positive subnormal,
                         the result is +-0 or +-2^(-149);
     e > 128           : |SRC| exceeds every finite float, the result is
                         +-FLT_MAX or an infinity;
     -148 <= e <= 128  : the significand is rounded to the number of bits
                         available and the value is rebuilt via a double.

   FIXME (kept from the original code): all of this assumes the IEEE-754
   binary32 format with subnormal support.  A possible fix is to convert to
   double, then to float, handling double-rounding issues for binary radix
   (where correct rounding is wanted) somewhere in the code. */
float
mpfr_get_flt (mpfr_srcptr src, mpfr_rnd_t rnd_mode)
{
  mp_limb_t tp[MPFR_LIMBS_PER_FLT];
  mpfr_exp_t e;
  double dd;
  int negative;
  int nbits;
  int carry;

  /* NaN, +Inf, -Inf, +0 and -0 convert exactly from double to float.
     For NaN, the sign bit is not propagated. */
  if (MPFR_UNLIKELY (MPFR_IS_SINGULAR (src)))
    return (float) mpfr_get_d (src, rnd_mode);

  e = MPFR_GET_EXP (src);
  negative = MPFR_IS_NEG (src);

  /* Rounding away from zero is a directed rounding whose direction depends
     only on the sign: downward for negative values, upward otherwise. */
  if (MPFR_UNLIKELY (rnd_mode == MPFR_RNDA))
    {
      if (negative)
        rnd_mode = MPFR_RNDD;
      else
        rnd_mode = MPFR_RNDU;
    }

  /* The smallest positive normal float is 2^(-126) = 0.5*2^(-125) and the
     smallest positive subnormal float is 2^(-149) = 0.5*2^(-148). */
  if (MPFR_UNLIKELY (e < -148))
    {
      /* |src| < 2^(-149): the result is either a signed zero or the
         smallest subnormal of the same sign. */
      float tiny;
      int away;  /* nonzero iff the result is +-2^(-149) instead of +-0 */

      if (negative)
        {
          /* In round-to-nearest, exactly -2^(-150) goes to zero, hence the
             strict comparison. */
          away = rnd_mode == MPFR_RNDD
            || (rnd_mode == MPFR_RNDN
                && mpfr_cmp_si_2exp (src, -1, -150) < 0);
        }
      else
        {
          /* Likewise, exactly 2^(-150) goes to zero in round-to-nearest. */
          away = rnd_mode == MPFR_RNDU
            || (rnd_mode == MPFR_RNDN
                && mpfr_cmp_si_2exp (src, 1, -150) > 0);
        }

      if (away)
        {
          /* FLT_MIN = 2^(-126) times FLT_EPSILON = 2^(-23) gives
             +-2^(-149). */
          tiny = negative ? -FLT_MIN : FLT_MIN;
          tiny *= FLT_EPSILON;
        }
      else if (negative)
        tiny = FLT_NEG_ZERO;
      else
        tiny = 0.0;

      return tiny;
    }

  /* The largest normal float is 2^128*(1-2^(-24)) = 0.111...111e128. */
  if (MPFR_UNLIKELY (e > 128))
    {
      /* |src| >= 2^128: roundings directed toward zero saturate at
         +-FLT_MAX, every other rounding gives an infinity. */
      if (rnd_mode == MPFR_RNDZ)
        return negative ? -FLT_MAX : FLT_MAX;
      if (negative)
        return rnd_mode == MPFR_RNDU ? -FLT_MAX : MPFR_FLT_INFM;
      return rnd_mode == MPFR_RNDD ? FLT_MAX : MPFR_FLT_INFP;
    }

  /* General case: -148 <= e <= 128. */

  nbits = IEEE_FLT_MANT_DIG; /* 24 */
  if (MPFR_UNLIKELY (e < -125))
    {
      /* Subnormal result: only 24 + 125 + e significand bits remain. */
      nbits += e + 125;
      MPFR_ASSERTD (1 <= nbits && nbits < 24);
    }

  /* Round the significand of src to nbits bits into tp. */
  carry = mpfr_round_raw_4 (tp, MPFR_MANT(src), MPFR_PREC(src), negative,
                            nbits, rnd_mode);

  /* The value is rebuilt in 'double'; the original code relies on the
     rebuilt value being exactly representable as a 'float'.
     NOTE(review): for e == 128 with a carry, the value handed to the final
     cast would presumably be 2^128 in magnitude, which is above FLT_MAX;
     the cast is then expected to yield an infinity -- confirm. */
  if (MPFR_UNLIKELY (carry))
    {
      /* A nonzero carry is treated as a significand of exactly 1. */
      dd = 1.0;
    }
  else
    {
#if MPFR_LIMBS_PER_FLT == 1
      dd = (double) tp[0] / MP_BASE_AS_DOUBLE;
#else
      mp_size_t np = MPFR_PREC2LIMBS (nbits);
      mp_size_t i;

      MPFR_ASSERTD (np <= MPFR_LIMBS_PER_FLT);
      /* Accumulate the limbs from least to most significant; these
         operations are exact thanks to the rounding done above. */
      dd = (double) tp[0] / MP_BASE_AS_DOUBLE;
      for (i = 1; i < np; i++)
        dd = (dd + tp[i]) / MP_BASE_AS_DOUBLE;
      /* dd is now the significand (between 1/2 and 1) of the argument
         rounded to nbits bits. */
#endif
    }

  /* Apply the exponent, then the sign, and convert to float. */
  dd = mpfr_scale2 (dd, e);
  return (float) (negative ? -dd : dd);
}