Floating point basicsThe core idea of floating-point representatio.pdf

Floating point basics
The core idea of floating-point representations (as opposed to fixed-point representations as used
by, say, ints) is that a number $x$ is written as $m \cdot b^{e}$, where $m$ is a mantissa or fractional part, $b$ is a
base, and $e$ is an exponent. On modern computers the base is almost always 2, and for most
floating-point representations the mantissa will be scaled to be between 1 and $b$. This is done by
adjusting the exponent, e.g.
$1 = 1 \cdot 2^{0}$
$2 = 1 \cdot 2^{1}$
$0.375 = 1.5 \cdot 2^{-2}$
etc.
I am writing a program that performs floating-point addition by working on the bit patterns directly,
shifting the mantissas as needed to obtain the sum of the two floating-point numbers. Logically and on
paper I can get this to compute the correct sum.
Code:
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Report whether the sign bit of a float is set.
 *
 * Returns 1 when bit 31 of the value's bit pattern is set and 0 otherwise,
 * so -0.0f also counts as negative.
 *
 * Assumes float and unsigned int are both 32 bits wide.
 */
int isNegative (float f)
{
    /* Read the bit pattern through a union rather than a pointer cast:
     * dereferencing an unsigned int * that points at a float breaks the
     * aliasing rules, whereas reading a different union member is allowed. */
    union { float value; unsigned int bits; } pun;

    pun.value = f;
    return (pun.bits & 0x80000000u) ? 1 : 0;
}
/*
 * Extract the exponent of a float with the bias of 127 removed.
 *
 * The 8-bit exponent field (bits 23..30) is read and 127 is subtracted.
 * Because the return type is unsigned char, negative exponents wrap
 * modulo 256: 1.0f yields 0, 2.0f yields 1, and 0.5f (exponent -1) yields 255.
 *
 * Assumes float and unsigned int are both 32 bits wide.
 */
unsigned char getExponent (float f)
{
    /* The union gives access to the raw bits without an aliasing violation. */
    union { float value; unsigned int bits; } pun;

    pun.value = f;
    return (unsigned char)(((pun.bits >> 23) & 0xffu) - 127u);
}
/*
 * Extract the 24-bit mantissa of a float, including the implicit leading 1.
 *
 * Returns 0 for both +0.0f and -0.0f. For every other input the low 23
 * fraction bits are returned with bit 23 (the implicit 1) set; the sign is
 * ignored.
 *
 * NOTE(review): subnormal inputs have no implicit leading 1, but this
 * function still sets bit 23 for them; callers that care must special-case
 * an exponent field of zero.
 *
 * Assumes float and unsigned int are both 32 bits wide.
 */
unsigned int getMantissa (float f)
{
    /* The union gives access to the raw bits without an aliasing violation. */
    union { float value; unsigned int bits; } pun;

    pun.value = f;

    /* Mask off the sign so that negative zero is recognised as zero too. */
    if ((pun.bits & 0x7FFFFFFFu) == 0u)
        return 0;

    /* 0x7FFFFF selects exactly the 23 fraction bits; 0x800000 is the
     * implicit leading 1 of a normalised value. */
    return (pun.bits & 0x7FFFFFu) | 0x800000u;
}
/* Field layout of a 32-bit IEEE-754 single-precision value, plus the number
 * of extra low-order working bits (guard, round, sticky) kept while adding. */
enum
{
    FRACTION_BITS = 23,
    FRACTION_MASK = 0x7fffff,
    HIDDEN_BIT    = 0x800000,
    EXPONENT_MASK = 0xff,
    EXPONENT_BIAS = 127,
    EXTRA_BITS    = 3
};

/* Return the bit pattern of f (a union read avoids an aliasing violation). */
static unsigned int floatToBits (float f)
{
    union { float value; unsigned int bits; } pun;

    pun.value = f;
    return pun.bits;
}

/* Return the float whose bit pattern is bits. */
static float bitsToFloat (unsigned int bits)
{
    union { float value; unsigned int bits; } pun;

    pun.bits = bits;
    return pun.value;
}

/*
 * Shift *valToShift right by bitsToShift positions, OR-ing every bit that
 * falls off the end into the new least significant bit (the "sticky" bit),
 * so the caller can still tell later that the value was inexact.
 * Shift counts of 32 or more are handled explicitly because shifting an
 * unsigned int by its full width is undefined.
 */
static void shiftSticky (unsigned int *valToShift, int bitsToShift)
{
    unsigned int value = *valToShift;
    unsigned int lost;

    if (bitsToShift <= 0)
        return;

    if (bitsToShift >= 32)
    {
        *valToShift = (value != 0u) ? 1u : 0u;
        return;
    }

    lost = value & ((1u << bitsToShift) - 1u);
    *valToShift = (value >> bitsToShift) | ((lost != 0u) ? 1u : 0u);
}

/*
 * Add two floats using integer operations on their bit patterns.
 *
 * Steps:
 *   1. Split both operands into sign, biased exponent and mantissa, and
 *      pick the operand with the larger exponent as "big" (the right
 *      operand wins ties).
 *   2. Print both operands' exponent and mantissa in hex (debug trace).
 *   3. Handle NaN and infinity.
 *   4. Align the smaller operand by shifting its mantissa right by the
 *      exponent difference, keeping three extra bits so nothing needed for
 *      rounding is lost.
 *   5. Add the mantissas (same signs) or subtract the smaller magnitude from
 *      the larger (different signs).
 *   6. Renormalise, round to nearest with ties to even, and repack.
 *
 * Zeros and subnormals are handled; overflow produces an infinity. The
 * exponents are compared in biased form, which orders them correctly even
 * when the unbiased exponent is negative.
 *
 * Assumes float is IEEE-754 binary32 and unsigned int is 32 bits wide.
 */
float sum (float left, float right)
{
    const unsigned int lbits = floatToBits(left);
    const unsigned int rbits = floatToBits(right);
    const int lexp = (int)((lbits >> FRACTION_BITS) & EXPONENT_MASK);
    const int rexp = (int)((rbits >> FRACTION_BITS) & EXPONENT_MASK);

    unsigned int bigBits;
    unsigned int littleBits;
    unsigned int bigMan;
    unsigned int littleMan;
    unsigned int bigSign;
    unsigned int littleSign;
    unsigned int resultMan;
    unsigned int tail;
    int bigE;
    int littleE;
    int resultE;

    if (lexp > rexp)
    {
        bigBits = lbits;
        littleBits = rbits;
        bigE = lexp;
        littleE = rexp;
    }
    else
    {
        bigBits = rbits;
        littleBits = lbits;
        bigE = rexp;
        littleE = lexp;
    }

    bigSign = bigBits >> 31;
    littleSign = littleBits >> 31;
    bigMan = bigBits & FRACTION_MASK;
    littleMan = littleBits & FRACTION_MASK;

    /* Normal numbers carry an implicit leading 1; zeros and subnormals
     * (exponent field 0) do not. */
    if (bigE != 0)
        bigMan |= HIDDEN_BIT;
    if (littleE != 0)
        littleMan |= HIDDEN_BIT;

    /* Debug trace: unbiased exponent (as an 8-bit value) and mantissa. */
    printf("little: %x %x ",
           (unsigned int)(unsigned char)(littleE - EXPONENT_BIAS), littleMan);
    printf("big: %x %x ",
           (unsigned int)(unsigned char)(bigE - EXPONENT_BIAS), bigMan);

    /* NaN and infinity. bigE >= littleE, so if either operand is special
     * then the big one is. */
    if (bigE == EXPONENT_MASK)
    {
        if ((bigMan & FRACTION_MASK) != 0u)
            return bitsToFloat(bigBits);            /* NaN propagates */
        if (littleE == EXPONENT_MASK)
        {
            if ((littleMan & FRACTION_MASK) != 0u)
                return bitsToFloat(littleBits);     /* NaN propagates */
            if (littleSign != bigSign)
                return bitsToFloat(0x7fc00000u);    /* inf + -inf is NaN */
        }
        return bitsToFloat(bigBits);                /* inf + finite is inf */
    }

    /* Subnormals use the same scale as the smallest normal exponent. */
    if (bigE == 0)
        bigE = 1;
    if (littleE == 0)
        littleE = 1;

    /* Make room for the guard, round and sticky bits, then align. */
    bigMan <<= EXTRA_BITS;
    littleMan <<= EXTRA_BITS;
    shiftSticky(&littleMan, bigE - littleE);

    /* With equal exponents the "little" operand may have the larger
     * mantissa; swap so the subtraction below never goes negative and the
     * result takes the sign of the larger magnitude. */
    if (littleMan > bigMan)
    {
        unsigned int tmp;

        tmp = bigMan;
        bigMan = littleMan;
        littleMan = tmp;

        tmp = bigSign;
        bigSign = littleSign;
        littleSign = tmp;
    }

    resultE = bigE;
    if (bigSign == littleSign)
    {
        resultMan = bigMan + littleMan;
    }
    else
    {
        resultMan = bigMan - littleMan;
        if (resultMan == 0u)
            bigSign = 0u;   /* exact cancellation gives +0 when rounding to nearest */
    }

    if (resultMan == 0u)
        return bitsToFloat(bigSign << 31);

    /* Renormalise so the leading 1 sits at the hidden-bit position. */
    if (resultMan >= ((unsigned int)HIDDEN_BIT << (EXTRA_BITS + 1)))
    {
        /* The addition carried out one position to the left. */
        shiftSticky(&resultMan, 1);
        resultE++;
    }
    else
    {
        /* Cancellation left leading zeros; stop at the subnormal range. */
        while (resultMan < ((unsigned int)HIDDEN_BIT << EXTRA_BITS) && resultE > 1)
        {
            resultMan <<= 1;
            resultE--;
        }
    }

    /* Round to nearest, ties to even, using the three extra bits. */
    tail = resultMan & ((1u << EXTRA_BITS) - 1u);
    resultMan >>= EXTRA_BITS;
    if (tail > 4u || (tail == 4u && (resultMan & 1u) != 0u))
    {
        resultMan++;
        if (resultMan == ((unsigned int)HIDDEN_BIT << 1))
        {
            /* Rounding carried into a new bit: renormalise once more. */
            resultMan >>= 1;
            resultE++;
        }
    }

    /* A result without the hidden bit is subnormal (or zero): field 0. */
    if ((resultMan & HIDDEN_BIT) == 0u)
        resultE = 0;

    /* Exponent overflow yields an infinity of the result's sign. */
    if (resultE >= EXPONENT_MASK)
        return bitsToFloat((bigSign << 31) | 0x7f800000u);

    return bitsToFloat((bigSign << 31)
                       | ((unsigned int)resultE << FRACTION_BITS)
                       | (resultMan & FRACTION_MASK));
}
/*
 * Interactive driver: repeatedly reads two floats from standard input and
 * prints their sum as computed by sum().
 *
 * The loop ends when a line starts with 'q' or 'Q', or when input ends.
 * Text that atof() cannot parse is treated as 0.
 */
int main()
{
    enum { SIZE = 256 };
    char line[SIZE];

    while (1)
    {
        float f1;
        float f2;
        float total;

        printf("Please enter the first float ( \"q\" to quit):");
        /* Stop on end-of-file or read error instead of looping forever on
         * a stale buffer. */
        if (fgets(line, SIZE, stdin) == NULL)
            break;
        /* toupper() requires a value representable as unsigned char. */
        if (toupper((unsigned char)line[0]) == 'Q')
            break;
        f1 = (float)atof(line);

        printf("Please enter the second float ( \"q\" to quit):");
        if (fgets(line, SIZE, stdin) == NULL)
            break;
        if (toupper((unsigned char)line[0]) == 'Q')
            break;
        f2 = (float)atof(line);

        /* Call sum() once so its debug trace is printed a single time,
         * before the result line. */
        total = sum(f1, f2);

        if (isNegative(f1) || isNegative(f2))
            printf ("One of thse is negative, but %g + %g == %g ", f1, f2, total);
        else
            printf("%g + %g == %g ", f1, f2, total);
    }
    return(EXIT_SUCCESS);
}

$1 = 1 \cdot 2^{0}$
$2 = 1 \cdot 2^{1}$
$0.375 = 1.5 \cdot 2^{-2}$
Solution
Floating point basics
The core idea of floating-point representations (as opposed to fixed-point representations as used
by, say, ints) is that a number $x$ is written as $m \cdot b^{e}$, where $m$ is a mantissa or fractional part, $b$ is a
base, and $e$ is an exponent. On modern computers the base is almost always 2, and for most
floating-point representations the mantissa will be scaled to be between 1 and $b$. This is done by
adjusting the exponent, e.g.
$1 = 1 \cdot 2^{0}$
$2 = 1 \cdot 2^{1}$
$0.375 = 1.5 \cdot 2^{-2}$
etc.
I am writing a program that performs floating-point addition by working on the bit patterns directly,
shifting the mantissas as needed to obtain the sum of the two floating-point numbers. Logically and on
paper I can get this to compute the correct sum.
Code:
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Report whether the sign bit of a float is set.
 *
 * Returns 1 when bit 31 of the value's bit pattern is set and 0 otherwise,
 * so -0.0f also counts as negative.
 *
 * Assumes float and unsigned int are both 32 bits wide.
 */
int isNegative (float f)
{
    /* Read the bit pattern through a union: reading a different union
     * member is allowed, whereas dereferencing an unsigned int * that
     * points at a float breaks the aliasing rules. */
    union { float value; unsigned int bits; } pun;

    pun.value = f;
    return (pun.bits & 0x80000000u) ? 1 : 0;
}
/*
 * Extract the exponent of a float with the bias of 127 removed.
 *
 * The 8-bit exponent field (bits 23..30) is read and 127 is subtracted.
 * Because the return type is unsigned char, negative exponents wrap
 * modulo 256: 1.0f yields 0, 2.0f yields 1, and 0.5f (exponent -1) yields 255.
 *
 * Assumes float and unsigned int are both 32 bits wide.
 */
unsigned char getExponent (float f)
{
    /* The union gives access to the raw bits without an aliasing violation. */
    union { float value; unsigned int bits; } pun;

    pun.value = f;
    return (unsigned char)(((pun.bits >> 23) & 0xffu) - 127u);
}
/*
 * Extract the 24-bit mantissa of a float, including the implicit leading 1.
 *
 * Returns 0 for both +0.0f and -0.0f. For every other input the low 23
 * fraction bits are returned with bit 23 (the implicit 1) set; the sign is
 * ignored.
 *
 * NOTE(review): subnormal inputs have no implicit leading 1, but this
 * function still sets bit 23 for them; callers that care must special-case
 * an exponent field of zero.
 *
 * Assumes float and unsigned int are both 32 bits wide.
 */
unsigned int getMantissa (float f)
{
    /* The union gives access to the raw bits without an aliasing violation. */
    union { float value; unsigned int bits; } pun;

    pun.value = f;

    /* Mask off the sign so that negative zero is recognised as zero too. */
    if ((pun.bits & 0x7FFFFFFFu) == 0u)
        return 0;

    /* 0x7FFFFF selects exactly the 23 fraction bits; 0x800000 is the
     * implicit leading 1 of a normalised value. */
    return (pun.bits & 0x7FFFFFu) | 0x800000u;
}
/* Field layout of a 32-bit IEEE-754 single-precision value, plus the number
 * of extra low-order working bits (guard, round, sticky) kept while adding. */
enum
{
    FRACTION_BITS = 23,
    FRACTION_MASK = 0x7fffff,
    HIDDEN_BIT    = 0x800000,
    EXPONENT_MASK = 0xff,
    EXPONENT_BIAS = 127,
    EXTRA_BITS    = 3
};

/* Return the bit pattern of f (a union read avoids an aliasing violation). */
static unsigned int floatToBits (float f)
{
    union { float value; unsigned int bits; } pun;

    pun.value = f;
    return pun.bits;
}

/* Return the float whose bit pattern is bits. */
static float bitsToFloat (unsigned int bits)
{
    union { float value; unsigned int bits; } pun;

    pun.bits = bits;
    return pun.value;
}

/*
 * Shift *valToShift right by bitsToShift positions, OR-ing every bit that
 * falls off the end into the new least significant bit (the "sticky" bit),
 * so the caller can still tell later that the value was inexact.
 * Shift counts of 32 or more are handled explicitly because shifting an
 * unsigned int by its full width is undefined.
 */
static void shiftSticky (unsigned int *valToShift, int bitsToShift)
{
    unsigned int value = *valToShift;
    unsigned int lost;

    if (bitsToShift <= 0)
        return;

    if (bitsToShift >= 32)
    {
        *valToShift = (value != 0u) ? 1u : 0u;
        return;
    }

    lost = value & ((1u << bitsToShift) - 1u);
    *valToShift = (value >> bitsToShift) | ((lost != 0u) ? 1u : 0u);
}

/*
 * Add two floats using integer operations on their bit patterns.
 *
 * Steps:
 *   1. Split both operands into sign, biased exponent and mantissa, and
 *      pick the operand with the larger exponent as "big" (the right
 *      operand wins ties).
 *   2. Print both operands' exponent and mantissa in hex (debug trace).
 *   3. Handle NaN and infinity.
 *   4. Align the smaller operand by shifting its mantissa right by the
 *      exponent difference, keeping three extra bits so nothing needed for
 *      rounding is lost.
 *   5. Add the mantissas (same signs) or subtract the smaller magnitude from
 *      the larger (different signs).
 *   6. Renormalise, round to nearest with ties to even, and repack.
 *
 * Zeros and subnormals are handled; overflow produces an infinity. The
 * exponents are compared in biased form, which orders them correctly even
 * when the unbiased exponent is negative.
 *
 * Assumes float is IEEE-754 binary32 and unsigned int is 32 bits wide.
 */
float sum (float left, float right)
{
    const unsigned int lbits = floatToBits(left);
    const unsigned int rbits = floatToBits(right);
    const int lexp = (int)((lbits >> FRACTION_BITS) & EXPONENT_MASK);
    const int rexp = (int)((rbits >> FRACTION_BITS) & EXPONENT_MASK);

    unsigned int bigBits;
    unsigned int littleBits;
    unsigned int bigMan;
    unsigned int littleMan;
    unsigned int bigSign;
    unsigned int littleSign;
    unsigned int resultMan;
    unsigned int tail;
    int bigE;
    int littleE;
    int resultE;

    if (lexp > rexp)
    {
        bigBits = lbits;
        littleBits = rbits;
        bigE = lexp;
        littleE = rexp;
    }
    else
    {
        bigBits = rbits;
        littleBits = lbits;
        bigE = rexp;
        littleE = lexp;
    }

    bigSign = bigBits >> 31;
    littleSign = littleBits >> 31;
    bigMan = bigBits & FRACTION_MASK;
    littleMan = littleBits & FRACTION_MASK;

    /* Normal numbers carry an implicit leading 1; zeros and subnormals
     * (exponent field 0) do not. */
    if (bigE != 0)
        bigMan |= HIDDEN_BIT;
    if (littleE != 0)
        littleMan |= HIDDEN_BIT;

    /* Debug trace: unbiased exponent (as an 8-bit value) and mantissa. */
    printf("little: %x %x ",
           (unsigned int)(unsigned char)(littleE - EXPONENT_BIAS), littleMan);
    printf("big: %x %x ",
           (unsigned int)(unsigned char)(bigE - EXPONENT_BIAS), bigMan);

    /* NaN and infinity. bigE >= littleE, so if either operand is special
     * then the big one is. */
    if (bigE == EXPONENT_MASK)
    {
        if ((bigMan & FRACTION_MASK) != 0u)
            return bitsToFloat(bigBits);            /* NaN propagates */
        if (littleE == EXPONENT_MASK)
        {
            if ((littleMan & FRACTION_MASK) != 0u)
                return bitsToFloat(littleBits);     /* NaN propagates */
            if (littleSign != bigSign)
                return bitsToFloat(0x7fc00000u);    /* inf + -inf is NaN */
        }
        return bitsToFloat(bigBits);                /* inf + finite is inf */
    }

    /* Subnormals use the same scale as the smallest normal exponent. */
    if (bigE == 0)
        bigE = 1;
    if (littleE == 0)
        littleE = 1;

    /* Make room for the guard, round and sticky bits, then align. */
    bigMan <<= EXTRA_BITS;
    littleMan <<= EXTRA_BITS;
    shiftSticky(&littleMan, bigE - littleE);

    /* With equal exponents the "little" operand may have the larger
     * mantissa; swap so the subtraction below never goes negative and the
     * result takes the sign of the larger magnitude. */
    if (littleMan > bigMan)
    {
        unsigned int tmp;

        tmp = bigMan;
        bigMan = littleMan;
        littleMan = tmp;

        tmp = bigSign;
        bigSign = littleSign;
        littleSign = tmp;
    }

    resultE = bigE;
    if (bigSign == littleSign)
    {
        resultMan = bigMan + littleMan;
    }
    else
    {
        resultMan = bigMan - littleMan;
        if (resultMan == 0u)
            bigSign = 0u;   /* exact cancellation gives +0 when rounding to nearest */
    }

    if (resultMan == 0u)
        return bitsToFloat(bigSign << 31);

    /* Renormalise so the leading 1 sits at the hidden-bit position. */
    if (resultMan >= ((unsigned int)HIDDEN_BIT << (EXTRA_BITS + 1)))
    {
        /* The addition carried out one position to the left. */
        shiftSticky(&resultMan, 1);
        resultE++;
    }
    else
    {
        /* Cancellation left leading zeros; stop at the subnormal range. */
        while (resultMan < ((unsigned int)HIDDEN_BIT << EXTRA_BITS) && resultE > 1)
        {
            resultMan <<= 1;
            resultE--;
        }
    }

    /* Round to nearest, ties to even, using the three extra bits. */
    tail = resultMan & ((1u << EXTRA_BITS) - 1u);
    resultMan >>= EXTRA_BITS;
    if (tail > 4u || (tail == 4u && (resultMan & 1u) != 0u))
    {
        resultMan++;
        if (resultMan == ((unsigned int)HIDDEN_BIT << 1))
        {
            /* Rounding carried into a new bit: renormalise once more. */
            resultMan >>= 1;
            resultE++;
        }
    }

    /* A result without the hidden bit is subnormal (or zero): field 0. */
    if ((resultMan & HIDDEN_BIT) == 0u)
        resultE = 0;

    /* Exponent overflow yields an infinity of the result's sign. */
    if (resultE >= EXPONENT_MASK)
        return bitsToFloat((bigSign << 31) | 0x7f800000u);

    return bitsToFloat((bigSign << 31)
                       | ((unsigned int)resultE << FRACTION_BITS)
                       | (resultMan & FRACTION_MASK));
}
/*
 * Interactive driver: repeatedly reads two floats from standard input and
 * prints their sum as computed by sum().
 *
 * The loop ends when a line starts with 'q' or 'Q', or when input ends.
 * Text that atof() cannot parse is treated as 0.
 */
int main()
{
    enum { SIZE = 256 };
    char line[SIZE];

    while (1)
    {
        float f1;
        float f2;
        float total;

        printf("Please enter the first float ( \"q\" to quit):");
        /* Each operand must actually be read before it is inspected; stop
         * on end-of-file or read error instead of looping forever. */
        if (fgets(line, SIZE, stdin) == NULL)
            break;
        /* toupper() requires a value representable as unsigned char. */
        if (toupper((unsigned char)line[0]) == 'Q')
            break;
        f1 = (float)atof(line);

        printf("Please enter the second float ( \"q\" to quit):");
        if (fgets(line, SIZE, stdin) == NULL)
            break;
        if (toupper((unsigned char)line[0]) == 'Q')
            break;
        f2 = (float)atof(line);

        /* Call sum() once so its debug trace is printed a single time,
         * before the result line. */
        total = sum(f1, f2);

        if (isNegative(f1) || isNegative(f2))
            printf ("One of thse is negative, but %g + %g == %g ", f1, f2, total);
        else
            printf("%g + %g == %g ", f1, f2, total);
    }
    return(EXIT_SUCCESS);
}
$1 = 1 \cdot 2^{0}$
$2 = 1 \cdot 2^{1}$
$0.375 = 1.5 \cdot 2^{-2}$

Floating point basicsThe core idea of floating-point representatio.pdf

Recommended

Recommended

More Related Content

Similar to Floating point basicsThe core idea of floating-point representatio.pdf

Similar to Floating point basicsThe core idea of floating-point representatio.pdf (20)

More from info235816

More from info235816 (20)

Recently uploaded

Recently uploaded (20)

Floating point basicsThe core idea of floating-point representatio.pdf