SlideShare a Scribd company logo
Floating point basics
The core idea of floating-point representations (as opposed to fixed point representations as used
by, say, ints), is that a number x is written as m*be where m is a mantissa or fractional part, b is a
base, and e is an exponent. On modern computers the base is almost always 2, and for most
floating-point representations the mantissa will be scaled to be between 1 and b. This is done by
adjusting the exponent, e.g.
1 = 1*20
2 = 1*21
0.375 = 1.5*2-2
etc.
Iam writing a program that does some floating addition that uses bit patterns with shifts applied
to the mantissa and such to obtain the sum of the two floating point numbers. Logically and on
paper I can get this to compute the correct sum.
Code:
#include
#include
#include
#include
int isNegative (float f)
{
unsigned int* iptr = (unsigned int*)&f;
return ( ((*iptr) & 0x80000000) ? 1:0);
}
unsigned char getExponent (float f)
{
unsigned int* iptr = (unsigned int*)&f;
return (((*iptr >> 23) & 0xff) - 127);
}
unsigned int getMantissa (float f)
{
unsigned int* iptr = (unsigned int*)&f;
if( *iptr == 0 ) return 0;
return ((*iptr & 0xFFFFFF) | 0x800000 );
}
float sum (float left, float right)
{
unsigned int littleMan;
unsigned int bigMan;
unsigned char littleE;
unsigned char bigE;
unsigned char lexp = getExponent(left);
unsigned char rexp = getExponent(right);
int Dexponent;
if (lexp > rexp)
{
bigE = lexp;
bigMan = getMantissa(left);
littleE = rexp;
littleMan = getMantissa(right);
}
else
{
bigE = rexp;
bigMan = getMantissa(right);
littleE = lexp;
littleMan = getMantissa(left);
}
printf("little: %x %x ", littleE, littleMan);
printf("big: %x %x ", bigE, bigMan);
void shift( unsigned int *valToShift, int bitsToShift )
{
// Masks is used to mask out bits to check for a "sticky" bit.
static unsigned masks[24] =
{
0, 1, 3, 7, 0xf, 0x1f, 0x3f, 0x7f,
0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff,
0xffff, 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff, 0x1fffff, 0x3fffff, 0x7fffff
};
// HOmasks - masks out the H.O. bit of the value masked by the masks entry.
static unsigned HOmasks[24] =
{
0,
1, 2, 4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000,
0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, 0x400000
};
// shiftedOut- Holds the value that will be shifted out of a mantissa
// during the denormalization operation (used to round a denormalized value).
int shiftedOut;
assert( bitsToShift <= 23 );
// Grabs the bits we're going to shift out (so we can determine
// how to round this value after the shift).
shiftedOut = *valToShift & masks[ bitsToShift ];
// Shift the value to the right the specified number of bits:
*valToShift = *valToShift >> bitsToShift;
// If necessary, round the value:
if( shiftedOut > HOmasks[ bitsToShift ] )
{
// If the bits we shifted out are greater than 1/2 the L.O. bit, then
// round the value up by one.
*valToShift = *valToShift + 1;
}
else if( shiftedOut == HOmasks[ bitsToShift ] )
{
// If the bits we shifted out are exactly 1/2 of the L.O. bit's value,
// then round the value to the nearest number whose L.O. bit is zero.
*valToShift = *valToShift + ((*valToShift & 1) == 1);
}
// else we round the value down to the previous value. The current
// value is already truncated (rounded down), so we don't have to do anything.
}
// I got two actual floating point values. I want to add them together.
// 1. "denormalize" one of the operands if their exponents aren't
// the same (when adding or subtracting values, the exponents must be the same).
//
// Algorithm: choose the value with the smaller exponent. Shift its mantissa
// to the right the number of bits specified by the difference between the two
// exponents.
if( rexp > lexp )
{
shift( &littleMan, (rexp - lexp));
Dexponent = rexp;
}
else if( rexp < lexp )
{
shift( &littleMan, (lexp - rexp));
Dexponent = lexp;
}
unsigned int result = Dexponent;
float fresult = *(float*)&result;
return(fresult);
}
int main()
{
const int SIZE = 256;
char line[SIZE];
while (1)
{
float f1;
float f2;
float left = f1;
float right = f2;
printf("Please enter the first float ( "q" to quit):");
fgets(line,SIZE,stdin);
if (toupper(line[0]) =='Q')
break;
f1 = atof(line);
printf("Please enter the second float ( "q" to quit):");
fgets(line,SIZE,stdin);
if (toupper(line[0]) == 'Q')
break;
f2 = atof(line);
if (isNegative(f1) || isNegative(f2))
printf ("One of thse is negative, but %g + %g == %g ", f1,f2,sum(f1,f2));
else
printf("%g + %g == %g ", f1,f2,sum(f1,f2));
}
return(EXIT_SUCCESS);
1 = 1*20
2 = 1*21
0.375 = 1.5*2-2
Solution
Floating point basics
The core idea of floating-point representations (as opposed to fixed point representations as used
by, say, ints), is that a number x is written as m*be where m is a mantissa or fractional part, b is a
base, and e is an exponent. On modern computers the base is almost always 2, and for most
floating-point representations the mantissa will be scaled to be between 1 and b. This is done by
adjusting the exponent, e.g.
1 = 1*20
2 = 1*21
0.375 = 1.5*2-2
etc.
Iam writing a program that does some floating addition that uses bit patterns with shifts applied
to the mantissa and such to obtain the sum of the two floating point numbers. Logically and on
paper I can get this to compute the correct sum.
Code:
#include
#include
#include
#include
int isNegative (float f)
{
unsigned int* iptr = (unsigned int*)&f;
return ( ((*iptr) & 0x80000000) ? 1:0);
}
unsigned char getExponent (float f)
{
unsigned int* iptr = (unsigned int*)&f;
return (((*iptr >> 23) & 0xff) - 127);
}
unsigned int getMantissa (float f)
{
unsigned int* iptr = (unsigned int*)&f;
if( *iptr == 0 ) return 0;
return ((*iptr & 0xFFFFFF) | 0x800000 );
}
float sum (float left, float right)
{
unsigned int littleMan;
unsigned int bigMan;
unsigned char littleE;
unsigned char bigE;
unsigned char lexp = getExponent(left);
unsigned char rexp = getExponent(right);
int Dexponent;
if (lexp > rexp)
{
bigE = lexp;
bigMan = getMantissa(left);
littleE = rexp;
littleMan = getMantissa(right);
}
else
{
bigE = rexp;
bigMan = getMantissa(right);
littleE = lexp;
littleMan = getMantissa(left);
}
printf("little: %x %x ", littleE, littleMan);
printf("big: %x %x ", bigE, bigMan);
void shift( unsigned int *valToShift, int bitsToShift )
{
// Masks is used to mask out bits to check for a "sticky" bit.
static unsigned masks[24] =
{
0, 1, 3, 7, 0xf, 0x1f, 0x3f, 0x7f,
0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff,
0xffff, 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff, 0x1fffff, 0x3fffff, 0x7fffff
};
// HOmasks - masks out the H.O. bit of the value masked by the masks entry.
static unsigned HOmasks[24] =
{
0,
1, 2, 4, 0x8, 0x10, 0x20, 0x40, 0x80,
0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000,
0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, 0x400000
};
// shiftedOut- Holds the value that will be shifted out of a mantissa
// during the denormalization operation (used to round a denormalized value).
int shiftedOut;
assert( bitsToShift <= 23 );
// Grabs the bits we're going to shift out (so we can determine
// how to round this value after the shift).
shiftedOut = *valToShift & masks[ bitsToShift ];
// Shift the value to the right the specified number of bits:
*valToShift = *valToShift >> bitsToShift;
// If necessary, round the value:
if( shiftedOut > HOmasks[ bitsToShift ] )
{
// If the bits we shifted out are greater than 1/2 the L.O. bit, then
// round the value up by one.
*valToShift = *valToShift + 1;
}
else if( shiftedOut == HOmasks[ bitsToShift ] )
{
// If the bits we shifted out are exactly 1/2 of the L.O. bit's value,
// then round the value to the nearest number whose L.O. bit is zero.
*valToShift = *valToShift + ((*valToShift & 1) == 1);
}
// else we round the value down to the previous value. The current
// value is already truncated (rounded down), so we don't have to do anything.
}
// I got two actual floating point values. I want to add them together.
// 1. "denormalize" one of the operands if their exponents aren't
// the same (when adding or subtracting values, the exponents must be the same).
//
// Algorithm: choose the value with the smaller exponent. Shift its mantissa
// to the right the number of bits specified by the difference between the two
// exponents.
if( rexp > lexp )
{
shift( &littleMan, (rexp - lexp));
Dexponent = rexp;
}
else if( rexp < lexp )
{
shift( &littleMan, (lexp - rexp));
Dexponent = lexp;
}
unsigned int result = Dexponent;
float fresult = *(float*)&result;
return(fresult);
}
int main()
{
const int SIZE = 256;
char line[SIZE];
while (1)
{
float f1;
float f2;
float left = f1;
float right = f2;
printf("Please enter the first float ( "q" to quit):");
fgets(line,SIZE,stdin);
if (toupper(line[0]) =='Q')
break;
f1 = atof(line);
printf("Please enter the second float ( "q" to quit):");
fgets(line,SIZE,stdin);
if (toupper(line[0]) == 'Q')
break;
f2 = atof(line);
if (isNegative(f1) || isNegative(f2))
printf ("One of thse is negative, but %g + %g == %g ", f1,f2,sum(f1,f2));
else
printf("%g + %g == %g ", f1,f2,sum(f1,f2));
}
return(EXIT_SUCCESS);
1 = 1*20
2 = 1*21
0.375 = 1.5*2-2

More Related Content

Similar to Floating point basicsThe core idea of floating-point representatio.pdf

Lecture 3 and 4.pptx
Lecture 3 and 4.pptxLecture 3 and 4.pptx
Lecture 3 and 4.pptx
MAHAMASADIK
 
Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0
Sheik Uduman Ali
 
Use the same variable names and write the function F - Force(x-ks-kc-l.pdf
Use the same variable names and write the function F - Force(x-ks-kc-l.pdfUse the same variable names and write the function F - Force(x-ks-kc-l.pdf
Use the same variable names and write the function F - Force(x-ks-kc-l.pdf
acteleshoppe
 
Pointer
PointerPointer
An Introduction to Part of C++ STL
An Introduction to Part of C++ STLAn Introduction to Part of C++ STL
An Introduction to Part of C++ STL
乐群 陈
 
introduction to c programming and C History.pptx
introduction to c programming and C History.pptxintroduction to c programming and C History.pptx
introduction to c programming and C History.pptx
ManojKhadilkar1
 
Rasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithmRasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithmKALAIRANJANI21
 
Rasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithmRasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithmKALAIRANJANI21
 
Maxima Finding problem In the 2-dimension space, we shall say that .pdf
Maxima Finding problem In the 2-dimension space, we shall say that .pdfMaxima Finding problem In the 2-dimension space, we shall say that .pdf
Maxima Finding problem In the 2-dimension space, we shall say that .pdf
arrowit1
 
[FT-11][suhorng] “Poor Man's” Undergraduate Compilers
[FT-11][suhorng] “Poor Man's” Undergraduate Compilers[FT-11][suhorng] “Poor Man's” Undergraduate Compilers
[FT-11][suhorng] “Poor Man's” Undergraduate Compilers
Functional Thursday
 
Building fast interpreters in Rust
Building fast interpreters in RustBuilding fast interpreters in Rust
Building fast interpreters in Rust
Ingvar Stepanyan
 
CPP Homework Help
CPP Homework HelpCPP Homework Help
CPP Homework Help
C++ Homework Help
 
Go Programming Language (Golang)
Go Programming Language (Golang)Go Programming Language (Golang)
Go Programming Language (Golang)
Ishin Vin
 
Stacks,queues,linked-list
Stacks,queues,linked-listStacks,queues,linked-list
Stacks,queues,linked-list
pinakspatel
 
Intro to Matlab programming
Intro to Matlab programmingIntro to Matlab programming
Intro to Matlab programming
Ahmed Moawad
 
The concept of stack is extremely important in computer science and .pdf
The concept of stack is extremely important in computer science and .pdfThe concept of stack is extremely important in computer science and .pdf
The concept of stack is extremely important in computer science and .pdf
arihantsherwani
 
Intel JIT Talk
Intel JIT TalkIntel JIT Talk
Intel JIT Talkiamdvander
 
The Functional Programming Triad of Folding, Scanning and Iteration - a first...
The Functional Programming Triad of Folding, Scanning and Iteration - a first...The Functional Programming Triad of Folding, Scanning and Iteration - a first...
The Functional Programming Triad of Folding, Scanning and Iteration - a first...
Philip Schwarz
 
Write a Matlab code (a computerized program) for calculating plane st.docx
 Write a Matlab code (a computerized program) for calculating plane st.docx Write a Matlab code (a computerized program) for calculating plane st.docx
Write a Matlab code (a computerized program) for calculating plane st.docx
ajoy21
 

Similar to Floating point basicsThe core idea of floating-point representatio.pdf (20)

Lecture 3 and 4.pptx
Lecture 3 and 4.pptxLecture 3 and 4.pptx
Lecture 3 and 4.pptx
 
Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0
 
Use the same variable names and write the function F - Force(x-ks-kc-l.pdf
Use the same variable names and write the function F - Force(x-ks-kc-l.pdfUse the same variable names and write the function F - Force(x-ks-kc-l.pdf
Use the same variable names and write the function F - Force(x-ks-kc-l.pdf
 
Pointer
PointerPointer
Pointer
 
An Introduction to Part of C++ STL
An Introduction to Part of C++ STLAn Introduction to Part of C++ STL
An Introduction to Part of C++ STL
 
introduction to c programming and C History.pptx
introduction to c programming and C History.pptxintroduction to c programming and C History.pptx
introduction to c programming and C History.pptx
 
Rasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithmRasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithm
 
Rasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithmRasterisation of a circle by the bresenham algorithm
Rasterisation of a circle by the bresenham algorithm
 
Maxima Finding problem In the 2-dimension space, we shall say that .pdf
Maxima Finding problem In the 2-dimension space, we shall say that .pdfMaxima Finding problem In the 2-dimension space, we shall say that .pdf
Maxima Finding problem In the 2-dimension space, we shall say that .pdf
 
[FT-11][suhorng] “Poor Man's” Undergraduate Compilers
[FT-11][suhorng] “Poor Man's” Undergraduate Compilers[FT-11][suhorng] “Poor Man's” Undergraduate Compilers
[FT-11][suhorng] “Poor Man's” Undergraduate Compilers
 
Building fast interpreters in Rust
Building fast interpreters in RustBuilding fast interpreters in Rust
Building fast interpreters in Rust
 
CPP Homework Help
CPP Homework HelpCPP Homework Help
CPP Homework Help
 
Go Programming Language (Golang)
Go Programming Language (Golang)Go Programming Language (Golang)
Go Programming Language (Golang)
 
Stacks,queues,linked-list
Stacks,queues,linked-listStacks,queues,linked-list
Stacks,queues,linked-list
 
Intro to Matlab programming
Intro to Matlab programmingIntro to Matlab programming
Intro to Matlab programming
 
The concept of stack is extremely important in computer science and .pdf
The concept of stack is extremely important in computer science and .pdfThe concept of stack is extremely important in computer science and .pdf
The concept of stack is extremely important in computer science and .pdf
 
week-23x
week-23xweek-23x
week-23x
 
Intel JIT Talk
Intel JIT TalkIntel JIT Talk
Intel JIT Talk
 
The Functional Programming Triad of Folding, Scanning and Iteration - a first...
The Functional Programming Triad of Folding, Scanning and Iteration - a first...The Functional Programming Triad of Folding, Scanning and Iteration - a first...
The Functional Programming Triad of Folding, Scanning and Iteration - a first...
 
Write a Matlab code (a computerized program) for calculating plane st.docx
 Write a Matlab code (a computerized program) for calculating plane st.docx Write a Matlab code (a computerized program) for calculating plane st.docx
Write a Matlab code (a computerized program) for calculating plane st.docx
 

More from info235816

2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf
2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf
2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf
info235816
 
10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf
10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf
10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf
info235816
 
1) Sugar because Payments to Labour has major component in Expenses .pdf
1) Sugar because Payments to Labour has major component in Expenses .pdf1) Sugar because Payments to Labour has major component in Expenses .pdf
1) Sugar because Payments to Labour has major component in Expenses .pdf
info235816
 
(3)Solution(3).pdf
(3)Solution(3).pdf(3)Solution(3).pdf
(3)Solution(3).pdf
info235816
 
Yellow fever is caused by the yellow fever virus and is spread by th.pdf
Yellow fever is caused by the yellow fever virus and is spread by th.pdfYellow fever is caused by the yellow fever virus and is spread by th.pdf
Yellow fever is caused by the yellow fever virus and is spread by th.pdf
info235816
 
Values are so important in life. The values kids learn at a young ag.pdf
Values are so important in life. The values kids learn at a young ag.pdfValues are so important in life. The values kids learn at a young ag.pdf
Values are so important in life. The values kids learn at a young ag.pdf
info235816
 
Time Value of Money (TVM) Value of the money does not remain same i.pdf
Time Value of Money (TVM) Value of the money does not remain same i.pdfTime Value of Money (TVM) Value of the money does not remain same i.pdf
Time Value of Money (TVM) Value of the money does not remain same i.pdf
info235816
 
#include stdio.h #include stdlib.h int main() { int l1.pdf
#include stdio.h #include stdlib.h int main() { int l1.pdf#include stdio.h #include stdlib.h int main() { int l1.pdf
#include stdio.h #include stdlib.h int main() { int l1.pdf
info235816
 
no it does not exists .pdf
                     no it does not exists                            .pdf                     no it does not exists                            .pdf
no it does not exists .pdf
info235816
 
The shape is angular, because of the two pairs of.pdf
                     The shape is angular, because of the two pairs of.pdf                     The shape is angular, because of the two pairs of.pdf
The shape is angular, because of the two pairs of.pdf
info235816
 
Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf
                     Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf                     Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf
Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf
info235816
 
so that the reaction takes place and forms (=0 )b.pdf
                     so that the reaction takes place and forms (=0 )b.pdf                     so that the reaction takes place and forms (=0 )b.pdf
so that the reaction takes place and forms (=0 )b.pdf
info235816
 
PV = nRT in both cases other one is taken consta.pdf
                     PV = nRT  in both cases other one is taken consta.pdf                     PV = nRT  in both cases other one is taken consta.pdf
PV = nRT in both cases other one is taken consta.pdf
info235816
 
In case of SO2 Dipole - Dipole forces are stron.pdf
                     In case of SO2  Dipole - Dipole forces are stron.pdf                     In case of SO2  Dipole - Dipole forces are stron.pdf
In case of SO2 Dipole - Dipole forces are stron.pdf
info235816
 
D) None of the above. first nitration takes place.pdf
                     D) None of the above. first nitration takes place.pdf                     D) None of the above. first nitration takes place.pdf
D) None of the above. first nitration takes place.pdf
info235816
 
Decrease in Entropy (S) always occurs when the ph.pdf
                     Decrease in Entropy (S) always occurs when the ph.pdf                     Decrease in Entropy (S) always occurs when the ph.pdf
Decrease in Entropy (S) always occurs when the ph.pdf
info235816
 
The diploid genetic system is more intersting than th monoploid gene.pdf
The diploid genetic system is more intersting than th monoploid gene.pdfThe diploid genetic system is more intersting than th monoploid gene.pdf
The diploid genetic system is more intersting than th monoploid gene.pdf
info235816
 
Solution If r t= Multiply both the sides by -1 , we have= -.pdf
Solution  If r t= Multiply both the sides by -1 , we have= -.pdfSolution  If r t= Multiply both the sides by -1 , we have= -.pdf
Solution If r t= Multiply both the sides by -1 , we have= -.pdf
info235816
 
Step-1Code the playing card and then sort the cards in a deck.Ste.pdf
Step-1Code the playing card and then sort the cards in a deck.Ste.pdfStep-1Code the playing card and then sort the cards in a deck.Ste.pdf
Step-1Code the playing card and then sort the cards in a deck.Ste.pdf
info235816
 
conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf
                     conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf                     conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf
conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf
info235816
 

More from info235816 (20)

2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf
2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf
2H2+O2=2H2O2 moles react with one mole of oxygen to give two moles.pdf
 
10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf
10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf
10 = ( 1+ 65200 ) ^n10 = 1.0115 ^ (n)n = 201.37Solu.pdf
 
1) Sugar because Payments to Labour has major component in Expenses .pdf
1) Sugar because Payments to Labour has major component in Expenses .pdf1) Sugar because Payments to Labour has major component in Expenses .pdf
1) Sugar because Payments to Labour has major component in Expenses .pdf
 
(3)Solution(3).pdf
(3)Solution(3).pdf(3)Solution(3).pdf
(3)Solution(3).pdf
 
Yellow fever is caused by the yellow fever virus and is spread by th.pdf
Yellow fever is caused by the yellow fever virus and is spread by th.pdfYellow fever is caused by the yellow fever virus and is spread by th.pdf
Yellow fever is caused by the yellow fever virus and is spread by th.pdf
 
Values are so important in life. The values kids learn at a young ag.pdf
Values are so important in life. The values kids learn at a young ag.pdfValues are so important in life. The values kids learn at a young ag.pdf
Values are so important in life. The values kids learn at a young ag.pdf
 
Time Value of Money (TVM) Value of the money does not remain same i.pdf
Time Value of Money (TVM) Value of the money does not remain same i.pdfTime Value of Money (TVM) Value of the money does not remain same i.pdf
Time Value of Money (TVM) Value of the money does not remain same i.pdf
 
#include stdio.h #include stdlib.h int main() { int l1.pdf
#include stdio.h #include stdlib.h int main() { int l1.pdf#include stdio.h #include stdlib.h int main() { int l1.pdf
#include stdio.h #include stdlib.h int main() { int l1.pdf
 
no it does not exists .pdf
                     no it does not exists                            .pdf                     no it does not exists                            .pdf
no it does not exists .pdf
 
The shape is angular, because of the two pairs of.pdf
                     The shape is angular, because of the two pairs of.pdf                     The shape is angular, because of the two pairs of.pdf
The shape is angular, because of the two pairs of.pdf
 
Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf
                     Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf                     Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf
Step1 Moles of ions from NaCl = 2x.46 = .92 Step2.pdf
 
so that the reaction takes place and forms (=0 )b.pdf
                     so that the reaction takes place and forms (=0 )b.pdf                     so that the reaction takes place and forms (=0 )b.pdf
so that the reaction takes place and forms (=0 )b.pdf
 
PV = nRT in both cases other one is taken consta.pdf
                     PV = nRT  in both cases other one is taken consta.pdf                     PV = nRT  in both cases other one is taken consta.pdf
PV = nRT in both cases other one is taken consta.pdf
 
In case of SO2 Dipole - Dipole forces are stron.pdf
                     In case of SO2  Dipole - Dipole forces are stron.pdf                     In case of SO2  Dipole - Dipole forces are stron.pdf
In case of SO2 Dipole - Dipole forces are stron.pdf
 
D) None of the above. first nitration takes place.pdf
                     D) None of the above. first nitration takes place.pdf                     D) None of the above. first nitration takes place.pdf
D) None of the above. first nitration takes place.pdf
 
Decrease in Entropy (S) always occurs when the ph.pdf
                     Decrease in Entropy (S) always occurs when the ph.pdf                     Decrease in Entropy (S) always occurs when the ph.pdf
Decrease in Entropy (S) always occurs when the ph.pdf
 
The diploid genetic system is more intersting than th monoploid gene.pdf
The diploid genetic system is more intersting than th monoploid gene.pdfThe diploid genetic system is more intersting than th monoploid gene.pdf
The diploid genetic system is more intersting than th monoploid gene.pdf
 
Solution If r t= Multiply both the sides by -1 , we have= -.pdf
Solution  If r t= Multiply both the sides by -1 , we have= -.pdfSolution  If r t= Multiply both the sides by -1 , we have= -.pdf
Solution If r t= Multiply both the sides by -1 , we have= -.pdf
 
Step-1Code the playing card and then sort the cards in a deck.Ste.pdf
Step-1Code the playing card and then sort the cards in a deck.Ste.pdfStep-1Code the playing card and then sort the cards in a deck.Ste.pdf
Step-1Code the playing card and then sort the cards in a deck.Ste.pdf
 
conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf
                     conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf                     conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf
conc . of H+ = conc. of H3O + = 0.250.17 =0.0425.pdf
 

Recently uploaded

June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...
June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...
June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...
Levi Shapiro
 
The Diamonds of 2023-2024 in the IGRA collection
The Diamonds of 2023-2024 in the IGRA collectionThe Diamonds of 2023-2024 in the IGRA collection
The Diamonds of 2023-2024 in the IGRA collection
Israel Genealogy Research Association
 
"Protectable subject matters, Protection in biotechnology, Protection of othe...
"Protectable subject matters, Protection in biotechnology, Protection of othe..."Protectable subject matters, Protection in biotechnology, Protection of othe...
"Protectable subject matters, Protection in biotechnology, Protection of othe...
SACHIN R KONDAGURI
 
A Survey of Techniques for Maximizing LLM Performance.pptx
A Survey of Techniques for Maximizing LLM Performance.pptxA Survey of Techniques for Maximizing LLM Performance.pptx
A Survey of Techniques for Maximizing LLM Performance.pptx
thanhdowork
 
Unit 8 - Information and Communication Technology (Paper I).pdf
Unit 8 - Information and Communication Technology (Paper I).pdfUnit 8 - Information and Communication Technology (Paper I).pdf
Unit 8 - Information and Communication Technology (Paper I).pdf
Thiyagu K
 
JEE1_This_section_contains_FOUR_ questions
JEE1_This_section_contains_FOUR_ questionsJEE1_This_section_contains_FOUR_ questions
JEE1_This_section_contains_FOUR_ questions
ShivajiThube2
 
CACJapan - GROUP Presentation 1- Wk 4.pdf
CACJapan - GROUP Presentation 1- Wk 4.pdfCACJapan - GROUP Presentation 1- Wk 4.pdf
CACJapan - GROUP Presentation 1- Wk 4.pdf
camakaiclarkmusic
 
special B.ed 2nd year old paper_20240531.pdf
special B.ed 2nd year old paper_20240531.pdfspecial B.ed 2nd year old paper_20240531.pdf
special B.ed 2nd year old paper_20240531.pdf
Special education needs
 
STRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBC
STRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBCSTRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBC
STRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBC
kimdan468
 
Synthetic Fiber Construction in lab .pptx
Synthetic Fiber Construction in lab .pptxSynthetic Fiber Construction in lab .pptx
Synthetic Fiber Construction in lab .pptx
Pavel ( NSTU)
 
TESDA TM1 REVIEWER FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...
TESDA TM1 REVIEWER  FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...TESDA TM1 REVIEWER  FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...
TESDA TM1 REVIEWER FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...
EugeneSaldivar
 
Executive Directors Chat Leveraging AI for Diversity, Equity, and Inclusion
Executive Directors Chat  Leveraging AI for Diversity, Equity, and InclusionExecutive Directors Chat  Leveraging AI for Diversity, Equity, and Inclusion
Executive Directors Chat Leveraging AI for Diversity, Equity, and Inclusion
TechSoup
 
The basics of sentences session 5pptx.pptx
The basics of sentences session 5pptx.pptxThe basics of sentences session 5pptx.pptx
The basics of sentences session 5pptx.pptx
heathfieldcps1
 
S1-Introduction-Biopesticides in ICM.pptx
S1-Introduction-Biopesticides in ICM.pptxS1-Introduction-Biopesticides in ICM.pptx
S1-Introduction-Biopesticides in ICM.pptx
tarandeep35
 
MASS MEDIA STUDIES-835-CLASS XI Resource Material.pdf
MASS MEDIA STUDIES-835-CLASS XI Resource Material.pdfMASS MEDIA STUDIES-835-CLASS XI Resource Material.pdf
MASS MEDIA STUDIES-835-CLASS XI Resource Material.pdf
goswamiyash170123
 
Unit 2- Research Aptitude (UGC NET Paper I).pdf
Unit 2- Research Aptitude (UGC NET Paper I).pdfUnit 2- Research Aptitude (UGC NET Paper I).pdf
Unit 2- Research Aptitude (UGC NET Paper I).pdf
Thiyagu K
 
Introduction to AI for Nonprofits with Tapp Network
Introduction to AI for Nonprofits with Tapp NetworkIntroduction to AI for Nonprofits with Tapp Network
Introduction to AI for Nonprofits with Tapp Network
TechSoup
 
Thesis Statement for students diagnonsed withADHD.ppt
Thesis Statement for students diagnonsed withADHD.pptThesis Statement for students diagnonsed withADHD.ppt
Thesis Statement for students diagnonsed withADHD.ppt
EverAndrsGuerraGuerr
 
A Strategic Approach: GenAI in Education
A Strategic Approach: GenAI in EducationA Strategic Approach: GenAI in Education
A Strategic Approach: GenAI in Education
Peter Windle
 
The Challenger.pdf DNHS Official Publication
The Challenger.pdf DNHS Official PublicationThe Challenger.pdf DNHS Official Publication
The Challenger.pdf DNHS Official Publication
Delapenabediema
 

Recently uploaded (20)

June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...
June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...
June 3, 2024 Anti-Semitism Letter Sent to MIT President Kornbluth and MIT Cor...
 
The Diamonds of 2023-2024 in the IGRA collection
The Diamonds of 2023-2024 in the IGRA collectionThe Diamonds of 2023-2024 in the IGRA collection
The Diamonds of 2023-2024 in the IGRA collection
 
"Protectable subject matters, Protection in biotechnology, Protection of othe...
"Protectable subject matters, Protection in biotechnology, Protection of othe..."Protectable subject matters, Protection in biotechnology, Protection of othe...
"Protectable subject matters, Protection in biotechnology, Protection of othe...
 
A Survey of Techniques for Maximizing LLM Performance.pptx
A Survey of Techniques for Maximizing LLM Performance.pptxA Survey of Techniques for Maximizing LLM Performance.pptx
A Survey of Techniques for Maximizing LLM Performance.pptx
 
Unit 8 - Information and Communication Technology (Paper I).pdf
Unit 8 - Information and Communication Technology (Paper I).pdfUnit 8 - Information and Communication Technology (Paper I).pdf
Unit 8 - Information and Communication Technology (Paper I).pdf
 
JEE1_This_section_contains_FOUR_ questions
JEE1_This_section_contains_FOUR_ questionsJEE1_This_section_contains_FOUR_ questions
JEE1_This_section_contains_FOUR_ questions
 
CACJapan - GROUP Presentation 1- Wk 4.pdf
CACJapan - GROUP Presentation 1- Wk 4.pdfCACJapan - GROUP Presentation 1- Wk 4.pdf
CACJapan - GROUP Presentation 1- Wk 4.pdf
 
special B.ed 2nd year old paper_20240531.pdf
special B.ed 2nd year old paper_20240531.pdfspecial B.ed 2nd year old paper_20240531.pdf
special B.ed 2nd year old paper_20240531.pdf
 
STRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBC
STRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBCSTRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBC
STRAND 3 HYGIENIC PRACTICES.pptx GRADE 7 CBC
 
Synthetic Fiber Construction in lab .pptx
Synthetic Fiber Construction in lab .pptxSynthetic Fiber Construction in lab .pptx
Synthetic Fiber Construction in lab .pptx
 
TESDA TM1 REVIEWER FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...
TESDA TM1 REVIEWER  FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...TESDA TM1 REVIEWER  FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...
TESDA TM1 REVIEWER FOR NATIONAL ASSESSMENT WRITTEN AND ORAL QUESTIONS WITH A...
 
Executive Directors Chat Leveraging AI for Diversity, Equity, and Inclusion
Executive Directors Chat  Leveraging AI for Diversity, Equity, and InclusionExecutive Directors Chat  Leveraging AI for Diversity, Equity, and Inclusion
Executive Directors Chat Leveraging AI for Diversity, Equity, and Inclusion
 
The basics of sentences session 5pptx.pptx
The basics of sentences session 5pptx.pptxThe basics of sentences session 5pptx.pptx
The basics of sentences session 5pptx.pptx
 
S1-Introduction-Biopesticides in ICM.pptx
S1-Introduction-Biopesticides in ICM.pptxS1-Introduction-Biopesticides in ICM.pptx
S1-Introduction-Biopesticides in ICM.pptx
 
MASS MEDIA STUDIES-835-CLASS XI Resource Material.pdf
MASS MEDIA STUDIES-835-CLASS XI Resource Material.pdfMASS MEDIA STUDIES-835-CLASS XI Resource Material.pdf
MASS MEDIA STUDIES-835-CLASS XI Resource Material.pdf
 
Unit 2- Research Aptitude (UGC NET Paper I).pdf
Unit 2- Research Aptitude (UGC NET Paper I).pdfUnit 2- Research Aptitude (UGC NET Paper I).pdf
Unit 2- Research Aptitude (UGC NET Paper I).pdf
 
Introduction to AI for Nonprofits with Tapp Network
Introduction to AI for Nonprofits with Tapp NetworkIntroduction to AI for Nonprofits with Tapp Network
Introduction to AI for Nonprofits with Tapp Network
 
Thesis Statement for students diagnonsed withADHD.ppt
Thesis Statement for students diagnonsed withADHD.pptThesis Statement for students diagnonsed withADHD.ppt
Thesis Statement for students diagnonsed withADHD.ppt
 
A Strategic Approach: GenAI in Education
A Strategic Approach: GenAI in EducationA Strategic Approach: GenAI in Education
A Strategic Approach: GenAI in Education
 
The Challenger.pdf DNHS Official Publication
The Challenger.pdf DNHS Official PublicationThe Challenger.pdf DNHS Official Publication
The Challenger.pdf DNHS Official Publication
 

Floating point basicsThe core idea of floating-point representatio.pdf

  • 1. Floating point basics The core idea of floating-point representations (as opposed to fixed point representations as used by, say, ints), is that a number x is written as m*be where m is a mantissa or fractional part, b is a base, and e is an exponent. On modern computers the base is almost always 2, and for most floating-point representations the mantissa will be scaled to be between 1 and b. This is done by adjusting the exponent, e.g. 1 = 1*20 2 = 1*21 0.375 = 1.5*2-2 etc. Iam writing a program that does some floating addition that uses bit patterns with shifts applied to the mantissa and such to obtain the sum of the two floating point numbers. Logically and on paper I can get this to compute the correct sum. Code: #include #include #include #include int isNegative (float f) { unsigned int* iptr = (unsigned int*)&f; return ( ((*iptr) & 0x80000000) ? 1:0); } unsigned char getExponent (float f) { unsigned int* iptr = (unsigned int*)&f; return (((*iptr >> 23) & 0xff) - 127); } unsigned int getMantissa (float f) { unsigned int* iptr = (unsigned int*)&f; if( *iptr == 0 ) return 0; return ((*iptr & 0xFFFFFF) | 0x800000 );
  • 2. } float sum (float left, float right) { unsigned int littleMan; unsigned int bigMan; unsigned char littleE; unsigned char bigE; unsigned char lexp = getExponent(left); unsigned char rexp = getExponent(right); int Dexponent; if (lexp > rexp) { bigE = lexp; bigMan = getMantissa(left); littleE = rexp; littleMan = getMantissa(right); } else { bigE = rexp; bigMan = getMantissa(right); littleE = lexp; littleMan = getMantissa(left); } printf("little: %x %x ", littleE, littleMan); printf("big: %x %x ", bigE, bigMan); void shift( unsigned int *valToShift, int bitsToShift ) { // Masks is used to mask out bits to check for a "sticky" bit. static unsigned masks[24] = { 0, 1, 3, 7, 0xf, 0x1f, 0x3f, 0x7f,
  • 3. 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff, 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff, 0x1fffff, 0x3fffff, 0x7fffff }; // HOmasks - masks out the H.O. bit of the value masked by the masks entry. static unsigned HOmasks[24] = { 0, 1, 2, 4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, 0x400000 }; // shiftedOut- Holds the value that will be shifted out of a mantissa // during the denormalization operation (used to round a denormalized value). int shiftedOut; assert( bitsToShift <= 23 ); // Grabs the bits we're going to shift out (so we can determine // how to round this value after the shift). shiftedOut = *valToShift & masks[ bitsToShift ]; // Shift the value to the right the specified number of bits: *valToShift = *valToShift >> bitsToShift; // If necessary, round the value: if( shiftedOut > HOmasks[ bitsToShift ] ) { // If the bits we shifted out are greater than 1/2 the L.O. bit, then
  • 4. // round the value up by one. *valToShift = *valToShift + 1; } else if( shiftedOut == HOmasks[ bitsToShift ] ) { // If the bits we shifted out are exactly 1/2 of the L.O. bit's value, // then round the value to the nearest number whose L.O. bit is zero. *valToShift = *valToShift + ((*valToShift & 1) == 1); } // else we round the value down to the previous value. The current // value is already truncated (rounded down), so we don't have to do anything. } // I got two actual floating point values. I want to add them together. // 1. "denormalize" one of the operands if their exponents aren't // the same (when adding or subtracting values, the exponents must be the same). // // Algorithm: choose the value with the smaller exponent. Shift its mantissa // to the right the number of bits specified by the difference between the two // exponents. if( rexp > lexp ) { shift( &littleMan, (rexp - lexp)); Dexponent = rexp; } else if( rexp < lexp ) { shift( &littleMan, (lexp - rexp)); Dexponent = lexp; } unsigned int result = Dexponent; float fresult = *(float*)&result;
  • 5. return(fresult); } int main() { const int SIZE = 256; char line[SIZE]; while (1) { float f1; float f2; float left = f1; float right = f2; printf("Please enter the first float ( "q" to quit):"); fgets(line,SIZE,stdin); if (toupper(line[0]) =='Q') break; f1 = atof(line); printf("Please enter the second float ( "q" to quit):"); fgets(line,SIZE,stdin); if (toupper(line[0]) == 'Q') break; f2 = atof(line); if (isNegative(f1) || isNegative(f2)) printf ("One of thse is negative, but %g + %g == %g ", f1,f2,sum(f1,f2)); else printf("%g + %g == %g ", f1,f2,sum(f1,f2)); } return(EXIT_SUCCESS);
  • 6. 1 = 1*20 2 = 1*21 0.375 = 1.5*2-2 Solution Floating point basics The core idea of floating-point representations (as opposed to fixed point representations as used by, say, ints), is that a number x is written as m*be where m is a mantissa or fractional part, b is a base, and e is an exponent. On modern computers the base is almost always 2, and for most floating-point representations the mantissa will be scaled to be between 1 and b. This is done by adjusting the exponent, e.g. 1 = 1*20 2 = 1*21 0.375 = 1.5*2-2 etc. Iam writing a program that does some floating addition that uses bit patterns with shifts applied to the mantissa and such to obtain the sum of the two floating point numbers. Logically and on paper I can get this to compute the correct sum. Code: #include #include #include #include int isNegative (float f) { unsigned int* iptr = (unsigned int*)&f; return ( ((*iptr) & 0x80000000) ? 1:0); } unsigned char getExponent (float f) { unsigned int* iptr = (unsigned int*)&f; return (((*iptr >> 23) & 0xff) - 127); } unsigned int getMantissa (float f)
  • 7. { unsigned int* iptr = (unsigned int*)&f; if( *iptr == 0 ) return 0; return ((*iptr & 0xFFFFFF) | 0x800000 ); } float sum (float left, float right) { unsigned int littleMan; unsigned int bigMan; unsigned char littleE; unsigned char bigE; unsigned char lexp = getExponent(left); unsigned char rexp = getExponent(right); int Dexponent; if (lexp > rexp) { bigE = lexp; bigMan = getMantissa(left); littleE = rexp; littleMan = getMantissa(right); } else { bigE = rexp; bigMan = getMantissa(right); littleE = lexp; littleMan = getMantissa(left); } printf("little: %x %x ", littleE, littleMan); printf("big: %x %x ", bigE, bigMan); void shift( unsigned int *valToShift, int bitsToShift ) {
  • 8. // Masks is used to mask out bits to check for a "sticky" bit. static unsigned masks[24] = { 0, 1, 3, 7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff, 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff, 0x1fffff, 0x3fffff, 0x7fffff }; // HOmasks - masks out the H.O. bit of the value masked by the masks entry. static unsigned HOmasks[24] = { 0, 1, 2, 4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000, 0x10000, 0x20000, 0x40000, 0x80000, 0x100000, 0x200000, 0x400000 }; // shiftedOut- Holds the value that will be shifted out of a mantissa // during the denormalization operation (used to round a denormalized value). int shiftedOut; assert( bitsToShift <= 23 ); // Grabs the bits we're going to shift out (so we can determine // how to round this value after the shift). shiftedOut = *valToShift & masks[ bitsToShift ]; // Shift the value to the right the specified number of bits: *valToShift = *valToShift >> bitsToShift;
  • 9. // If necessary, round the value: if( shiftedOut > HOmasks[ bitsToShift ] ) { // If the bits we shifted out are greater than 1/2 the L.O. bit, then // round the value up by one. *valToShift = *valToShift + 1; } else if( shiftedOut == HOmasks[ bitsToShift ] ) { // If the bits we shifted out are exactly 1/2 of the L.O. bit's value, // then round the value to the nearest number whose L.O. bit is zero. *valToShift = *valToShift + ((*valToShift & 1) == 1); } // else we round the value down to the previous value. The current // value is already truncated (rounded down), so we don't have to do anything. } // I got two actual floating point values. I want to add them together. // 1. "denormalize" one of the operands if their exponents aren't // the same (when adding or subtracting values, the exponents must be the same). // // Algorithm: choose the value with the smaller exponent. Shift its mantissa // to the right the number of bits specified by the difference between the two // exponents. if( rexp > lexp ) { shift( &littleMan, (rexp - lexp)); Dexponent = rexp; } else if( rexp < lexp ) { shift( &littleMan, (lexp - rexp)); Dexponent = lexp;
  • 10. } unsigned int result = Dexponent; float fresult = *(float*)&result; return(fresult); } int main() { const int SIZE = 256; char line[SIZE]; while (1) { float f1; float f2; float left = f1; float right = f2; printf("Please enter the first float ( "q" to quit):"); fgets(line,SIZE,stdin); if (toupper(line[0]) =='Q') break; f1 = atof(line); printf("Please enter the second float ( "q" to quit):"); fgets(line,SIZE,stdin); if (toupper(line[0]) == 'Q') break; f2 = atof(line); if (isNegative(f1) || isNegative(f2))
  • 11. printf ("One of thse is negative, but %g + %g == %g ", f1,f2,sum(f1,f2)); else printf("%g + %g == %g ", f1,f2,sum(f1,f2)); } return(EXIT_SUCCESS); 1 = 1*20 2 = 1*21 0.375 = 1.5*2-2