ZetCode

C Floating-Point Types

last modified April 1, 2025

C provides several floating-point types with different precision and storage characteristics. This tutorial covers IEEE 754 representation, precision limitations, hardware considerations, and practical usage patterns.

We'll examine the binary representation of floats, explain rounding modes, denormal numbers, and demonstrate common pitfalls with practical examples. Understanding these concepts is crucial for systems programming and performance-sensitive applications.

C Floating-Point Types

C offers three primary floating-point types with increasing precision:

float_types.c
#include <stdio.h>
#include <float.h>

int main() {
    // The same constant stored at each of the three floating-point widths.
    const float pi_f = 3.1415926535f;
    const double pi_d = 3.141592653589793;
    const long double pi_ld = 3.14159265358979323846L;

    // Print with a fixed number of decimals so the differences between
    // the stored values become visible.
    printf("float:       %.15f\n", pi_f);
    printf("double:      %.15lf\n", pi_d);
    printf("long double: %.21Lf\n", pi_ld);

    // Significand widths from <float.h>, counted in base-FLT_RADIX digits
    // (bits when the radix is 2).
    printf("\nPrecision:\n");
    printf("float mantissa bits:  %d\n", FLT_MANT_DIG);
    printf("double mantissa bits: %d\n", DBL_MANT_DIG);
    printf("long double mantissa bits: %d\n", LDBL_MANT_DIG);

    return 0;
}

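Where IEEE 754 is supported, float is the 32-bit binary32 format (24-bit significand) and double is the 64-bit binary64 format (53-bit significand). The long double type is implementation-defined: it is commonly the 80-bit x87 extended format on x86, the 128-bit binary128 format on some other platforms, and in some implementations identical to double.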

The float.h header defines constants like FLT_MANT_DIG that reveal implementation details. Note that long double precision varies by architecture.

IEEE 754 Binary Representation

Floating-point numbers use sign-exponent-mantissa format:

float_representation.c
#include <stdio.h>
#include <stdint.h>

// Print the sign, exponent and mantissa fields of a single-precision value,
// followed by its 32 bits grouped as "sign exponent mantissa".
//
// f: the value to dissect. The code assumes float is a 32-bit IEEE 754
//    binary32 object (same size as uint32_t).
//
// The "actual" exponent printed is simply the biased field minus 127; it is
// only the true power of two for normal numbers (biased field 1..254).
void print_float_bits(float f) {
    // Reinterpret the bytes through a union. Reading the float through a
    // cast uint32_t pointer would violate the strict-aliasing rule.
    union {
        float value;
        uint32_t raw;
    } pun;
    pun.value = f;
    const uint32_t bits = pun.raw;

    // Keep the fields as int so they match the %d conversions below and so
    // that "biased - 127" is signed arithmetic instead of wrapping unsigned.
    const int sign = (int)(bits >> 31);
    const int exponent = (int)((bits >> 23) & 0xFFu);
    const unsigned mantissa = (unsigned)(bits & 0x7FFFFFu);

    printf("Float: %f\n", f);
    printf("Sign: %d\n", sign);
    printf("Exponent: 0x%X (%d biased, %d actual)\n",
           (unsigned)exponent, exponent, exponent - 127);
    printf("Mantissa: 0x%X\n", mantissa);
    printf("Binary: ");
    for (int i = 31; i >= 0; i--) {
        printf("%d", (int)((bits >> i) & 1u));
        // Separate the sign bit and the 8 exponent bits from what follows.
        if (i == 31 || i == 23) printf(" ");
    }
    printf("\n\n");
}

int main() {
    // Dump the bit layout of a few sample values, in this order.
    const float samples[] = {1.0f, 0.1f, -3.5f};
    const size_t count = sizeof samples / sizeof samples[0];

    for (size_t i = 0; i < count; i++) {
        print_float_bits(samples[i]);
    }
    return 0;
}

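IEEE 754 single-precision format consists of 1 sign bit (bit 31), 8 exponent bits (bits 30–23, stored with a bias of 127), and 23 mantissa bits (bits 22–0, with an implicit leading 1 for normal numbers).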

For normal numbers the value is computed as: $(-1)^{\text{sign}} \times 2^{\text{exponent} - 127} \times (1.\text{mantissa})_2$

Special Floating-Point Values

IEEE 754 defines special bit patterns:

special_values.c
#include <stdio.h>
#include <math.h>

int main() {
    // One sample of each special value handled below.
    const float pos_inf = INFINITY;
    const float not_a_number = NAN;
    const float pos_zero = 0.0f;
    const float neg_zero = -0.0f;

    printf("Positive infinity: %f\n", pos_inf);
    printf("NaN: %f\n", not_a_number);
    printf("Zero: %f\n", pos_zero);
    printf("Negative zero: %f\n", neg_zero);

    // Equality behaves unusually for these values.
    printf("\nSpecial comparisons:\n");
    printf("inf == inf: %d\n", pos_inf == pos_inf);                 // 1
    printf("nan == nan: %d\n", not_a_number == not_a_number);       // 0
    printf("zero == neg_zero: %d\n", pos_zero == neg_zero);         // 1

    // Classification macros from <math.h>; each result is printed as an int.
    printf("\nClassification:\n");
    printf("isinf(inf): %d\n", isinf(pos_inf));
    printf("isnan(nan): %d\n", isnan(not_a_number));
    printf("isnormal(1.0f): %d\n", isnormal(1.0f));
    printf("fpclassify(denormal): %d\n", fpclassify(1e-45f));

    return 0;
}

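Special floating-point values include positive and negative infinity (exponent all ones, mantissa zero), NaN (exponent all ones, mantissa non-zero; it compares unequal to everything, including itself), signed zeros (which compare equal to each other), and denormal numbers (exponent zero, mantissa non-zero).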

The math.h header provides classification macros (isnan, isinf, etc.) for proper handling.

Precision and Rounding

Floating-point operations involve rounding:

rounding.c
#include <stdio.h>
#include <fenv.h>

// Print the name of the rounding mode reported by fegetround(), followed by
// a newline. Modes other than the four standard ones print "Unknown".
void show_rounding_mode() {
    const int mode = fegetround();
    const char *label;

    if (mode == FE_TONEAREST) {
        label = "FE_TONEAREST\n";
    } else if (mode == FE_DOWNWARD) {
        label = "FE_DOWNWARD\n";
    } else if (mode == FE_UPWARD) {
        label = "FE_UPWARD\n";
    } else if (mode == FE_TOWARDZERO) {
        label = "FE_TOWARDZERO\n";
    } else {
        label = "Unknown\n";
    }

    fputs(label, stdout);
}

int main() {
    // This function reads and changes the floating-point environment, so
    // the compiler must not assume the default rounding mode here.
    #pragma STDC FENV_ACCESS ON

    // volatile operands force both divisions to be carried out at run time.
    // With plain constants the compiler may fold 1.0f / 3.0f at compile
    // time under the default mode, and both results would print the same.
    volatile float one = 1.0f;
    volatile float three = 3.0f;

    const int saved_mode = fegetround();

    printf("Default rounding: ");
    show_rounding_mode();

    // Demonstrate rounding effects
    float a = one / three;
    printf("1/3 as float: %.20f\n", a);

    // Change rounding mode; fesetround returns non-zero if the mode is
    // not supported.
    if (fesetround(FE_UPWARD) != 0) {
        fprintf(stderr, "fesetround(FE_UPWARD) failed\n");
        return 1;
    }
    printf("Current rounding: ");
    show_rounding_mode();

    float b = one / three;

    // Restore the original mode before formatting the result so that only
    // the division, not the output conversion, runs under FE_UPWARD.
    fesetround(saved_mode);
    printf("1/3 with FE_UPWARD: %.20f\n", b);

    return 0;
}

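Key precision concepts: most decimal fractions and quotients such as 1/3 have no exact binary representation, so every operation rounds its result to a representable value according to the current rounding mode. The default mode is round-to-nearest (FE_TONEAREST); FE_UPWARD, FE_DOWNWARD and FE_TOWARDZERO round toward +∞, −∞ and zero respectively.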

The fenv.h header provides control over rounding modes and floating-point environment.

Denormal Numbers

Very small numbers use denormal representation:

denormals.c
#include <stdio.h>
#include <float.h>
#include <math.h>   // isnormal(), fpclassify()

// Show how dividing the smallest normal float produces a denormal value,
// how the classification macros report it, and run a loop of additions
// with a denormal operand.
int main() {
    float normal = FLT_MIN;          // Smallest normal number
    float denormal = normal / 2.0f;  // Becomes denormal

    printf("FLT_MIN: %e\n", normal);
    printf("FLT_MIN/2: %e\n", denormal);

    printf("\nProperties:\n");
    printf("isnormal(normal): %d\n", isnormal(normal));
    printf("isnormal(denormal): %d\n", isnormal(denormal));
    printf("fpclassify(denormal): %d\n", fpclassify(denormal));

    // Performance impact: volatile keeps the compiler from removing the
    // loop even though the sum is never printed.
    volatile float sum = 0.0f;
    for (int i = 0; i < 1000000; i++) {
        sum += denormal;  // Can be much slower than normal floats on some hardware
    }

    return 0;
}

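Denormal (subnormal) numbers fill the gap between zero and the smallest normal number (FLT_MIN). They have a biased exponent of zero and no implicit leading 1, so they lose precision gradually as they approach zero, and arithmetic on them can be considerably slower on some hardware.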

Some systems flush denormals to zero (FTZ) for performance.

Error Accumulation

Floating-point errors compound in calculations:

error_accumulation.c
#include <stdio.h>
#include <math.h>

int main() {
    // Naive accumulation: each += rounds its result, and the rounding
    // errors pile up over the 10000 additions.
    float naive = 0.0f;
    int remaining = 10000;
    while (remaining-- > 0) {
        naive += 0.01f;
    }
    printf("Sum of 0.01 10000 times: %.10f\n", naive);

    // Kahan (compensated) summation: remember what each addition lost and
    // subtract it from the next term.
    float total = 0.0f;
    float lost = 0.0f;
    for (int step = 0; step != 10000; ++step) {
        const float term = 0.01f - lost;
        const float next = total + term;
        lost = (next - total) - term;  // Rounding error of this addition
        total = next;
    }

    printf("Kahan sum: %.10f\n", total);

    // Absorption: adding 1.0f to 1e8f and subtracting 1e8f again shows
    // whether the small addend survived in the float result.
    const float big = 1e8f;
    const float bumped = big + 1.0f;
    printf("(1e8 + 1) - 1e8 = %.1f\n", bumped - big);

    return 0;
}

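Common error sources include representation error (0.01 has no exact binary representation), rounding error that accumulates over many operations, and absorption or cancellation when operands differ greatly in magnitude (as in the 1e8 + 1 example).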

The Kahan summation algorithm demonstrates how to reduce accumulation errors.

Floating-Point Exceptions

Floating-point operations can raise exceptions:

exceptions.c
#include <stdio.h>
#include <fenv.h>
#include <float.h>
#include <math.h>

#pragma STDC FENV_ACCESS ON

// Print, on one line, the name of every standard floating-point exception
// flag that fetestexcept() currently reports as raised.
void show_exceptions() {
    // Flags are checked and printed in this fixed order.
    static const struct {
        int flag;
        const char *label;
    } known[] = {
        { FE_DIVBYZERO, "FE_DIVBYZERO " },
        { FE_INVALID,   "FE_INVALID " },
        { FE_OVERFLOW,  "FE_OVERFLOW " },
        { FE_UNDERFLOW, "FE_UNDERFLOW " },
        { FE_INEXACT,   "FE_INEXACT " },
    };
    const size_t count = sizeof known / sizeof known[0];

    printf("Raised exceptions: ");
    for (size_t i = 0; i < count; i++) {
        if (fetestexcept(known[i].flag)) {
            fputs(known[i].label, stdout);
        }
    }
    printf("\n");
}

// Trigger three floating-point exceptions in turn and report the flags
// raised by each one. FLT_MAX comes from <float.h>.
int main() {
    // volatile operands force each operation to be carried out at run time.
    // With plain constants the compiler may evaluate the expressions at
    // compile time, in which case no exception flag is raised when the
    // program runs.
    volatile float zero = 0.0f;
    volatile float one = 1.0f;
    volatile float minus_one = -1.0f;
    volatile float largest = FLT_MAX;
    volatile float result;  // Sink so the operations are not discarded

    feclearexcept(FE_ALL_EXCEPT);
    result = one / zero;        // Division by zero
    show_exceptions();

    feclearexcept(FE_ALL_EXCEPT);
    result = sqrt(minus_one);   // Invalid operation
    show_exceptions();

    feclearexcept(FE_ALL_EXCEPT);
    result = largest * 2.0f;    // Overflow
    show_exceptions();

    (void)result;
    return 0;
}

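The standard floating-point exceptions are FE_DIVBYZERO (division of a finite non-zero value by zero), FE_INVALID (an operation with no defined result, such as sqrt(-1)), FE_OVERFLOW (result too large to represent), FE_UNDERFLOW (result too small to represent with full precision), and FE_INEXACT (result had to be rounded).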

Exception handling requires careful management of the floating-point environment.

Hardware Considerations

Floating-point performance varies by architecture:

hardware.c
#include <stdio.h>

// Print the FPU control word on x86 targets (32- or 64-bit). On any other
// architecture the body is compiled out and the function does nothing.
void print_fpu_control() {
    #if defined(__i386__) || defined(__x86_64__)
    unsigned short control_word;
    // fstcw stores the control word into the given memory operand.
    __asm__ __volatile__ ("fstcw %[word]" : [word] "=m" (control_word));
    printf("FPU control word: 0x%04X\n", control_word);
    #endif
}

// Report which SIMD feature macros the compiler defines, print the FPU
// control word, and add two 4-float vectors with SSE when it is available.
int main() {
    printf("FPU features:\n");

    #ifdef __SSE2__
    printf("SSE2 available\n");
    #endif

    #ifdef __AVX__
    printf("AVX available\n");
    #endif

    print_fpu_control();

    // SIMD example
    float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};
    float c[4];

    #ifdef __SSE__
    // movups is the unaligned load/store, so the arrays need no special
    // alignment. xmm0 and xmm1 are listed as clobbers: the asm overwrites
    // them, and without the clobber list the compiler may keep live values
    // in those registers across the statement.
    __asm__ (
        "movups %1, %%xmm0\n"
        "movups %2, %%xmm1\n"
        "addps %%xmm1, %%xmm0\n"
        "movups %%xmm0, %0"
        : "=m" (c)
        : "m" (a), "m" (b)
        : "xmm0", "xmm1"
    );
    printf("SIMD add: %.1f, %.1f, %.1f, %.1f\n",
           c[0], c[1], c[2], c[3]);
    #endif

    return 0;
}

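Key hardware aspects: on x86 the legacy x87 FPU is configured through its control word, while SSE and AVX provide SIMD registers that operate on several floats at once; which of these are available depends on the target architecture and the compiler flags.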

Modern compilers generate optimized code based on target architecture.

Best Practices

Source References

Author

My name is Jan Bodnar, and I am a passionate programmer with extensive programming experience. I have been writing programming articles since 2007. To date, I have authored over 1,400 articles and 8 e-books. I possess more than ten years of experience in teaching programming.