Chapter 6 NA, NULL, NaN, Inf
What You’ll Learn:
- The four “missing” value types in R
- Critical differences between NA, NULL, NaN, and Inf
- Common errors with missing data
- How to handle missing values properly
- Testing and detecting special values
Key Errors Covered: 15+ missing data errors
Difficulty: ⭐ Beginner to ⭐⭐ Intermediate
6.1 Introduction
R has four special “non-values” that trip up everyone:
NA # Not Available
#> [1] NA
NULL # Nothing
#> NULL
NaN # Not a Number
#> [1] NaN
Inf # Infinity
#> [1] Inf
They look similar but behave very differently, and confusing them causes endless errors.
💡 Key Insight: The Four Special Values
# NA - Missing data (most common)
ages <- c(25, 30, NA, 35) # One age is missing
# NULL - Absence of value (empty)
result <- NULL # No result yet
# NaN - Invalid math result
0 / 0 # Undefined
#> [1] NaN
# Inf - Infinite value
1 / 0 # Positive infinity
#> [1] Inf
-1 / 0 # Negative infinity
#> [1] -Inf
Think of it this way:
- NA: “I don’t know what this is”
- NULL: “There’s nothing here”
- NaN: “This calculation doesn’t make sense”
- Inf: “This is beyond measurement”
6.3 Error #1: missing values where TRUE/FALSE needed
⭐ BEGINNER 🔢 TYPE
6.3.1 The Error
x <- NA
if (x > 5) {
print("Large")
}
#> Error in if (x > 5) {: missing value where TRUE/FALSE needed
🔴 ERROR
Error in if (x > 5) { : missing value where TRUE/FALSE needed
6.3.3 Common Causes
6.3.3.1 Cause 1: Comparison with NA
6.3.4 Solutions
✅ SOLUTION 1: Test for NA First
✅ SOLUTION 2: Use isTRUE()
✅ SOLUTION 3: Use %in% for Comparisons
⚠️ Common Pitfall: && vs &
x <- c(NA, 2, 3)
# Single & returns vector with NA
x > 1 & x < 5
#> [1] NA TRUE TRUE
# Double && errors on vector
if (x > 1 && x < 5) { # Error!
print("yes")
}
#> Error in x > 1 && x < 5: 'length = 3' in coercion to 'logical(1)'
For if(): Use && but check for NA first
For vectorized ops: Use & and handle NAs appropriately
6.4 Error #2: missing values and NaN's not allowed
⭐ BEGINNER 🧮 MATH
6.4.2 What It Means
Some functions refuse to work with NA unless you explicitly tell them how to handle it.
6.5 Error #3: 'x' contains missing values
⭐ BEGINNER 🧮 MATH
6.5.1 The Error
x <- c(1, 2, NA, 4)
y <- c(2, 3, 4, 5)
t.test(x, y)
#>
#> Welch Two Sample t-test
#>
#> data: x and y
#> t = -1.0675, df = 3.9593, p-value = 0.3465
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -4.213398 1.880065
#> sample estimates:
#> mean of x mean of y
#> 2.333333 3.500000
🔴 ERROR
Error in t.test.default(x, y) : 'x' contains missing values
6.5.3 Common Functions
x <- c(1, 2, NA, 4, 5)
y <- c(2, 3, 4, 5, 6)
# These need complete data:
t.test(x, y)
#>
#> Welch Two Sample t-test
#>
#> data: x and y
#> t = -0.86603, df = 6.0472, p-value = 0.4195
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -3.820108 1.820108
#> sample estimates:
#> mean of x mean of y
#> 3 4
wilcox.test(x, y)
#> Warning in wilcox.test.default(x, y): cannot compute exact p-value with ties
#>
#> Wilcoxon rank sum test with continuity correction
#>
#> data: x and y
#> W = 6.5, p-value = 0.4568
#> alternative hypothesis: true location shift is not equal to 0
chisq.test(x, y)
#> Warning in chisq.test(x, y): Chi-squared approximation may be incorrect
#>
#> Pearson's Chi-squared test
#>
#> data: x and y
#> X-squared = 12, df = 9, p-value = 0.2133
6.5.4 Solutions
✅ SOLUTIONS
1. Remove NAs from both vectors:
x <- c(1, 2, NA, 4, 5)
y <- c(2, 3, 4, 5, 6)
# Find complete cases
complete <- complete.cases(x, y)
x_clean <- x[complete]
y_clean <- y[complete]
t.test(x_clean, y_clean)
#>
#> Welch Two Sample t-test
#>
#> data: x_clean and y_clean
#> t = -0.7746, df = 6, p-value = 0.468
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -4.15895 2.15895
#> sample estimates:
#> mean of x mean of y
#> 3 4
2. Use na.action:
# For functions that support it
df <- data.frame(x = c(1, 2, NA, 4), y = c(2, 3, 4, 5))
t.test(x ~ 1, data = df, na.action = na.omit)
#>
#> One Sample t-test
#>
#> data: x
#> t = 2.6458, df = 2, p-value = 0.1181
#> alternative hypothesis: true mean is not equal to 0
#> 95 percent confidence interval:
#> -1.461250 6.127916
#> sample estimates:
#> mean of x
#> 2.333333
3. Impute missing values (advanced):
6.7 Error #4: argument is of length zero
⭐⭐ INTERMEDIATE 📏 LENGTH
6.9 Error #5: NaNs produced
⭐ BEGINNER 🧮 MATH
6.9.1 The Warning
sqrt(-1)
#> Warning in sqrt(-1): NaNs produced
#> [1] NaN
log(-1)
#> Warning in log(-1): NaNs produced
#> [1] NaN
0/0
#> [1] NaN
🟡 WARNING
Warning message:
In sqrt(-1) : NaNs produced
6.9.4 Solutions
✅ SOLUTION 1: Check Input Before Operation
# Square root with an up-front check for negative input.
#
# x: numeric vector; NA elements are ignored by the negativity check.
# Returns sqrt(x). Negative elements come back as NaN (not NA), and sqrt()
# still adds its own "NaNs produced" warning after the one raised here.
safe_sqrt <- function(x) {
  # na.rm = TRUE keeps an NA in x from turning the condition itself into NA
  if (any(x < 0, na.rm = TRUE)) {
    warning("Negative values found, returning NaN for those")
  }
  sqrt(x)
}
safe_sqrt(c(1, 4, -9, 16))
#> Warning in safe_sqrt(c(1, 4, -9, 16)): Negative values found, returning NaN for
#> those
#> Warning in sqrt(x): NaNs produced
#> [1] 1 2 NaN 4
✅ SOLUTION 2: Handle NaN After Operation
6.11 Error #6: infinite or missing values in 'x'
⭐⭐ INTERMEDIATE 🧮 MATH
6.11.3 Solutions
✅ SOLUTIONS
1. Check for and remove Inf:
x <- c(1, 2, Inf, 4, -Inf, 5)
# Find finite values
is.finite(x)
#> [1] TRUE TRUE FALSE TRUE FALSE TRUE
# Keep only finite
x_finite <- x[is.finite(x)]
# NOTE(review): y is not defined in this snippet — presumably carried over from
# an earlier example; confirm it has the same length as x and no NA/Inf of its
# own, otherwise this subset is misaligned and cor() below returns NA.
y_finite <- y[is.finite(x)]
cor(x_finite, y_finite)
#> [1] NA
2. Replace Inf with large number:
x <- c(1, 2, Inf, 4, -Inf, 5)
# Replace Inf with max/min of finite values
x_fixed <- x
# +Inf becomes ten times the largest finite value (5 * 10 = 50)
x_fixed[x == Inf] <- max(x[is.finite(x)]) * 10
# -Inf becomes ten times the smallest finite value (1 * 10 = 10).
# NOTE(review): with all-positive finite values this turns -Inf into a positive
# number larger than every finite value — confirm that is intended; a negative
# lower bound may be what was meant.
x_fixed[x == -Inf] <- min(x[is.finite(x)]) * 10
x_fixed
#> [1] 1 2 50 4 10 5
3. Check before calculation:
6.12 Testing for Special Values
🎯 Best Practice: Comprehensive Testing
# Test functions
x <- c(1, NA, NaN, Inf, -Inf, 0)
# Individual tests
is.na(x) # TRUE for NA and NaN
#> [1] FALSE TRUE TRUE FALSE FALSE FALSE
is.nan(x) # TRUE only for NaN
#> [1] FALSE FALSE TRUE FALSE FALSE FALSE
is.infinite(x) # TRUE for Inf and -Inf
#> [1] FALSE FALSE FALSE TRUE TRUE FALSE
is.finite(x) # TRUE for normal numbers
#> [1] TRUE FALSE FALSE FALSE FALSE TRUE
# NULL is different
y <- NULL
is.null(y) # TRUE
#> [1] TRUE
is.na(y) # logical(0) - no value to test
#> logical(0)
# Combined checks
# Flag the elements of x that are ordinary, usable numbers.
# Returns a logical vector: TRUE only where the element is neither NA, NaN,
# nor +/-Inf.
is_valid <- function(x) {
  # De Morgan form of "not NA and not NaN and finite"
  unusable <- is.na(x) | is.nan(x) | !is.finite(x)
  !unusable
}
is_valid(x)
#> [1] TRUE FALSE FALSE FALSE FALSE TRUE
Decision Tree:
6.13 Handling Missing Data Strategies
💡 Key Insight: Missing Data Strategies
1. Complete Case Analysis (Listwise Deletion)
df <- data.frame(
x = c(1, 2, NA, 4, 5),
y = c(10, NA, 30, 40, 50)
)
# Keep only complete rows
complete.cases(df)
#> [1] TRUE FALSE FALSE TRUE TRUE
df_complete <- df[complete.cases(df), ]
df_complete
#> x y
#> 1 1 10
#> 4 4 40
#> 5 5 50
2. Available Case Analysis (Pairwise Deletion)
# Use all available data for each calculation
cor(df, use = "pairwise.complete.obs")
#> x y
#> x 1 1
#> y 1 1
3. Imputation (Replacing with Estimates)
# Replace with mean (simple)
df$x[is.na(df$x)] <- mean(df$x, na.rm = TRUE)
# Replace with median (robust to outliers)
df$y[is.na(df$y)] <- median(df$y, na.rm = TRUE)
4. Keep as NA (Most Honest)
When to use each:
- Complete case: When data are missing completely at random (MCAR)
- Pairwise: When you want to use all available information
- Imputation: When you have good reason to estimate missing values
- Keep NA: When missingness is informative
6.14 Summary
Key Takeaways:
- Four special values: NA (missing), NULL (nothing), NaN (invalid math), Inf (infinite)
- NA vs NULL: NA is a placeholder in a vector, NULL is absence of vector
- Test before using: Always check is.na(), is.null(), is.finite()
- Use isTRUE(): For conditions that might be NA
- na.rm = TRUE: Most statistical functions need this with NAs
- NaN from invalid math: sqrt(-1), 0/0, etc.
- Inf from overflow: 1/0, exp(1000), etc.
Quick Reference:
| Value | Test | Meaning | Example |
|---|---|---|---|
| NA | is.na() | Missing data | Survey non-response |
| NULL | is.null() | No value | Uninitialized variable |
| NaN | is.nan() | Invalid math | 0/0 |
| Inf | is.infinite() | Infinite | 1/0 |
In if() statements:
# ❌ Dangerous
if (x > 5) { } # Errors if x is NA or NULL
# ✅ Safe
if (!is.na(x) && x > 5) { }
if (isTRUE(x > 5)) { }
if (length(x) > 0 && !is.na(x) && x > 5) { }
With functions:
6.15 Exercises
📝 Exercise 1: Identify the Type
What are these and why?
📝 Exercise 2: Fix the Code
Debug these:
📝 Exercise 3: Robust Function
Write robust_mean(x) that:
1. Handles NA, NULL, NaN, Inf
2. Reports how many of each were found
3. Calculates mean of valid values
4. Returns list with mean and diagnostics
📝 Exercise 4: Data Cleaning
You have survey data:
survey <- data.frame(
age = c(25, NA, 35, -999, 40), # -999 = missing
income = c(50000, 75000, 0, 80000, NA), # 0 = refused
satisfaction = c(5, 3, NA, 4, 99) # 99 = invalid
)
Clean it:
1. Convert -999 to NA
2. Convert 0 in income to NA
3. Convert 99 in satisfaction to NA
4. Calculate complete case statistics
6.16 Exercise Answers
Click to see answers
Exercise 1:
# A - c() with no arguments returns NULL, not an empty typed vector
x <- c()
typeof(x) # "NULL" (use logical(0), numeric(0), ... for a typed empty vector)
#> [1] "NULL"
# B - NaN from invalid math
y <- sqrt(-1)
#> Warning in sqrt(-1): NaNs produced
class(y) # "numeric"
#> [1] "numeric"
is.nan(y) # TRUE
#> [1] TRUE
# C - Inf from division by zero
z <- 1/0
is.finite(z) # FALSE (Inf is not finite)
#> [1] FALSE
# D - Comparison with NA gives NA
w <- c(1, 2, NA, 4)
w == NA # All NA! Use is.na() instead
#> [1] NA NA NA NA
is.na(w) # Correct way
#> [1] FALSE FALSE TRUE FALSE
Exercise 2:
# Problem 1 - Need na.rm
data <- c(1, 2, NA, 4, 5)
mean_val <- mean(data, na.rm = TRUE)
if (!is.na(mean_val) && mean_val > 3) {
print("High average")
}
# Problem 2 - Check for NULL
# Score a single value: inputs above 10 are doubled, everything else scores 0.
# The low branch returns 0 rather than NULL so callers can compare the result
# directly in an if() condition.
get_score <- function(x) {
  is_high <- x > 10
  if (!is_high) {
    return(0) # Return 0 instead of NULL
  }
  x * 2
}
score <- get_score(5)
if (!is.null(score) && score > 10) {
print("High score")
}
# Problem 3 - Handle -Inf from log(0)
values <- c(10, 20, 0, 30)
log_values <- log(values)
log_values[is.infinite(log_values)] <- NA
mean(log_values, na.rm = TRUE)
#> [1] 2.899838
Exercise 3:
# Mean of the finite values in x, plus a tally of the special values skipped.
#
# x: a vector, or NULL.
# Returns list(mean, diagnostics):
#   mean        - NULL for NULL input, NA when no finite values remain,
#                 otherwise the mean of the finite values.
#   diagnostics - list with total, null, na, nan, inf, valid.
robust_mean <- function(x) {
  # NULL carries no elements: report an all-zero tally and stop early
  if (is.null(x)) {
    empty_tally <- list(
      total = length(x),
      null = TRUE,
      na = 0,
      nan = 0,
      inf = 0,
      valid = 0
    )
    return(list(mean = NULL, diagnostics = empty_tally))
  }

  # is.na() is TRUE for both NA and NaN, so NaN is separated out explicitly
  is_nan <- is.nan(x)
  is_missing <- is.na(x)

  # Keep only ordinary numbers (drops NA, NaN, Inf and -Inf)
  usable <- x[is.finite(x) & !(is_missing | is_nan)]

  tally <- list(
    total = length(x),
    null = FALSE,
    na = sum(is_missing & !is_nan),
    nan = sum(is_nan),
    inf = sum(is.infinite(x)),
    valid = length(usable)
  )

  list(
    mean = if (length(usable) > 0) mean(usable) else NA,
    diagnostics = tally
  )
}
# Test
robust_mean(c(1, 2, NA, NaN, Inf, 5, -Inf))
#> $mean
#> [1] 2.666667
#>
#> $diagnostics
#> $diagnostics$total
#> [1] 7
#>
#> $diagnostics$null
#> [1] FALSE
#>
#> $diagnostics$na
#> [1] 1
#>
#> $diagnostics$nan
#> [1] 1
#>
#> $diagnostics$inf
#> [1] 2
#>
#> $diagnostics$valid
#> [1] 3
Exercise 4:
# Raw survey responses; sentinel codes stand in for missing answers
survey <- data.frame(
  age = c(25, NA, 35, -999, 40),
  income = c(50000, 75000, 0, 80000, NA),
  satisfaction = c(5, 3, NA, 4, 99)
)

# Work on a copy so the raw responses stay untouched
clean_survey <- survey

# Recode each sentinel as NA. %in% never yields NA, so rows that are already
# missing are simply left as they are.
is.na(clean_survey$age) <- clean_survey$age %in% -999
is.na(clean_survey$income) <- clean_survey$income %in% 0
is.na(clean_survey$satisfaction) <- clean_survey$satisfaction %in% 99

# Complete case analysis: keep rows with no NA in any column
has_all_answers <- complete.cases(clean_survey)
clean_survey_complete <- clean_survey[has_all_answers, ]

# Statistics
with(
  clean_survey_complete,
  list(
    n_complete = length(age),
    mean_age = mean(age),
    mean_income = mean(income),
    mean_satisfaction = mean(satisfaction)
  )
)
#> $n_complete
#> [1] 1
#>
#> $mean_age
#> [1] 25
#>
#> $mean_income
#> [1] 50000
#>
#> $mean_satisfaction
#> [1] 5