Chapter 10 Column Manipulation

What You’ll Learn:

  • Adding and removing columns safely
  • Column name handling
  • Type preservation during operations
  • Renaming strategies
  • Common manipulation pitfalls

Key Errors Covered: 12+ column manipulation errors

Difficulty: ⭐ Beginner to ⭐⭐ Intermediate

10.1 Introduction

Working with data frame columns is a daily task, but it’s full of traps:

df <- data.frame(x = 1:5, y = 6:10)
df[, "z"]  # Typo in column name
#> Error in `[.data.frame`(df, , "z"): undefined columns selected
# Or this:
df$new_column <- 1:3  # Wrong length!
#> Error in `$<-.data.frame`(`*tmp*`, new_column, value = 1:3): replacement has 3 rows, data has 5

Let’s master column manipulation to avoid these common errors.

10.2 Column Basics

💡 Key Insight: Data Frame is a List

Understanding this is key to column operations:

df <- data.frame(x = 1:3, y = 4:6, z = 7:9)

# Data frame is a special list
is.list(df)
#> [1] TRUE
length(df)  # Number of columns!
#> [1] 3

# Each column is a list element
df[[1]]     # First column (vector)
#> [1] 1 2 3
df[1]       # First column (data frame)
#>   x
#> 1 1
#> 2 2
#> 3 3

# Three ways to access columns:
df$x        # Dollar sign
#> [1] 1 2 3
df[["x"]]   # Double bracket
#> [1] 1 2 3
df["x"]     # Single bracket (returns data frame)
#>   x
#> 1 1
#> 2 2
#> 3 3

# Column names
names(df)
#> [1] "x" "y" "z"
colnames(df)
#> [1] "x" "y" "z"

10.3 Error #1: undefined columns selected

⭐ BEGINNER 📏 DIMENSION

10.3.1 The Error

df <- data.frame(age = c(25, 30, 35), name = c("Alice", "Bob", "Charlie"))
df[, "salary"]  # Column doesn't exist
#> Error in `[.data.frame`(df, , "salary"): undefined columns selected

🔴 ERROR

Error in `[.data.frame`(df, , "salary") : undefined columns selected

10.3.2 What It Means

You’re trying to select a column that doesn’t exist in the data frame.

10.3.3 Common Causes

10.3.3.1 Cause 1: Typo

df <- data.frame(temperature = c(20, 25, 30))

# Typo
df[, "tempurature"]
#> Error in `[.data.frame`(df, , "tempurature"): undefined columns selected

10.3.3.2 Cause 2: Case Sensitivity

df <- data.frame(Name = c("Alice", "Bob"))

# Wrong case
df[, "name"]  # It's "Name" not "name"
#> Error in `[.data.frame`(df, , "name"): undefined columns selected

10.3.3.3 Cause 3: Column Doesn’t Exist Yet

df <- data.frame(x = 1:5)

# Trying to select before creating
df[, c("x", "y")]  # "y" doesn't exist
#> Error in `[.data.frame`(df, , c("x", "y")): undefined columns selected

10.3.3.4 Cause 4: After Transformation

df <- data.frame(x = 1:5, y = 6:10, z = 11:15)

# Select some columns
df_subset <- df[, c("x", "y")]

# Try to access original column
df_subset[, "z"]  # No longer exists
#> Error in `[.data.frame`(df_subset, , "z"): undefined columns selected

10.3.4 Solutions

SOLUTION 1: Check Column Exists

df <- data.frame(age = c(25, 30, 35), name = c("Alice", "Bob", "Charlie"))

# Check before accessing
if ("salary" %in% names(df)) {
  df[, "salary"]
} else {
  message("Column 'salary' not found")
  NULL
}
#> Column 'salary' not found
#> NULL

# Or for multiple columns
cols_wanted <- c("age", "salary", "name")
cols_available <- cols_wanted[cols_wanted %in% names(df)]
df[, cols_available]
#>   age    name
#> 1  25   Alice
#> 2  30     Bob
#> 3  35 Charlie

SOLUTION 2: Use dplyr’s select() with Helpers

library(dplyr)

df <- data.frame(age = c(25, 30, 35), name = c("Alice", "Bob", "Charlie"))

# NOTE: select() is called as dplyr::select() on purpose. If another attached
# package also exports a function named select(), the one attached last masks
# the other and the call fails with "unused argument (any_of(...))".

# Select only existing columns: any_of() silently skips names that are absent
df %>% dplyr::select(any_of(c("age", "salary", "name")))
#>   age    name
#> 1  25   Alice
#> 2  30     Bob
#> 3  35 Charlie

# Or with error on missing: all_of() is strict, so this call stops with an
# error reporting that `salary` does not exist
df %>% dplyr::select(all_of(c("age", "salary")))  # Errors on missing

SOLUTION 3: Safe Column Selection Function

#' Select columns by name, tolerating names that are not present.
#'
#' @param df A data frame.
#' @param cols Character vector of column names to keep, in the desired order.
#' @param warn If `TRUE` (default), warn once, listing every requested name
#'   that is not a column of `df`.
#' @return A data frame holding the requested columns that exist in `df`.
#'   When none of them exist the result has zero columns but still
#'   `nrow(df)` rows, so the row count of the input is never lost.
safe_select <- function(df, cols, warn = TRUE) {
  is_present <- cols %in% names(df)
  # Named `absent` rather than `missing` to avoid shadowing base::missing().
  absent <- cols[!is_present]

  if (length(absent) > 0 && warn) {
    warning("Columns not found: ", paste(absent, collapse = ", "))
  }

  # drop = FALSE keeps a data frame for a single match, and an empty selection
  # yields a zero-column data frame with the original rows, so no special case
  # is needed for "nothing matched".
  df[, cols[is_present], drop = FALSE]
}

# Test
df <- data.frame(x = 1:5, y = 6:10)
safe_select(df, c("x", "z", "y"))
#> Warning in safe_select(df, c("x", "z", "y")): Columns not found: z
#>   x  y
#> 1 1  6
#> 2 2  7
#> 3 3  8
#> 4 4  9
#> 5 5 10

10.4 Error #2: replacement has X rows, data has Y

⭐ BEGINNER 📏 DIMENSION

10.4.1 The Error

df <- data.frame(x = 1:5, y = 6:10)
df$z <- 1:3  # Wrong length!
#> Error in `$<-.data.frame`(`*tmp*`, z, value = 1:3): replacement has 3 rows, data has 5

🔴 ERROR

Error in `$<-.data.frame`(`*tmp*`, z, value = 1:3) : 
  replacement has 3 rows, data has 5

10.4.2 What It Means

When adding/replacing a column, the new values must match the number of rows (or be length 1).

10.4.3 The Recycling Rule for Columns

df <- data.frame(x = 1:5)

# Length 1: recycles
df$y <- 10
df
#>   x  y
#> 1 1 10
#> 2 2 10
#> 3 3 10
#> 4 4 10
#> 5 5 10

# Same length: works
df$z <- 11:15
df
#>   x  y  z
#> 1 1 10 11
#> 2 2 10 12
#> 3 3 10 13
#> 4 4 10 14
#> 5 5 10 15
# Wrong length: errors
df$w <- 1:3
#> Error in `$<-.data.frame`(`*tmp*`, w, value = 1:3): replacement has 3 rows, data has 5

10.4.4 Common Causes

10.4.4.1 Cause 1: Calculation Resulted in Wrong Length

df <- data.frame(id = 1:10, value = rnorm(10))

# Filter creates shorter vector
high_values <- df$value[df$value > 0]  # Maybe 6 elements

# Try to add back
df$high <- high_values  # Error! 6 vs 10
#> Error in `$<-.data.frame`(`*tmp*`, high, value = c(1.77950290977515, 0.286424419628825, : replacement has 7 rows, data has 10

10.4.4.2 Cause 2: Using Summary on Column

df <- data.frame(
  group = rep(c("A", "B"), each = 5),
  value = 1:10
)

# Calculate group means (2 values)
group_means <- tapply(df$value, df$group, mean)

# Try to add as column
df$mean <- group_means  # Error! 2 vs 10

10.4.4.3 Cause 3: After Subsetting

df <- data.frame(x = 1:10, y = 11:20)

# Subset rows
df_sub <- df[1:5, ]

# Create column for full df
new_col <- 1:5

# Try to add to original
df$new <- new_col  # Error! 5 vs 10

10.4.5 Solutions

SOLUTION 1: Match Lengths

df <- data.frame(id = 1:10, value = rnorm(10))
high_values <- df$value[df$value > 0]

# Option A: Use NA for missing
df$high <- NA
df$high[df$value > 0] <- high_values
df
#>    id       value       high
#> 1   1 -0.19051680         NA
#> 2   2  0.37842390 0.37842390
#> 3   3  0.30003855 0.30003855
#> 4   4 -1.00563626         NA
#> 5   5  0.01925927 0.01925927
#> 6   6 -1.07742065         NA
#> 7   7  0.71270333 0.71270333
#> 8   8  1.08477509 1.08477509
#> 9   9 -2.22498770         NA
#> 10 10  1.23569346 1.23569346

# Option B: Use ifelse
df$high <- ifelse(df$value > 0, df$value, NA)

SOLUTION 2: Use Merge for Aggregates

df <- data.frame(
  group = rep(c("A", "B"), each = 5),
  value = 1:10
)

# Calculate group means
group_summary <- aggregate(value ~ group, df, mean)
names(group_summary)[2] <- "group_mean"

# Merge back
df <- merge(df, group_summary, by = "group")
df
#>    group value group_mean
#> 1      A     1          3
#> 2      A     2          3
#> 3      A     3          3
#> 4      A     4          3
#> 5      A     5          3
#> 6      B     6          8
#> 7      B     7          8
#> 8      B     8          8
#> 9      B     9          8
#> 10     B    10          8

SOLUTION 3: Use dplyr (Cleaner)

library(dplyr)

df <- data.frame(
  group = rep(c("A", "B"), each = 5),
  value = 1:10
)

# Add group mean to each row
df <- df %>%
  group_by(group) %>%
  mutate(group_mean = mean(value)) %>%
  ungroup()

df
#> # A tibble: 10 × 3
#>    group value group_mean
#>    <chr> <int>      <dbl>
#>  1 A         1          3
#>  2 A         2          3
#>  3 A         3          3
#>  4 A         4          3
#>  5 A         5          3
#>  6 B         6          8
#>  7 B         7          8
#>  8 B         8          8
#>  9 B         9          8
#> 10 B        10          8

10.5 Error #3: duplicate column names

⭐ BEGINNER 🔤 SYNTAX

10.5.1 The Warning/Problem

df <- data.frame(x = 1:3, x = 4:6, check.names = FALSE)
names(df)
#> [1] "x" "x"

R allows duplicate column names when you pass check.names = FALSE (no warning is issued), but they cause problems:

# Which x?
df$x  # Gets first one
#> [1] 1 2 3

# Confusion!
df[, "x"]  # Gets first one
#> [1] 1 2 3

10.5.2 Why It’s Dangerous

# Create with duplicates
df <- data.frame(value = 1:3, value = 4:6, check.names = FALSE)

# Operations become unpredictable
df$value <- df$value * 2  # Which one gets modified?
df
#>   value value
#> 1     2     4
#> 2     4     5
#> 3     6     6

# Selection is confusing
df[, c("value", "value")]  # Gets same column twice
#>   value value.1
#> 1     2       2
#> 2     4       4
#> 3     6       6

10.5.3 Solutions

SOLUTION 1: Let R Fix Names

# Default: R makes names unique
df <- data.frame(x = 1:3, x = 4:6)  # check.names = TRUE by default
names(df)  # "x" and "x.1"
#> [1] "x"   "x.1"

# Or manually
names_original <- c("value", "value", "score")
names_fixed <- make.names(names_original, unique = TRUE)
names_fixed
#> [1] "value"   "value.1" "score"

SOLUTION 2: Check and Fix Names

#' Make repeated column names unique.
#'
#' @param df A data frame.
#' @return `df` unchanged when its names are already unique. Otherwise `df`
#'   with its names passed through `make.names(unique = TRUE)`, after a
#'   warning that lists each repeated name once.
fix_duplicate_names <- function(df) {
  current <- names(df)
  is_repeat <- duplicated(current)

  # Nothing repeated: hand the input straight back.
  if (!any(is_repeat)) {
    return(df)
  }

  repeated <- unique(current[is_repeat])
  warning("Duplicate column names found: ",
          paste(repeated, collapse = ", "))

  names(df) <- make.names(current, unique = TRUE)
  df
}

# Test
df <- data.frame(x = 1:3, x = 4:6, check.names = FALSE)
df <- fix_duplicate_names(df)
#> Warning in fix_duplicate_names(df): Duplicate column names found: x
names(df)
#> [1] "x"   "x.1"

SOLUTION 3: Prevent Duplicates

#' Add a new column, refusing to overwrite an existing one.
#'
#' @param df A data frame.
#' @param name Name of the column to create.
#' @param values Values to store in the new column.
#' @return `df` with `values` stored under `name`.
#' Stops with an error if `df` already has a column called `name`.
safe_add_column <- function(df, name, values) {
  taken <- names(df)

  # Happy path first: the name is free, so assign and return.
  if (!is.element(name, taken)) {
    df[[name]] <- values
    return(df)
  }

  stop("Column '", name, "' already exists. ",
       "Use a different name or remove the existing column first.")
}

# Test
df <- data.frame(x = 1:3)
df <- safe_add_column(df, "y", 4:6)  # Works
df <- safe_add_column(df, "x", 7:9)  # Errors
#> Error in safe_add_column(df, "x", 7:9): Column 'x' already exists. Use a different name or remove the existing column first.

10.6 Error #4: names attribute must be same length as vector

⭐ BEGINNER 📏 DIMENSION

10.6.1 The Error

df <- data.frame(x = 1:5, y = 6:10, z = 11:15)
names(df) <- c("a", "b")  # Only 2 names for 3 columns!

🔴 ERROR

Error in names(df) <- c("a", "b") : 
  'names' attribute [2] must be the same length as the vector [3]

10.6.2 What It Means

When setting column names, you must provide exactly one name per column.

10.6.3 Common Causes

10.6.3.1 Cause 1: Wrong Count

df <- data.frame(x = 1:3, y = 4:6, z = 7:9)

# Too few
names(df) <- c("first", "second")

# Too many
names(df) <- c("first", "second", "third", "fourth")
#> Error in names(df) <- c("first", "second", "third", "fourth"): 'names' attribute [4] must be the same length as the vector [3]

10.6.3.2 Cause 2: After Adding Columns

df <- data.frame(x = 1:3, y = 4:6)
new_names <- c("a", "b")

# Add a column
df$z <- 7:9

# Try to use old names
names(df) <- new_names  # Error! Now 3 columns

10.6.3.3 Cause 3: From External Source

df <- data.frame(matrix(1:12, nrow = 3, ncol = 4))
column_labels <- c("ID", "Value")  # Wrong number

names(df) <- column_labels

10.6.4 Solutions

SOLUTION 1: Match Number of Names

df <- data.frame(x = 1:3, y = 4:6, z = 7:9)

# Provide all names
names(df) <- c("first", "second", "third")

# Or rename specific columns
names(df)[1] <- "id"
names(df)[3] <- "score"
names(df)
#> [1] "id"     "second" "score"

SOLUTION 2: Rename Specific Columns by Name

library(dplyr)

df <- data.frame(x = 1:3, y = 4:6, z = 7:9)

# Rename specific columns
df <- df %>% rename(id = x, score = z)
names(df)
#> [1] "id"    "y"     "score"

# Or base R
names(df)[names(df) == "y"] <- "value"
names(df)
#> [1] "id"    "value" "score"

SOLUTION 3: Safe Rename Function

#' Rename columns one at a time, skipping any rename that cannot be applied.
#'
#' @param df A data frame.
#' @param ... Renames written as `old_name = "new_name"`.
#' @return `df` with every applicable rename performed, in the order given.
#'   A rename is skipped with a warning when the old name is not a column of
#'   `df`, or when the new name is already used by a different column.
safe_rename <- function(df, ...) {
  renames <- list(...)

  for (from in names(renames)) {
    to <- renames[[from]]
    present <- names(df)

    if (!(from %in% present)) {
      warning("Column '", from, "' not found, skipping")
    } else if (to %in% present && to != from) {
      warning("Column '", to, "' already exists, skipping")
    } else {
      names(df)[present == from] <- to
    }
  }

  df
}

# Test
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
df <- safe_rename(df, x = "id", z = "score", w = "missing")
#> Warning in safe_rename(df, x = "id", z = "score", w = "missing"): Column 'w'
#> not found, skipping
names(df)
#> [1] "id"    "y"     "score"

10.7 Removing Columns

🎯 Best Practice: Removing Columns

df <- data.frame(x = 1:3, y = 4:6, z = 7:9)

# Method 1: Set to NULL
df$y <- NULL
df
#>   x z
#> 1 1 7
#> 2 2 8
#> 3 3 9

# Method 2: Subset (keep what you want)
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
df <- df[, c("x", "z")]
df
#>   x z
#> 1 1 7
#> 2 2 8
#> 3 3 9

# Method 3: Subset (exclude what you don't want)
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
df <- df[, !names(df) %in% c("y")]
df
#>   x z
#> 1 1 7
#> 2 2 8
#> 3 3 9

# Method 4: dplyr select with minus
# select() is written as dplyr::select() because another attached package that
# also exports select() would mask it and fail with "unused argument (-y)",
# leaving df unchanged.
library(dplyr)
df <- data.frame(x = 1:3, y = 4:6, z = 7:9)
df <- df %>% dplyr::select(-y)
df
#>   x z
#> 1 1 7
#> 2 2 8
#> 3 3 9

# Method 5: Remove multiple
df <- data.frame(x = 1:3, y = 4:6, z = 7:9, w = 10:12)
df <- df %>% dplyr::select(-c(y, w))
df
#>   x z
#> 1 1 7
#> 2 2 8
#> 3 3 9

Never do this:

# ❌ Bad: modifies in place
df[, "y"] <- NULL  # Doesn't work as expected!

# ✅ Good: explicit assignment
df$y <- NULL

10.8 Column Reordering

💡 Key Insight: Reordering Columns

df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
names(df)
#> [1] "z" "x" "y"

# Method 1: Specify order explicitly
df <- df[, c("x", "y", "z")]
names(df)
#> [1] "x" "y" "z"

# Method 2: Sort alphabetically
df <- df[, sort(names(df))]
names(df)
#> [1] "x" "y" "z"

# Method 3: Move specific columns first
df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
df <- df[, c("x", setdiff(names(df), "x"))]
names(df)
#> [1] "x" "z" "y"

# Method 4: dplyr relocate
library(dplyr)
df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
df <- df %>% relocate(x, y, z)
names(df)
#> [1] "x" "y" "z"

# Or move to front/end
df <- data.frame(z = 7:9, x = 1:3, y = 4:6)
df <- df %>% relocate(x, .before = everything())
df %>% relocate(z, .after = everything())
#>   x y z
#> 1 1 4 7
#> 2 2 5 8
#> 3 3 6 9

10.9 Type Preservation

⚠️ Common Pitfall: Type Changes

# Start with a data frame whose character column is stored as a factor
df <- data.frame(
  id = 1:3,
  category = c("A", "B", "C"),
  stringsAsFactors = TRUE
)
class(df$category)  # "factor"
#> [1] "factor"

# Selecting ONE column with [ , ] drops the data frame structure:
# the result is the bare column (here a factor), not a one-column data frame
df_subset <- df[, "category"]
class(df_subset)  # "factor" - the column itself, no longer a data frame
#> [1] "factor"

# $ always extracts the bare column, so it gives the same result
vec <- df$category
class(vec)  # "factor"
#> [1] "factor"

# Spelling out drop = TRUE gives the same result as the call without it above
df_subset <- df[, "category", drop = TRUE]
class(df_subset)  # "factor" - structure dropped, bare column returned
#> [1] "factor"

# With drop = FALSE the one-column selection keeps its data frame structure
df_subset <- df[, "category", drop = FALSE]
class(df_subset)  # "data.frame" - stays data frame
#> [1] "data.frame"

Best practice:

# Use drop = FALSE when you want to keep data frame structure
df[, "category", drop = FALSE]
#>   category
#> 1        A
#> 2        B
#> 3        C

# Or use $ when you explicitly want a vector
df$category
#> [1] A B C
#> Levels: A B C

10.10 Adding Multiple Columns

🎯 Best Practice: Adding Multiple Columns

df <- data.frame(x = 1:5)

# Method 1: One at a time
df$y <- 6:10
df$z <- 11:15

# Method 2: cbind
df <- data.frame(x = 1:5)
df <- cbind(df, data.frame(y = 6:10, z = 11:15))

# Method 3: dplyr mutate
library(dplyr)
df <- data.frame(x = 1:5)
df <- df %>%
  mutate(
    y = x + 5,
    z = y + 5
  )

# Method 4: Transform base R
df <- data.frame(x = 1:5)
# transform() evaluates every expression against the ORIGINAL columns, so a
# new column cannot refer to another column created in the same call.
# Writing `z = y + 5` here would either fail ("object 'y' not found") or pick
# up an unrelated `y` from the calling environment and produce a confusing
# "differing number of rows" error. Derive z from x instead.
df <- transform(df,
  y = x + 5,
  z = x + 10  # same values as y + 5, expressed via an existing column
)

# Method 5: within
df <- data.frame(x = 1:5)
df <- within(df, {
  y <- x + 5
  z <- y + 5
})

10.11 Summary

Key Takeaways:

  1. Check column exists before accessing with %in% names()
  2. Match row count when adding columns (or use length 1)
  3. Avoid duplicate names - check with anyDuplicated()
  4. Provide all names when renaming - one per column
  5. Use drop = FALSE to preserve data frame structure
  6. Setting a column to NULL with $ (df$col <- NULL) removes it cleanly
  7. dplyr is clearer for complex column operations

Quick Reference:

Error Cause Fix
undefined columns selected Column doesn’t exist Check with %in% names()
replacement has X rows Wrong length column Match nrows or use length 1
duplicate column names Non-unique names Use make.names(unique=TRUE)
names attribute wrong length Wrong # of names Provide one per column

Column Operations Checklist:

# Before accessing:
"colname" %in% names(df)      # Check exists
anyDuplicated(names(df))      # Check no duplicates

# When adding column:
length(new_values) == nrow(df) || length(new_values) == 1

# When renaming:
length(new_names) == ncol(df)
!anyDuplicated(new_names)

# Safe patterns:
df$col <- NULL                # Remove column
df[, cols, drop = FALSE]      # Keep as data frame

Best Practices:

# ✅ Good
df %>% select(any_of(c("a", "b", "c")))  # Safe selection
df %>% mutate(new = old * 2)             # Add column
names(df)[names(df) == "old"] <- "new"   # Rename one

# ❌ Avoid
df[, "missing_col"]           # No check
df$new <- wrong_length_vector # No validation
names(df) <- c("a", "b")      # Partial names

10.12 Exercises

📝 Exercise 1: Safe Column Access

Write a function that safely gets a column:

  • Returns the column if it exists
  • Returns default value if it doesn’t
  • Warns user about missing columns
  • Handles both $ and [[ ]] style access

📝 Exercise 2: Batch Rename

You have:

df <- data.frame(
  old_name_1 = 1:5,
  old_name_2 = 6:10,
  old_name_3 = 11:15
)

Write a function to rename all columns matching a pattern.

📝 Exercise 3: Safe Column Addition

Write add_column(df, name, values) that:

  1. Checks if name already exists
  2. Validates values length
  3. Handles recycling appropriately
  4. Returns modified data frame
  5. Gives informative errors

📝 Exercise 4: Column Audit

Write a function that audits a data frame and reports:

  • Missing column names
  • Duplicate column names
  • Invalid column names (non-syntactic)
  • Columns with NA names

10.13 Exercise Answers

Click to see answers

Exercise 1:

#' Fetch a column by name, falling back to a default when it is absent.
#'
#' @param df A data frame.
#' @param col Name of the column to fetch.
#' @param default Value returned when `col` is not a column of `df`
#'   (`NULL` by default).
#' @param warn If `TRUE` (default), warn when `col` is not found.
#' @return The column `df[[col]]` if it exists, otherwise `default`.
safe_get_column <- function(df, col, default = NULL, warn = TRUE) {
  # Found: return the column itself.
  if (col %in% names(df)) {
    return(df[[col]])
  }

  if (warn) {
    warning("Column '", col, "' not found in data frame")
  }
  default
}

# Test
df <- data.frame(x = 1:5, y = 6:10)
safe_get_column(df, "x")         # Returns column
#> [1] 1 2 3 4 5
safe_get_column(df, "z")         # Returns NULL with warning
#> Warning in safe_get_column(df, "z"): Column 'z' not found in data frame
#> NULL
safe_get_column(df, "z", default = NA, warn = FALSE)
#> [1] NA

Exercise 2:

#' Rename every column whose name matches a regular expression.
#'
#' @param df A data frame.
#' @param pattern Regular expression passed to `gsub()`.
#' @param replacement Replacement text passed to `gsub()`.
#' @return `df` with the substitution applied to its column names. If no name
#'   changes, `df` is returned as-is after a message. If the substitution
#'   produces repeated names, a warning is raised and they are made unique
#'   with `make.unique()`. Each rename that took effect is reported via
#'   `message()`.
rename_pattern <- function(df, pattern, replacement) {
  before <- names(df)
  after <- gsub(pattern, replacement, before)

  if (identical(before, after)) {
    message("No columns matched pattern '", pattern, "'")
    return(df)
  }

  # The substitution may collapse distinct names onto the same string.
  if (anyDuplicated(after) > 0) {
    warning("Renaming would create duplicate names, using make.unique()")
    after <- make.unique(after)
  }

  names(df) <- after

  # De-duplication can undo a change, so re-compare before reporting.
  differs <- before != after
  if (!any(differs)) {
    return(df)
  }

  message("Renamed ", sum(differs), " columns:")
  for (idx in which(differs)) {
    message("  ", before[idx], " -> ", after[idx])
  }

  df
}

# Test
df <- data.frame(
  old_name_1 = 1:5,
  old_name_2 = 6:10,
  old_name_3 = 11:15
)
df <- rename_pattern(df, "old_name_", "new_col_")
#> Renamed 3 columns:
#>   old_name_1 -> new_col_1
#>   old_name_2 -> new_col_2
#>   old_name_3 -> new_col_3
names(df)
#> [1] "new_col_1" "new_col_2" "new_col_3"

Exercise 3:

#' Add (or optionally replace) a column with explicit length checking.
#'
#' @param df A data frame.
#' @param name Name of the column to add.
#' @param values Values for the column. Accepted lengths are `nrow(df)`,
#'   1 (recycled), or a non-zero length that divides `nrow(df)` evenly
#'   (recycled).
#' @param overwrite If `FALSE` (default), an existing column called `name`
#'   is an error; if `TRUE` it is replaced.
#' @return `df` with the column `name` set to `values`.
#' Stops with an error when `name` already exists and `overwrite` is `FALSE`,
#' or when the length of `values` cannot be matched to the row count.
add_column <- function(df, name, values, overwrite = FALSE) {
  # Check if name exists
  if (name %in% names(df) && !overwrite) {
    stop("Column '", name, "' already exists. ",
         "Use overwrite = TRUE to replace.")
  }

  # Check length
  n_rows <- nrow(df)
  n_values <- length(values)

  if (n_values == n_rows) {
    # Perfect match
    df[[name]] <- values
  } else if (n_values == 1) {
    # Recycle single value
    message("Recycling single value to ", n_rows, " rows")
    df[[name]] <- values
  } else if (n_values > 0 && n_rows %% n_values == 0) {
    # Multiple recycling. The n_values > 0 guard matters: `n_rows %% 0` is NA,
    # which would make `if` fail with "missing value where TRUE/FALSE needed"
    # instead of reaching the informative length-mismatch error below.
    message("Recycling ", n_values, " values to ", n_rows, " rows")
    df[[name]] <- rep(values, length.out = n_rows)
  } else {
    stop("Length mismatch: values has ", n_values,
         " elements but data frame has ", n_rows, " rows")
  }

  df
}

# Test
df <- data.frame(x = 1:5)
df <- add_column(df, "y", 10)           # Recycles
#> Recycling single value to 5 rows
df <- add_column(df, "z", 11:15)        # Matches
df <- add_column(df, "w", 1:3)          # Errors
#> Error in add_column(df, "w", 1:3): Length mismatch: values has 3 elements but data frame has 5 rows

Exercise 4:

#' Audit the column names of a data frame and report problems.
#'
#' Checks for missing names (`NA` or `""`), repeated names, and names that
#' `make.names()` would alter (non-syntactic names). Findings are reported
#' with `message()`.
#'
#' @param df A data frame.
#' @return Invisibly, `TRUE` when no problem is found; otherwise a list with
#'   any of the elements `missing` (positions of missing names),
#'   `duplicates` (each repeated name once) and `invalid` (names that are
#'   not syntactically valid).
audit_columns <- function(df) {
  col_names <- names(df)
  issues <- list()

  # Check for missing names. `TRUE | NA` is TRUE, so NA names are caught here
  # without introducing NA into the mask.
  is_missing <- is.na(col_names) | col_names == ""
  if (any(is_missing)) {
    issues$missing <- which(is_missing)
  }

  # Check for duplicates
  if (anyDuplicated(col_names) > 0) {
    issues$duplicates <- unique(col_names[duplicated(col_names)])
  }

  # Check for invalid names (non-syntactic). Comparing against an NA name
  # yields NA, which would make the `if` below fail with "missing value where
  # TRUE/FALSE needed"; an NA name is not a valid name, so count it as invalid.
  valid <- make.names(col_names) == col_names
  valid[is.na(valid)] <- FALSE
  if (!all(valid)) {
    issues$invalid <- col_names[!valid]
  }

  # Report
  if (length(issues) == 0) {
    message("✓ All column names are valid")
    return(invisible(TRUE))
  }

  message("Column name issues found:")

  if (!is.null(issues$missing)) {
    message("  Missing names at positions: ",
            paste(issues$missing, collapse = ", "))
  }

  if (!is.null(issues$duplicates)) {
    message("  Duplicate names: ",
            paste(issues$duplicates, collapse = ", "))
  }

  if (!is.null(issues$invalid)) {
    message("  Invalid names: ",
            paste(issues$invalid, collapse = ", "))
    message("  Suggested: ",
            paste(make.names(issues$invalid), collapse = ", "))
  }

  invisible(issues)
}

# Test
df_good <- data.frame(x = 1:3, y = 4:6)
audit_columns(df_good)
#> ✓ All column names are valid

df_bad <- data.frame(x = 1:3, x = 4:6, `2bad` = 7:9, 
                     check.names = FALSE)
audit_columns(df_bad)
#> Column name issues found:
#>   Duplicate names: x
#>   Invalid names: 2bad
#>   Suggested: X2bad