Learn R – Part 4

R provides built-in functions for statistical analysis:

summary(): Summary statistics (min, max, quartiles, median, mean).

sum(): Total of values.

range(): Minimum and maximum.

var(): Variance.

sd(): Standard deviation.

Demo Data
# Basic dataset: the even numbers from 2 to 10
data_basic <- seq(2, 10, by = 2)
# Advanced dataset: load the built-in mtcars data frame
data("mtcars")

# Miles-per-gallon column as a plain numeric vector
mpg <- mtcars[["mpg"]]
Practice
# BASIC TASKS  
# HW1: Calculate the sum of data_basic  
# HW2: Find the range (min and max) of data_basic  
# HW3: Compute the variance of data_basic  

# ADVANCED TASKS  
# HW4: Calculate the standard deviation of mtcars$mpg  

# HW5: Generate a summary of mtcars$hp (horsepower)  
Solution
R
# BASIC SOLUTIONS
# HW1: total of all values
sum_basic <- sum(data_basic)
# HW2: two-element vector holding the minimum and the maximum
range_basic <- range(data_basic)
# HW3: sample variance
var_basic <- var(x = data_basic)

# ADVANCED SOLUTIONS
# HW4: standard deviation of miles per gallon
sd_mpg <- sd(x = mpg)
# HW5: summary statistics for horsepower
summary_hp <- summary(mtcars[["hp"]])

Mean : Average (mean()).

Median : Middle value (median()).

Mode : Most frequent value (no built-in function—custom code required; note that R's built-in mode() returns an object's storage type, not the statistical mode).

Demo Data
# Basic dataset: the values 1..5 where 2 appears twice and 5 three times
data_numbers <- rep(c(1, 2, 3, 4, 5), times = c(1, 2, 1, 1, 3))
# Advanced dataset: load the built-in iris data frame
data("iris")
# Sepal length column as a plain numeric vector
sepal_length <- iris[["Sepal.Length"]]
Practice
# BASIC TASKS  
# HW1: Calculate the mean of data_numbers  
# HW2: Find the median of data_numbers  
# HW3: Write a function to compute the mode  

# ADVANCED TASKS  
# HW4: Compute the mean of iris$Sepal.Length  

# HW5: Find the median of iris$Petal.Length grouped by Species  
Solution
R
# BASIC SOLUTIONS
# HW1: arithmetic mean
mean_val <- mean(x = data_numbers)
# HW2: middle value of the sorted data
median_val <- median(x = data_numbers)
#' Most frequent value of a vector (HW3)
#'
#' @param x An atomic vector.
#' @return The value of `x` that occurs most often. When several values tie,
#'   the one that appears first in `x` is returned.
mode_func <- function(x) {
  distinct_vals <- unique(x)
  # Map each element to the position of its value in `distinct_vals`,
  # then count how many elements landed on each position.
  counts <- tabulate(match(x, distinct_vals))
  distinct_vals[which.max(counts)]
}
# Apply the mode helper to the basic dataset
mode_val <- mode_func(x = data_numbers)

# ADVANCED SOLUTIONS
# HW4: mean sepal length
mean_sepal <- mean(x = sepal_length)
# HW5: median petal length per species (one row per species)
median_petal <- aggregate(Petal.Length ~ Species, data = iris, FUN = median)

max()/min(): Extreme values.

quantile(): Percentiles (e.g., 25th, 50th).

IQR(): Interquartile range.

Demo Data
# Basic dataset: eight scores
data_scores <- c(
  45, 67, 89, 34,
  56, 78, 90, 23
)
# Advanced dataset: load the built-in airquality data frame
data("airquality")

# Temperature column as a plain vector
temp <- airquality[["Temp"]]
Practice
# BASIC TASKS  
# HW1: Find the max and min of data_scores  
# HW2: Calculate the 75th percentile of data_scores  
# HW3: Compute the IQR of data_scores  

# ADVANCED TASKS  
# HW4: Find the 90th percentile of airquality$Temp  

# HW5: Identify outliers in airquality$Ozone using IQR  
Solution
R
# BASIC SOLUTIONS
# HW1: largest and smallest score
max_score <- max(data_scores)
min_score <- min(data_scores)
# HW2: 75th percentile
percentile_75 <- quantile(data_scores, probs = 0.75)
# HW3: interquartile range
iqr_score <- IQR(x = data_scores)

# ADVANCED SOLUTIONS
# HW4: 90th percentile of temperature
percentile_90 <- quantile(temp, probs = 0.90)
# Outlier detection (IQR method)
# Ozone contains missing readings; quantile() and IQR() stop with an error on
# NA input unless na.rm = TRUE, so the NAs are dropped for the statistics.
q1 <- quantile(airquality$Ozone, 0.25, na.rm = TRUE)
q3 <- quantile(airquality$Ozone, 0.75, na.rm = TRUE)
iqr <- IQR(airquality$Ozone, na.rm = TRUE)
# Fences: values more than 1.5 * IQR below Q1 or above Q3 are flagged.
lower_fence <- q1 - 1.5 * iqr
upper_fence <- q3 + 1.5 * iqr
is_outlier <- airquality$Ozone < lower_fence | airquality$Ozone > upper_fence
# which() keeps only the TRUE positions, so missing readings (whose comparison
# is NA) are not reported as outliers.
outliers <- airquality$Ozone[which(is_outlier)]

Perform t-tests (t.test()), ANOVA (aov()), and chi-square tests (chisq.test()) to compare groups.

Demo Data
# Sample data: two independent groups of five measurements each
group_a <- c(20, 22, 19, 18, 24)

group_b <- c(25, 24, 22, 23, 20)
Practice
# HW1: Perform an independent t-test between group_a and group_b  

# HW2: Run a one-way ANOVA on `mtcars` to compare `mpg` across cylinder groups  
Solution
R
# HW1: independent two-sample t-test comparing the two groups
t.test(x = group_a, y = group_b)
# HW2: one-way ANOVA of mpg across cylinder groups
# mpg values partitioned by cylinder count (handy for inspecting each group;
# the model below does not use it)
cyl_groups <- split(x = mtcars$mpg, f = mtcars$cyl)
# factor() makes cyl a grouping variable in the model formula
anova_result <- aov(formula = mpg ~ factor(cyl), data = mtcars)
summary(anova_result)

Fit linear (lm()) and logistic (glm() with family = binomial) regression models. Use summary() to interpret coefficients and p-values.

Demo Data
# Use `mtcars` for linear regression  
Practice
# HW1: Fit a linear model predicting `mpg` from `wt` and `hp`  

# HW2: Check the R-squared value of the model  
Solution
R
# HW1: linear model predicting mpg from weight and horsepower
model <- lm(formula = mpg ~ wt + hp, data = mtcars)
# HW2: R-squared taken from the model summary
summary(model)[["r.squared"]]

Recode variables with dplyr::mutate() and case_when(). Create new variables using arithmetic/logical operations.

Demo Data
# Sample data: five rows with an age and an income column
df <- data.frame(
  age = c(18, 25, 30, 35, 40),
  income = c(50, 60, 75, 90, 120) * 1000
)
Practice
# HW1: Recode `age` into categories: "<25", "25-35", ">35"  

# HW2: Create a new variable `income_group` (Low: <70k, High: >=70k)  
Solution
R
# HW1: bucket `age` into three labelled bands
library(dplyr)
df <- df %>%
  mutate(
    age_group = case_when(
      age < 25 ~ "<25",
      between(age, 25, 35) ~ "25-35",
      age > 35 ~ ">35"
    )
  )
# HW2: label incomes of 70,000 and above "High", lower incomes "Low"
df <- mutate(df, income_group = ifelse(income >= 70000, "High", "Low"))

Export tables using write.csv(), stargazer, or flextable, and save plots with ggplot2's ggsave().

Practice
# HW1: Save `mtcars` summary to a CSV  

# HW2: Export a ggplot to PNG  
Solution
R
# HW1: write the per-column summary table of mtcars to a CSV file
write.csv(summary(mtcars), "mtcars_summary.csv")
# HW2: save the most recently displayed ggplot as a PNG.
# ggsave() and last_plot() belong to ggplot2, which is not attached anywhere in
# the code shown here, so both are called through the namespace. This works in
# a fresh session as long as ggplot2 is installed.
ggplot2::ggsave("plot.png", plot = ggplot2::last_plot())

Skewness : Measure of asymmetry (moments package).

Kurtosis : Tailedness of the distribution (moments package).

Covariance : cov().

Correlation : cor().

Demo Data
# Advanced dataset: load the built-in cars data frame
data("cars")
# Pull both columns out as plain vectors
speed <- cars[["speed"]]

dist <- cars[["dist"]]
Practice
# HW1: Calculate covariance between speed and distance  
# HW2: Compute correlation between speed and distance  

# HW3: Install the `moments` package and calculate skewness of speed  
Solution
R
# HW1: covariance between speed and stopping distance
covariance <- cov(x = speed, y = dist)
# HW2: correlation between the same two variables
correlation <- cor(x = speed, y = dist)
# HW3: skewness of speed; the moments package must already be installed
library(moments)
skewness_speed <- skewness(x = speed)

Use na.rm = TRUE to ignore NA values in calculations.

Demo Data
# Numeric vector with one missing value in the third position
data_missing <- c(1, 2, NA_real_, 4, 5)
Practice
# HW1: Calculate the mean of data_missing (ignore NA)  

# HW2: Check if data_missing contains any NA values  
Solution
R
# HW1: mean of the observed values only (NA entries are dropped first)
mean_missing <- mean(x = data_missing, na.rm = TRUE)
# HW2: TRUE when at least one element is NA
has_na <- anyNA(x = data_missing)

Leave a Reply

Your email address will not be published. Required fields are marked *