library(ggplot2)

Case Study 1: Simple linear regression

We first use the `lm` function to fit a linear regression line.

x <- seq(1, 10, 0.5)
set.seed(252) # fix the seed in random-number generation
y <- rnorm(length(x), mean = x, sd = 1)

# method 1: `lm` function
lm_fit <- lm(y ~ x)
lm_fit$coefficients[1]
## (Intercept) 
##  -0.1178328
lm_fit$coefficients[2]
##        x 
## 1.072806
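
As an aside, the extractor function `coef` is the idiomatic way to pull out the whole coefficient vector at once; it returns the same two values shown above.

coef(lm_fit) # named vector holding the intercept and the slope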

We can also calculate the estimated regression coefficients from the analytical solution \(\hat{\beta} = (X^T X)^{-1} X^T Y\).

# design matrix
X <- cbind(rep(1, length(x)), x)
y <- as.matrix(y)

solve(t(X) %*% X) %*% t(X) %*% y # Notice this is the same as the regression coefficients calculated from `lm`.
##         [,1]
##   -0.1178328
## x  1.0728060
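
As a side note, explicitly inverting \(X^T X\) can be numerically unstable. A minimal sketch of two more robust alternatives, both of which give the same coefficients:

# solve the normal equations without forming the explicit inverse
solve(crossprod(X), crossprod(X, y))

# least-squares solve via QR decomposition, closer to what `lm` does internally
qr.solve(X, y)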

1. Summary result of the linear model

summary_fit <- summary(lm_fit)
summary_fit
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34412 -0.41581  0.07099  0.36324  1.31307 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.11783    0.37770  -0.312    0.759    
## x            1.07281    0.06147  17.452 2.74e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7338 on 17 degrees of freedom
## Multiple R-squared:  0.9471, Adjusted R-squared:  0.944 
## F-statistic: 304.6 on 1 and 17 DF,  p-value: 2.741e-12
# two methods to get your coefficient matrix
summary_fit$coefficients # method 1
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -0.1178328 0.37769737 -0.3119769 7.588520e-01
## x            1.0728060 0.06147315 17.4516192 2.741319e-12
coef(summary_fit) # method 2
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -0.1178328 0.37769737 -0.3119769 7.588520e-01
## x            1.0728060 0.06147315 17.4516192 2.741319e-12
class(summary_fit$coefficients)
## [1] "matrix"
class(coef(summary_fit))
## [1] "matrix"
summary_fit$coefficients[1, 3] # t test statistic of the intercept
## [1] -0.3119769
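
Beyond the coefficient matrix, the summary object stores several other components that are often useful; a quick sketch of a few of them:

summary_fit$sigma     # residual standard error (0.7338 above)
summary_fit$r.squared # multiple R-squared (0.9471 above)
summary_fit$df        # degrees of freedom; the second element is the residual df (17)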

2. Confidence intervals (two methods)

# method 1
confint(lm_fit)
##                  2.5 %   97.5 %
## (Intercept) -0.9147046 0.679039
## x            0.9431090 1.202503
# method 2
n <- length(x) 
mean_beta0 <- coef(summary_fit)[1, 1]
se_beta0 <- coef(summary_fit)[1, 2]
mean_beta1 <- coef(summary_fit)[2, 1]
se_beta1 <- coef(summary_fit)[2, 2]
CI_beta0 <- c(mean_beta0 - qt(0.975, df = n - 2) * se_beta0,
              mean_beta0 + qt(0.975, df = n - 2) * se_beta0)
CI_beta1 <- c(mean_beta1 - qt(0.975, df = n - 2) * se_beta1,
              mean_beta1 + qt(0.975, df = n - 2) * se_beta1)
CI_beta0
## [1] -0.9147046  0.6790390
CI_beta1
## [1] 0.943109 1.202503
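
Related to coefficient intervals, `predict` gives confidence intervals for the mean response at new predictor values. A minimal sketch (the two new `x` values are arbitrary illustrations):

# 95% confidence interval for the mean response at x = 2 and x = 5
predict(lm_fit, newdata = data.frame(x = c(2, 5)), interval = "confidence")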

3. Plots

# `y` was converted to a matrix above, so flatten it back to a vector
data_combo <- data.frame(x = x, y = as.vector(y))

ggplot(data_combo, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) # without the confidence band being plotted

ggplot(data_combo, aes(x = x, y = y)) +
  geom_point() +
  geom_smooth(method = "lm") + # with the confidence band plotted
  ggtitle("Strong positive linearity") + # add a plot title
  xlab("this is X-axis") + # add axis titles
  ylab("this is Y-axis") +
  theme_classic() # add a clean theme without grids

# residual plot
ggplot(lm_fit) +
  geom_point(aes(x = x, y = .resid)) +
  geom_hline(aes(yintercept = 0), col = "red") +
  theme_classic() # add a clean theme without grids
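
Besides the residual-vs-predictor plot, a normal Q-Q plot of the residuals is a standard diagnostic. A sketch using the same fortified `lm_fit` object (`.stdresid` is the standardized-residual column supplied by ggplot2's `fortify` method for `lm` objects):

# normal Q-Q plot of the standardized residuals
ggplot(lm_fit, aes(sample = .stdresid)) +
  stat_qq() +
  stat_qq_line(col = "red") +
  theme_classic()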

Case Study 2: Multiple linear regression

Example 1: Categorical predictors, with indicators \(X_1 = I(\text{control})\) and \(X_2 = I(\text{treatment})\).

The data are from Annette Dobson (1990), “An Introduction to Generalized Linear Models”, page 9: plant weight data.

We first fit a linear regression model where the predictor is the group label.

ctl <- c(4.17, 5.58, 5.18, 6.11, 4.50, 4.61, 5.17, 4.53, 5.33, 5.14)
trt <- c(4.81, 4.17, 4.41, 3.59, 5.87, 3.83, 6.03, 4.89, 4.32, 4.69)
group <- c(rep("ctl", length(ctl)), rep("trt", length(trt)))
weight <- c(ctl, trt)

# method 1: `lm` function
lm_D9 <- lm(weight ~ group)
summary(lm_D9)
## 
## Call:
## lm(formula = weight ~ group)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0710 -0.4938  0.0685  0.2462  1.3690 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.0320     0.2202  22.850 9.55e-15 ***
## grouptrt     -0.3710     0.3114  -1.191    0.249    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6964 on 18 degrees of freedom
## Multiple R-squared:  0.07308,    Adjusted R-squared:  0.02158 
## F-statistic: 1.419 on 1 and 18 DF,  p-value: 0.249
lm_D9$coefficients[1]
## (Intercept) 
##       5.032
lm_D9$coefficients[2]
## grouptrt 
##   -0.371
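
As a sanity check, regression on a single binary predictor is equivalent to a pooled two-sample t-test; the sketch below should reproduce the t value (-1.191) and p-value (0.249) from the `grouptrt` row above.

# pooled two-sample t-test comparing the two groups
t.test(trt, ctl, var.equal = TRUE)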

We can also estimate the regression coefficients using \(\hat{\beta} = (X^T X)^{-1} X^T Y\).

# design matrix
X <- cbind(rep(1, length(ctl) + length(trt)),
           c(rep(0, length(ctl)), rep(1, length(trt))))
y <- as.matrix(weight)

solve(t(X) %*% X) %*% t(X) %*% y
##        [,1]
## [1,]  5.032
## [2,] -0.371
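
Instead of building the design matrix by hand, `model.matrix` recovers the one `lm` constructed from the formula, including the 0/1 dummy coding of `group`; a quick sketch:

# the design matrix `lm` built internally; compare it with `X` above
head(model.matrix(lm_D9))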

Example 2: Quadratic form

We first simulate data from a quadratic model and then fit a straight regression line. As you can see from the plot, such a linear fit is clearly inappropriate for these data.

x_quad <- seq(-5, 5, 0.1)
y_quad <- rnorm(length(x_quad), mean = x_quad^2, sd = 1) # no seed is fixed here, so your numbers will differ slightly

data_quad <- as.data.frame(cbind(x_quad, y_quad))
ggplot(data_quad, aes(x = x_quad, y = y_quad)) + 
  geom_point() + 
  geom_smooth(method = "lm")

fit_quad <- lm(y_quad ~ x_quad, data = data_quad)
summary(fit_quad)
## 
## Call:
## lm(formula = y_quad ~ x_quad, data = data_quad)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.041 -6.692 -1.939  6.417 16.906 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.602010   0.768216  11.197   <2e-16 ***
## x_quad      -0.008578   0.263496  -0.033    0.974    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.72 on 99 degrees of freedom
## Multiple R-squared:  1.07e-05,   Adjusted R-squared:  -0.01009 
## F-statistic: 0.00106 on 1 and 99 DF,  p-value: 0.9741
ggplot(fit_quad) + 
  geom_point(aes(x = x_quad, y = .resid))

To fix this problem, we can introduce some function \(f\) of the original predictor \(x\) and take \(f(x)\) as our new predictor. Here, I am “guessing” that a quadratic form is the most appropriate given the plot above. (Shhh… let us pretend the true data-generating process is unknown.)

data_quad$x_sq <- data_quad$x_quad^2 # add the squared predictor as a new column
fit_quad_new <- lm(y_quad ~ x_quad + x_sq, data = data_quad)
summary(fit_quad_new)
## 
## Call:
## lm(formula = y_quad ~ x_quad + x_sq, data = data_quad)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0449 -0.6253 -0.1452  0.6336  3.7479 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.137032   0.160202   0.855    0.394    
## x_quad      -0.008578   0.036629  -0.234    0.815    
## x_sq         0.995880   0.014049  70.887   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.073 on 98 degrees of freedom
## Multiple R-squared:  0.9809, Adjusted R-squared:  0.9805 
## F-statistic:  2513 on 2 and 98 DF,  p-value: < 2.2e-16
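
As a side note, the same model can be fit without creating the `x_sq` column by writing the transformation inside the formula, just as the `geom_smooth` call below does. A minimal sketch:

# `I()` protects the arithmetic inside the formula
fit_quad_formula <- lm(y_quad ~ x_quad + I(x_quad^2), data = data_quad)
coef(fit_quad_formula) # same estimates as `fit_quad_new`
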
ggplot(data_quad, aes(x = x_quad, y = y_quad)) +  
  geom_point() + 
  geom_smooth(method = "lm", formula = y ~ x + I(x^2)) +
  ggtitle("A perfect quadratic regression line") +
  xlab("x") +
  ylab("y") +
  theme_classic()

ggplot(fit_quad_new) + 
  geom_point(aes(x = x_quad, y = .resid)) + 
  geom_hline(aes(yintercept = 0), col = "green")
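
To confirm that the quadratic term genuinely improves the fit, the two nested models can be compared with an F-test via `anova`; given the jump in R-squared from essentially 0 to 0.98, this should be overwhelmingly significant.

# F-test comparing the straight-line fit against the quadratic fit
anova(fit_quad, fit_quad_new)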

Congrats!! Now you have a quadratic regression line that fits your data nearly perfectly!