–//– Load R Packages –//–


Install and/or load all required R Packages

knitr::opts_chunk$set(echo = TRUE)

if(!require("tidyverse")) {install.packages("tidyverse"); library("tidyverse")}
if(!require("DescTools")) {install.packages("DescTools"); library("DescTools")}
if(!require("sjlabelled")) {install.packages("sjlabelled"); library("sjlabelled")}
if(!require("Hmisc")) {install.packages("Hmisc"); library("Hmisc")}
if(!require("stats")) {install.packages("stats"); library("stats")}
if(!require("MASS")) {install.packages("MASS"); library("MASS")}
if(!require("psych")) {install.packages("psych"); library("psych")}

–//– Clean-Up Working Emvironment –//–


Removes all objects from the current working environment

# clean up working environment
rm(list = ls())

–//– Chapter 02 - Regression Analysis –//–


[=> Case Study] | 2.3 | A: Case Study Regression: Method “Enter”.

(+) First Analysis: Method "Enter“.

(-) Load dataset (40 cases).

# Generate data
mydatc2_case <- data.frame(ID = c(1:40),
                      Shopping = c(6,   12, 2,  15, 2,  10, 17, 0,  18, 14, 5,  2,  8,  2,  3,  7,  2,  5,  5,  12, 5,  4,  6,  5,  2,  10, 10, 6,  14, 14, 5,  7,  6,  7,  12, 12, 9,  17, 15, 9),
                      Age = c(37,   25, 20, 53, 41, 44, 39, 46, 44, 39, 29, 29, 52, 46, 36, 24, 45, 52, 40, 43, 60, 25, 47, 43, 44, 39, 44, 27, 48, 35, 48, 52, 25, 31, 52, 60, 56, 40, 54, 56),
                      Gender = c(0, 0,  1,  0,  0,  0,  0,  0,  1,  1,  0,  1,  1,  0,  0,  1,  0,  1,  0,  1,  1,  0,  1,  1,  1,  1,  0,  0,  1,  1,  1,  0,  0,  1,  0,  0,  1,  0,  0,  0),
                      Income = c(1.8,   2.9,    2,  6.1,    2.2,    5.5,    5.4,    2.1,    5.6,    4.3,    2.9,    1.8,    3.7,    2.1,    3.2,    3.2,    2,  4.1,    4.4,    4.6,    3.5,    2.7,    4.9,    2.3,    3.9,    5.4,    4.6,    1.3,    6.1,    4.9,    3.1,    3,  2.9,    2.3,    4.9,    6.2,    4.4,    4.9,    6.3,    4.9))
                      
# Set labels                      
var.labels <- c(Period = "i", 
               Shopping = "Shopping frequency", 
               Age = "Age",
               Gender = "Gender (1 = male, 0 = female)",
               Income = "Income in [1000 EUR]")

Hmisc::label(mydatc2_case) <- as.list(var.labels[match(names(mydatc2_case), names(var.labels))])

# Display generated data for verification
print(mydatc2_case)
##    ID Shopping Age Gender Income
## 1   1        6  37      0    1.8
## 2   2       12  25      0    2.9
## 3   3        2  20      1    2.0
## 4   4       15  53      0    6.1
## 5   5        2  41      0    2.2
## 6   6       10  44      0    5.5
## 7   7       17  39      0    5.4
## 8   8        0  46      0    2.1
## 9   9       18  44      1    5.6
## 10 10       14  39      1    4.3
## 11 11        5  29      0    2.9
## 12 12        2  29      1    1.8
## 13 13        8  52      1    3.7
## 14 14        2  46      0    2.1
## 15 15        3  36      0    3.2
## 16 16        7  24      1    3.2
## 17 17        2  45      0    2.0
## 18 18        5  52      1    4.1
## 19 19        5  40      0    4.4
## 20 20       12  43      1    4.6
## 21 21        5  60      1    3.5
## 22 22        4  25      0    2.7
## 23 23        6  47      1    4.9
## 24 24        5  43      1    2.3
## 25 25        2  44      1    3.9
## 26 26       10  39      1    5.4
## 27 27       10  44      0    4.6
## 28 28        6  27      0    1.3
## 29 29       14  48      1    6.1
## 30 30       14  35      1    4.9
## 31 31        5  48      1    3.1
## 32 32        7  52      0    3.0
## 33 33        6  25      0    2.9
## 34 34        7  31      1    2.3
## 35 35       12  52      0    4.9
## 36 36       12  60      0    6.2
## 37 37        9  56      1    4.4
## 38 38       17  40      0    4.9
## 39 39       15  54      0    6.3
## 40 40        9  56      0    4.9
# Structure of the generated data
str(mydatc2_case)
## 'data.frame':    40 obs. of  5 variables:
##  $ ID      : 'labelled' int  1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "label")= chr NA
##  $ Shopping: 'labelled' num  6 12 2 15 2 10 17 0 18 14 ...
##   ..- attr(*, "label")= chr "Shopping frequency"
##  $ Age     : 'labelled' num  37 25 20 53 41 44 39 46 44 39 ...
##   ..- attr(*, "label")= chr "Age"
##  $ Gender  : 'labelled' num  0 0 1 0 0 0 0 0 1 1 ...
##   ..- attr(*, "label")= chr "Gender (1 = male, 0 = female)"
##  $ Income  : 'labelled' num  1.8 2.9 2 6.1 2.2 5.5 5.4 2.1 5.6 4.3 ...
##   ..- attr(*, "label")= chr "Income in [1000 EUR]"

(-) Figure 2.33 - Analog to SPSS output for regression analysis 1

# 02_RA SPSS DATEN - Case Study (40 cases) 10-9-20.sav

reg_A1 <- lm(Shopping ~ Age + Gender, data = mydatc2_case)
summary(reg_A1)
## 
## Call:
## lm(formula = Shopping ~ Age + Gender, data = mydatc2_case)
## 
## Residuals:
## Shopping frequency 
##     Min      1Q  Median      3Q     Max 
## -8.4870 -3.7156 -0.5317  3.1374  9.7308 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  3.83231    3.27319   1.171    0.249
## Age          0.10119    0.07447   1.359    0.182
## Gender      -0.01545    1.56343  -0.010    0.992
## 
## Residual standard error: 4.919 on 37 degrees of freedom
## Multiple R-squared:  0.04753,    Adjusted R-squared:  -0.003955 
## F-statistic: 0.9232 on 2 and 37 DF,  p-value: 0.4062

(+) Second Analysis: Method “Enter”.

(-) Figure 2.34 - Analog to SPSS output for regression analysis 2

reg_A2 <- lm(Shopping ~ Age + Gender + Income, data = mydatc2_case)
summary(reg_A2)
## 
## Call:
## lm(formula = Shopping ~ Age + Gender + Income, data = mydatc2_case)
## 
## Residuals:
## Shopping frequency 
##     Min      1Q  Median      3Q     Max 
## -5.8187 -2.0614 -0.2042  1.9980  5.1458 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.4005     2.0111   0.696   0.4906    
## Age          -0.1204     0.0530  -2.271   0.0292 *  
## Gender       -0.4379     0.9511  -0.460   0.6480    
## Income        3.1160     0.3886   8.018 1.59e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.988 on 36 degrees of freedom
## Multiple R-squared:  0.6581, Adjusted R-squared:  0.6296 
## F-statistic:  23.1 on 3 and 36 DF,  p-value: 1.635e-08

(-) Descriptive statistics

psych::describe(mydatc2_case)
##          vars  n  mean    sd median trimmed   mad  min  max range  skew
## ID          1 40 20.50 11.69   20.5   20.50 14.83  1.0 40.0    39  0.00
## Shopping    2 40  8.05  4.91    7.0    7.78  5.19  0.0 18.0    18  0.37
## Age         3 40 41.75 10.58   43.5   42.00 11.86 20.0 60.0    40 -0.27
## Gender      4 40  0.45  0.50    0.0    0.44  0.00  0.0  1.0     1  0.19
## Income      5 40  3.81  1.44    3.8    3.77  1.63  1.3  6.3     5  0.08
##          kurtosis   se
## ID          -1.29 1.85
## Shopping    -1.03 0.78
## Age         -0.87 1.67
## Gender      -2.01 0.08
## Income      -1.29 0.23

(-) Correlations

# Correlations
options("Hmisc.rcorr.digits" = 4)
correlations <- rcorr(as.matrix(mydatc2_case), type = "pearson")

print(correlations) # Correlation matrix
##            ID Shopping  Age Gender Income
## ID       1.00     0.19 0.34   0.05   0.29
## Shopping 0.19     1.00 0.22   0.00   0.78
## Age      0.34     0.22 1.00   0.01   0.52
## Gender   0.05     0.00 0.01   1.00   0.05
## Income   0.29     0.78 0.52   0.05   1.00
## 
## n= 40 
## 
## 
## P
##          ID     Shopping Age    Gender Income
## ID              0.2361   0.0336 0.7489 0.0651
## Shopping 0.2361          0.1766 0.9949 0.0000
## Age      0.0336 0.1766          0.9413 0.0006
## Gender   0.7489 0.9949   0.9413        0.7428
## Income   0.0651 0.0000   0.0006 0.7428
print(correlations$r, digits = 3) # correlations 
##              ID Shopping   Age  Gender Income
## ID       1.0000  0.19167 0.337 0.05224 0.2945
## Shopping 0.1917  1.00000 0.218 0.00104 0.7794
## Age      0.3367  0.21801 1.000 0.01203 0.5212
## Gender   0.0522  0.00104 0.012 1.00000 0.0535
## Income   0.2945  0.77941 0.521 0.05355 1.0000
print(correlations$P, digits = 3) # p-values
##              ID Shopping      Age Gender   Income
## ID           NA 2.36e-01 0.033608  0.749 6.51e-02
## Shopping 0.2361       NA 0.176567  0.995 3.12e-09
## Age      0.0336 1.77e-01       NA  0.941 5.64e-04
## Gender   0.7489 9.95e-01 0.941278     NA 7.43e-01
## Income   0.0651 3.12e-09 0.000564  0.743       NA

(-) Figure 2.35 - Correlation matrix for Shopping, Age, Gender, Income

cor(mydatc2_case[,-c(1)])
##             Shopping        Age      Gender     Income
## Shopping 1.000000000 0.21800798 0.001036672 0.77941025
## Age      0.218007981 1.00000000 0.012028319 0.52117363
## Gender   0.001036672 0.01202832 1.000000000 0.05354568
## Income   0.779410247 0.52117363 0.053545683 1.00000000

(-) Figure 2.36 - Scatterplot of the dependent variable ’shopping frequency’ against ‘income’

# Regression Plot: Shopping ~ Income
plot(mydatc2_case$Income, 
     mydatc2_case$Shopping, 
     xlab = "Income", 
     ylab = "Shopping frequency", 
     main = "Scatterplot of Shopping frequency by Income",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1) + abline(lm(Shopping ~ Income, data = mydatc2_case)); 

## integer(0)
summary(lm(Shopping ~ Income, data = mydatc2_case)) # 0.607 R²
## 
## Call:
## lm(formula = Shopping ~ Income, data = mydatc2_case)
## 
## Residuals:
## Shopping frequency 
##     Min      1Q  Median      3Q     Max 
## -6.2884 -2.0099 -0.3775  1.2859  6.3602 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.0409     1.4050  -1.453    0.155    
## Income        2.6485     0.3454   7.669 3.12e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.116 on 38 degrees of freedom
## Multiple R-squared:  0.6075, Adjusted R-squared:  0.5972 
## F-statistic: 58.81 on 1 and 38 DF,  p-value: 3.119e-09

(-) Figure 2.38 - Plot of residuals against predicted values

(-) Figure 2.39 - Histogram of residuals with normal curve

# plot residuals
plot(reg_A2$residuals,
     main = "Scatterplot | Dependent variable: Shopping frequency",
     ylab = "Regression standardized residuals", 
     xlab = "Regression standardized predicted value",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1); 

hist(reg_A2$residuals, freq = TRUE,
     main = "Histogram | Dependent variable: Shopping frequency",
     ylab = "Frequency", 
     xlab = "Regression standardized residual",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1); 
curve(dnorm, add = TRUE,
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1); 

summary(reg_A2$residuals); 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -5.8187 -2.0614 -0.2042  0.0000  1.9980  5.1458
mean(reg_A2$residuals); 
## [1] 2.012279e-16
sd(reg_A2$residuals)
## [1] 2.870413
# get unstandardized predicted and residual values
unstandardizedPredicted <- predict(reg_A2)
unstandardizedResiduals <- resid(reg_A2)

# get standardized values
standardizedPredicted <- (unstandardizedPredicted - mean(unstandardizedPredicted)) / sd(unstandardizedPredicted)
standardizedResiduals <- (unstandardizedResiduals - mean(unstandardizedResiduals)) / sd(unstandardizedResiduals)

# create standardized residuals plot; # add horizontal line
plot(standardizedPredicted, 
     standardizedResiduals, 
     main = "Scatterplot (Dependent variable: Shopping frequency)", 
     xlab = "Regression standardized predicted value", 
     ylab = "Regression standardized residual",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1); 
abline(0,0)

# create residuals histogram; # add normal curve
hist(standardizedResiduals, freq = FALSE); curve(dnorm, add = TRUE)

(-) Figure 2.40 - P-P- Plot

# get unstandardized predicted and residual values
unstandardizedPredicted <- predict(reg_A2, type = "response")
unstandardizedResiduals <- resid(reg_A2)

# get standardized values
standardizedPredicted <- (unstandardizedPredicted - mean(unstandardizedPredicted)) / sd(unstandardizedPredicted)
standardizedResiduals <- (unstandardizedResiduals - mean(unstandardizedResiduals)) / sd(unstandardizedResiduals)

# create standardized residuals plot; # add horizontal line
plot(standardizedPredicted, 
     standardizedResiduals, 
     main = "Standardized residuals plot", 
     xlab = "Standardized predicted values", 
     ylab = "Standardized residuals",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1); 
abline(0,0)

# get probability distribution for residuals
probDist <- pnorm(standardizedResiduals)

# create PP plot; # add diagonal line
plot(ppoints(length(standardizedResiduals)), 
     sort(probDist), 
     main = "P-P-Plot | Standardized residual", 
     xlab = "Expected cumulated probability", 
     ylab = "Observed cumulated probability",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1); 
abline(0,1)

#plot(regmod.3);
#qqnorm(rstandard(regmod.3)); #qqline(rstandard(regmod.3))

[=> Case Study] | 2.3 | B: Case Study regression: Method: “Stepwise”.

(-) Figure 2.41, Figure 2.42, Figure 2.43: Stepwise Regression

# Stepwise regression
full.model <- lm(Shopping ~ Age + Gender + Income, data = mydatc2_case)

reg_B <- MASS::stepAIC(full.model, direction = "both", trace = TRUE)
## Start:  AIC=91.34
## Shopping ~ Age + Gender + Income
## 
##          Df Sum of Sq    RSS     AIC
## - Gender  1      1.89 323.22  89.579
## <none>                321.33  91.344
## - Age     1     46.04 367.37  94.699
## - Income  1    573.89 895.23 130.328
## 
## Step:  AIC=89.58
## Shopping ~ Age + Income
## 
##          Df Sum of Sq    RSS     AIC
## <none>                323.22  89.579
## + Gender  1      1.89 321.33  91.344
## - Age     1     45.71 368.93  92.869
## - Income  1    572.00 895.23 128.328
summary(reg_B); reg_B$anova
## 
## Call:
## lm(formula = Shopping ~ Age + Income, data = mydatc2_case)
## 
## Residuals:
## Shopping frequency 
##    Min     1Q Median     3Q    Max 
## -6.060 -1.871 -0.162  1.791  5.354 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.22224    1.95232   0.626    0.535    
## Age         -0.11992    0.05243  -2.287    0.028 *  
## Income       3.10613    0.38386   8.092 1.05e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.956 on 37 degrees of freedom
## Multiple R-squared:  0.6561, Adjusted R-squared:  0.6375 
## F-statistic:  35.3 on 2 and 37 DF,  p-value: 2.653e-09
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## Shopping ~ Age + Gender + Income
## 
## Final Model:
## Shopping ~ Age + Income
## 
## 
##       Step Df Deviance Resid. Df Resid. Dev      AIC
## 1                             36   321.3317 91.34377
## 2 - Gender  1 1.892502        37   323.2242 89.57866

(-) 2.4.2 | Regression analysis with time-series data.

(-) Table 2.17 - Time-series data for chocolate sales

# Observed and standardized values
mydatc2t <- data.frame(Period = c(1:12),
                       Sales = c(2596,2709,2552,3004,3076,2513,2626,3120,2751,2965,2818,3171),
                       Time = c(1:12),
                       d1 = c(1,0,0,0,1,0,0,0,1,0,0,0),
                       d2 = c(0,1,0,0,0,1,0,0,0,1,0,0),
                       d3 = c(0,0,1,0,0,0,1,0,0,0,1,0),
                       d4 = c(0,0,0,1,0,0,0,1,0,0,0,1)) 

# Set labels                      
var.labels <- c(Period = "i", 
               Sales = "1000 units", 
               Time = "t [Quarter]",
               d1 = "d1",
               d2 = "d2",
               d3 = "d3",
               d4 = "d4")

label(mydatc2t) <- as.list(var.labels[match(names(mydatc2t), names(var.labels))])

#  Display generated data for verification; # Structure of the generated data
print(mydatc2t); str(mydatc2t)
##    Period Sales Time d1 d2 d3 d4
## 1       1  2596    1  1  0  0  0
## 2       2  2709    2  0  1  0  0
## 3       3  2552    3  0  0  1  0
## 4       4  3004    4  0  0  0  1
## 5       5  3076    5  1  0  0  0
## 6       6  2513    6  0  1  0  0
## 7       7  2626    7  0  0  1  0
## 8       8  3120    8  0  0  0  1
## 9       9  2751    9  1  0  0  0
## 10     10  2965   10  0  1  0  0
## 11     11  2818   11  0  0  1  0
## 12     12  3171   12  0  0  0  1
## 'data.frame':    12 obs. of  7 variables:
##  $ Period: 'labelled' int  1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "label")= chr "i"
##  $ Sales : 'labelled' num  2596 2709 2552 3004 3076 ...
##   ..- attr(*, "label")= chr "1000 units"
##  $ Time  : 'labelled' int  1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "label")= chr "t [Quarter]"
##  $ d1    : 'labelled' num  1 0 0 0 1 0 0 0 1 0 ...
##   ..- attr(*, "label")= chr "d1"
##  $ d2    : 'labelled' num  0 1 0 0 0 1 0 0 0 1 ...
##   ..- attr(*, "label")= chr "d2"
##  $ d3    : 'labelled' num  0 0 1 0 0 0 1 0 0 0 ...
##   ..- attr(*, "label")= chr "d3"
##  $ d4    : 'labelled' num  0 0 0 1 0 0 0 1 0 0 ...
##   ..- attr(*, "label")= chr "d4"

(-) Figure 2.45 - Scatterplot and linear trend

plot(mydatc2t$Sales~mydatc2t$Time, 
     xlab = "Time", 
     ylab = "Sales",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1); 
abline(a = 2617, b = 32.09)

(-) Figure 2.46 - Seasonal pattern

timereg.1 <- lm(Sales~d1+d2+d3+d4+Time-1, data = mydatc2t)
summary(timereg.1); barplot(scale(timereg.1$coefficients[1:4])[,1], 
                            main = "Seasonal pattern", 
                            xlab = "Quarter",
                            cex.main = 1,
                            cex.lab = 1, 
                            cex.axis = 1) 
## 
## Call:
## lm(formula = Sales ~ d1 + d2 + d3 + d4 + Time - 1, data = mydatc2t)
## 
## Residuals:
## 1000 units 
##      Min       1Q   Median       3Q      Max 
## -216.000  -56.042    1.667   56.750  268.333 
## 
## Coefficients:
##      Estimate Std. Error t value Pr(>|t|)    
## d1    2675.79     118.54   22.57 8.48e-08 ***
## d2    2570.75     127.82   20.11 1.88e-07 ***
## d3    2480.71     137.97   17.98 4.07e-07 ***
## d4    2887.33     148.83   19.40 2.41e-07 ***
## Time    26.37      14.41    1.83     0.11    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 163 on 7 degrees of freedom
## Multiple R-squared:  0.9981, Adjusted R-squared:  0.9967 
## F-statistic: 723.7 on 5 and 7 DF,  p-value: 2.449e-09

(-) Figure 2.47 - Estimated time-series with seasonal fluctuations

plot(ts(mydatc2t, frequency = 1),
     main = "Time-series of chocolate sales",
     cex.main = 1,
     cex.lab = 1, 
     cex.axis = 1)