knitr::opts_chunk$set(echo = TRUE)
if(!require("tidyverse")) {install.packages("tidyverse"); library("tidyverse")}
if(!require("DescTools")) {install.packages("DescTools"); library("DescTools")}
if(!require("sjlabelled")) {install.packages("sjlabelled"); library("sjlabelled")}
if(!require("Hmisc")) {install.packages("Hmisc"); library("Hmisc")}
if(!require("stats")) {install.packages("stats"); library("stats")}
if(!require("MASS")) {install.packages("MASS"); library("MASS")}
if(!require("psych")) {install.packages("psych"); library("psych")}
# clean up working environment
rm(list = ls())
# Generate data
mydatc2_case <- data.frame(ID = c(1:40),
Shopping = c(6, 12, 2, 15, 2, 10, 17, 0, 18, 14, 5, 2, 8, 2, 3, 7, 2, 5, 5, 12, 5, 4, 6, 5, 2, 10, 10, 6, 14, 14, 5, 7, 6, 7, 12, 12, 9, 17, 15, 9),
Age = c(37, 25, 20, 53, 41, 44, 39, 46, 44, 39, 29, 29, 52, 46, 36, 24, 45, 52, 40, 43, 60, 25, 47, 43, 44, 39, 44, 27, 48, 35, 48, 52, 25, 31, 52, 60, 56, 40, 54, 56),
Gender = c(0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0),
Income = c(1.8, 2.9, 2, 6.1, 2.2, 5.5, 5.4, 2.1, 5.6, 4.3, 2.9, 1.8, 3.7, 2.1, 3.2, 3.2, 2, 4.1, 4.4, 4.6, 3.5, 2.7, 4.9, 2.3, 3.9, 5.4, 4.6, 1.3, 6.1, 4.9, 3.1, 3, 2.9, 2.3, 4.9, 6.2, 4.4, 4.9, 6.3, 4.9))
# Set labels
var.labels <- c(Period = "i",
Shopping = "Shopping frequency",
Age = "Age",
Gender = "Gender (1 = male, 0 = female)",
Income = "Income in [1000 EUR]")
Hmisc::label(mydatc2_case) <- as.list(var.labels[match(names(mydatc2_case), names(var.labels))])
# Display generated data for verification
print(mydatc2_case)
## ID Shopping Age Gender Income
## 1 1 6 37 0 1.8
## 2 2 12 25 0 2.9
## 3 3 2 20 1 2.0
## 4 4 15 53 0 6.1
## 5 5 2 41 0 2.2
## 6 6 10 44 0 5.5
## 7 7 17 39 0 5.4
## 8 8 0 46 0 2.1
## 9 9 18 44 1 5.6
## 10 10 14 39 1 4.3
## 11 11 5 29 0 2.9
## 12 12 2 29 1 1.8
## 13 13 8 52 1 3.7
## 14 14 2 46 0 2.1
## 15 15 3 36 0 3.2
## 16 16 7 24 1 3.2
## 17 17 2 45 0 2.0
## 18 18 5 52 1 4.1
## 19 19 5 40 0 4.4
## 20 20 12 43 1 4.6
## 21 21 5 60 1 3.5
## 22 22 4 25 0 2.7
## 23 23 6 47 1 4.9
## 24 24 5 43 1 2.3
## 25 25 2 44 1 3.9
## 26 26 10 39 1 5.4
## 27 27 10 44 0 4.6
## 28 28 6 27 0 1.3
## 29 29 14 48 1 6.1
## 30 30 14 35 1 4.9
## 31 31 5 48 1 3.1
## 32 32 7 52 0 3.0
## 33 33 6 25 0 2.9
## 34 34 7 31 1 2.3
## 35 35 12 52 0 4.9
## 36 36 12 60 0 6.2
## 37 37 9 56 1 4.4
## 38 38 17 40 0 4.9
## 39 39 15 54 0 6.3
## 40 40 9 56 0 4.9
# Structure of the generated data
str(mydatc2_case)
## 'data.frame': 40 obs. of 5 variables:
## $ ID : 'labelled' int 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "label")= chr NA
## $ Shopping: 'labelled' num 6 12 2 15 2 10 17 0 18 14 ...
## ..- attr(*, "label")= chr "Shopping frequency"
## $ Age : 'labelled' num 37 25 20 53 41 44 39 46 44 39 ...
## ..- attr(*, "label")= chr "Age"
## $ Gender : 'labelled' num 0 0 1 0 0 0 0 0 1 1 ...
## ..- attr(*, "label")= chr "Gender (1 = male, 0 = female)"
## $ Income : 'labelled' num 1.8 2.9 2 6.1 2.2 5.5 5.4 2.1 5.6 4.3 ...
## ..- attr(*, "label")= chr "Income in [1000 EUR]"
# 02_RA SPSS DATEN - Case Study (40 cases) 10-9-20.sav
reg_A1 <- lm(Shopping ~ Age + Gender, data = mydatc2_case)
summary(reg_A1)
##
## Call:
## lm(formula = Shopping ~ Age + Gender, data = mydatc2_case)
##
## Residuals:
## Shopping frequency
## Min 1Q Median 3Q Max
## -8.4870 -3.7156 -0.5317 3.1374 9.7308
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.83231 3.27319 1.171 0.249
## Age 0.10119 0.07447 1.359 0.182
## Gender -0.01545 1.56343 -0.010 0.992
##
## Residual standard error: 4.919 on 37 degrees of freedom
## Multiple R-squared: 0.04753, Adjusted R-squared: -0.003955
## F-statistic: 0.9232 on 2 and 37 DF, p-value: 0.4062
reg_A2 <- lm(Shopping ~ Age + Gender + Income, data = mydatc2_case)
summary(reg_A2)
##
## Call:
## lm(formula = Shopping ~ Age + Gender + Income, data = mydatc2_case)
##
## Residuals:
## Shopping frequency
## Min 1Q Median 3Q Max
## -5.8187 -2.0614 -0.2042 1.9980 5.1458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4005 2.0111 0.696 0.4906
## Age -0.1204 0.0530 -2.271 0.0292 *
## Gender -0.4379 0.9511 -0.460 0.6480
## Income 3.1160 0.3886 8.018 1.59e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.988 on 36 degrees of freedom
## Multiple R-squared: 0.6581, Adjusted R-squared: 0.6296
## F-statistic: 23.1 on 3 and 36 DF, p-value: 1.635e-08
psych::describe(mydatc2_case)
## vars n mean sd median trimmed mad min max range skew
## ID 1 40 20.50 11.69 20.5 20.50 14.83 1.0 40.0 39 0.00
## Shopping 2 40 8.05 4.91 7.0 7.78 5.19 0.0 18.0 18 0.37
## Age 3 40 41.75 10.58 43.5 42.00 11.86 20.0 60.0 40 -0.27
## Gender 4 40 0.45 0.50 0.0 0.44 0.00 0.0 1.0 1 0.19
## Income 5 40 3.81 1.44 3.8 3.77 1.63 1.3 6.3 5 0.08
## kurtosis se
## ID -1.29 1.85
## Shopping -1.03 0.78
## Age -0.87 1.67
## Gender -2.01 0.08
## Income -1.29 0.23
# Correlations
options("Hmisc.rcorr.digits" = 4)
correlations <- rcorr(as.matrix(mydatc2_case), type = "pearson")
print(correlations) # Correlation matrix
## ID Shopping Age Gender Income
## ID 1.00 0.19 0.34 0.05 0.29
## Shopping 0.19 1.00 0.22 0.00 0.78
## Age 0.34 0.22 1.00 0.01 0.52
## Gender 0.05 0.00 0.01 1.00 0.05
## Income 0.29 0.78 0.52 0.05 1.00
##
## n= 40
##
##
## P
## ID Shopping Age Gender Income
## ID 0.2361 0.0336 0.7489 0.0651
## Shopping 0.2361 0.1766 0.9949 0.0000
## Age 0.0336 0.1766 0.9413 0.0006
## Gender 0.7489 0.9949 0.9413 0.7428
## Income 0.0651 0.0000 0.0006 0.7428
print(correlations$r, digits = 3) # correlations
## ID Shopping Age Gender Income
## ID 1.0000 0.19167 0.337 0.05224 0.2945
## Shopping 0.1917 1.00000 0.218 0.00104 0.7794
## Age 0.3367 0.21801 1.000 0.01203 0.5212
## Gender 0.0522 0.00104 0.012 1.00000 0.0535
## Income 0.2945 0.77941 0.521 0.05355 1.0000
print(correlations$P, digits = 3) # p-values
## ID Shopping Age Gender Income
## ID NA 2.36e-01 0.033608 0.749 6.51e-02
## Shopping 0.2361 NA 0.176567 0.995 3.12e-09
## Age 0.0336 1.77e-01 NA 0.941 5.64e-04
## Gender 0.7489 9.95e-01 0.941278 NA 7.43e-01
## Income 0.0651 3.12e-09 0.000564 0.743 NA
cor(mydatc2_case[,-c(1)])
## Shopping Age Gender Income
## Shopping 1.000000000 0.21800798 0.001036672 0.77941025
## Age 0.218007981 1.00000000 0.012028319 0.52117363
## Gender 0.001036672 0.01202832 1.000000000 0.05354568
## Income 0.779410247 0.52117363 0.053545683 1.00000000
# Regression Plot: Shopping ~ Income
plot(mydatc2_case$Income,
mydatc2_case$Shopping,
xlab = "Income",
ylab = "Shopping frequency",
main = "Scatterplot of Shopping frequency by Income",
cex.main = 1,
cex.lab = 1,
cex.axis = 1) + abline(lm(Shopping ~ Income, data = mydatc2_case));
## integer(0)
summary(lm(Shopping ~ Income, data = mydatc2_case)) # 0.607 R²
##
## Call:
## lm(formula = Shopping ~ Income, data = mydatc2_case)
##
## Residuals:
## Shopping frequency
## Min 1Q Median 3Q Max
## -6.2884 -2.0099 -0.3775 1.2859 6.3602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.0409 1.4050 -1.453 0.155
## Income 2.6485 0.3454 7.669 3.12e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.116 on 38 degrees of freedom
## Multiple R-squared: 0.6075, Adjusted R-squared: 0.5972
## F-statistic: 58.81 on 1 and 38 DF, p-value: 3.119e-09
# plot residuals
plot(reg_A2$residuals,
main = "Scatterplot | Dependent variable: Shopping frequency",
ylab = "Regression standardized residuals",
xlab = "Regression standardized predicted value",
cex.main = 1,
cex.lab = 1,
cex.axis = 1);
hist(reg_A2$residuals, freq = TRUE,
main = "Histogram | Dependent variable: Shopping frequency",
ylab = "Frequency",
xlab = "Regression standardized residual",
cex.main = 1,
cex.lab = 1,
cex.axis = 1);
curve(dnorm, add = TRUE,
cex.main = 1,
cex.lab = 1,
cex.axis = 1);
summary(reg_A2$residuals);
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.8187 -2.0614 -0.2042 0.0000 1.9980 5.1458
mean(reg_A2$residuals);
## [1] 2.012279e-16
sd(reg_A2$residuals)
## [1] 2.870413
# get unstandardized predicted and residual values
unstandardizedPredicted <- predict(reg_A2)
unstandardizedResiduals <- resid(reg_A2)
# get standardized values
standardizedPredicted <- (unstandardizedPredicted - mean(unstandardizedPredicted)) / sd(unstandardizedPredicted)
standardizedResiduals <- (unstandardizedResiduals - mean(unstandardizedResiduals)) / sd(unstandardizedResiduals)
# create standardized residuals plot; # add horizontal line
plot(standardizedPredicted,
standardizedResiduals,
main = "Scatterplot (Dependent variable: Shopping frequency)",
xlab = "Regression standardized predicted value",
ylab = "Regression standardized residual",
cex.main = 1,
cex.lab = 1,
cex.axis = 1);
abline(0,0)
# create residuals histogram; # add normal curve
hist(standardizedResiduals, freq = FALSE); curve(dnorm, add = TRUE)
# get unstandardized predicted and residual values
unstandardizedPredicted <- predict(reg_A2, type = "response")
unstandardizedResiduals <- resid(reg_A2)
# get standardized values
standardizedPredicted <- (unstandardizedPredicted - mean(unstandardizedPredicted)) / sd(unstandardizedPredicted)
standardizedResiduals <- (unstandardizedResiduals - mean(unstandardizedResiduals)) / sd(unstandardizedResiduals)
# create standardized residuals plot; # add horizontal line
plot(standardizedPredicted,
standardizedResiduals,
main = "Standardized residuals plot",
xlab = "Standardized predicted values",
ylab = "Standardized residuals",
cex.main = 1,
cex.lab = 1,
cex.axis = 1);
abline(0,0)
# get probability distribution for residuals
probDist <- pnorm(standardizedResiduals)
# create PP plot; # add diagonal line
plot(ppoints(length(standardizedResiduals)),
sort(probDist),
main = "P-P-Plot | Standardized residual",
xlab = "Expected cumulated probability",
ylab = "Observed cumulated probability",
cex.main = 1,
cex.lab = 1,
cex.axis = 1);
abline(0,1)
#plot(regmod.3);
#qqnorm(rstandard(regmod.3)); #qqline(rstandard(regmod.3))
# Stepwise regression
full.model <- lm(Shopping ~ Age + Gender + Income, data = mydatc2_case)
reg_B <- MASS::stepAIC(full.model, direction = "both", trace = TRUE)
## Start: AIC=91.34
## Shopping ~ Age + Gender + Income
##
## Df Sum of Sq RSS AIC
## - Gender 1 1.89 323.22 89.579
## <none> 321.33 91.344
## - Age 1 46.04 367.37 94.699
## - Income 1 573.89 895.23 130.328
##
## Step: AIC=89.58
## Shopping ~ Age + Income
##
## Df Sum of Sq RSS AIC
## <none> 323.22 89.579
## + Gender 1 1.89 321.33 91.344
## - Age 1 45.71 368.93 92.869
## - Income 1 572.00 895.23 128.328
summary(reg_B); reg_B$anova
##
## Call:
## lm(formula = Shopping ~ Age + Income, data = mydatc2_case)
##
## Residuals:
## Shopping frequency
## Min 1Q Median 3Q Max
## -6.060 -1.871 -0.162 1.791 5.354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.22224 1.95232 0.626 0.535
## Age -0.11992 0.05243 -2.287 0.028 *
## Income 3.10613 0.38386 8.092 1.05e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.956 on 37 degrees of freedom
## Multiple R-squared: 0.6561, Adjusted R-squared: 0.6375
## F-statistic: 35.3 on 2 and 37 DF, p-value: 2.653e-09
## Stepwise Model Path
## Analysis of Deviance Table
##
## Initial Model:
## Shopping ~ Age + Gender + Income
##
## Final Model:
## Shopping ~ Age + Income
##
##
## Step Df Deviance Resid. Df Resid. Dev AIC
## 1 36 321.3317 91.34377
## 2 - Gender 1 1.892502 37 323.2242 89.57866
# Observed and standardized values
mydatc2t <- data.frame(Period = c(1:12),
Sales = c(2596,2709,2552,3004,3076,2513,2626,3120,2751,2965,2818,3171),
Time = c(1:12),
d1 = c(1,0,0,0,1,0,0,0,1,0,0,0),
d2 = c(0,1,0,0,0,1,0,0,0,1,0,0),
d3 = c(0,0,1,0,0,0,1,0,0,0,1,0),
d4 = c(0,0,0,1,0,0,0,1,0,0,0,1))
# Set labels
var.labels <- c(Period = "i",
Sales = "1000 units",
Time = "t [Quarter]",
d1 = "d1",
d2 = "d2",
d3 = "d3",
d4 = "d4")
label(mydatc2t) <- as.list(var.labels[match(names(mydatc2t), names(var.labels))])
# Display generated data for verification; # Structure of the generated data
print(mydatc2t); str(mydatc2t)
## Period Sales Time d1 d2 d3 d4
## 1 1 2596 1 1 0 0 0
## 2 2 2709 2 0 1 0 0
## 3 3 2552 3 0 0 1 0
## 4 4 3004 4 0 0 0 1
## 5 5 3076 5 1 0 0 0
## 6 6 2513 6 0 1 0 0
## 7 7 2626 7 0 0 1 0
## 8 8 3120 8 0 0 0 1
## 9 9 2751 9 1 0 0 0
## 10 10 2965 10 0 1 0 0
## 11 11 2818 11 0 0 1 0
## 12 12 3171 12 0 0 0 1
## 'data.frame': 12 obs. of 7 variables:
## $ Period: 'labelled' int 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "label")= chr "i"
## $ Sales : 'labelled' num 2596 2709 2552 3004 3076 ...
## ..- attr(*, "label")= chr "1000 units"
## $ Time : 'labelled' int 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "label")= chr "t [Quarter]"
## $ d1 : 'labelled' num 1 0 0 0 1 0 0 0 1 0 ...
## ..- attr(*, "label")= chr "d1"
## $ d2 : 'labelled' num 0 1 0 0 0 1 0 0 0 1 ...
## ..- attr(*, "label")= chr "d2"
## $ d3 : 'labelled' num 0 0 1 0 0 0 1 0 0 0 ...
## ..- attr(*, "label")= chr "d3"
## $ d4 : 'labelled' num 0 0 0 1 0 0 0 1 0 0 ...
## ..- attr(*, "label")= chr "d4"
plot(mydatc2t$Sales~mydatc2t$Time,
xlab = "Time",
ylab = "Sales",
cex.main = 1,
cex.lab = 1,
cex.axis = 1);
abline(a = 2617, b = 32.09)
timereg.1 <- lm(Sales~d1+d2+d3+d4+Time-1, data = mydatc2t)
summary(timereg.1); barplot(scale(timereg.1$coefficients[1:4])[,1],
main = "Seasonal pattern",
xlab = "Quarter",
cex.main = 1,
cex.lab = 1,
cex.axis = 1)
##
## Call:
## lm(formula = Sales ~ d1 + d2 + d3 + d4 + Time - 1, data = mydatc2t)
##
## Residuals:
## 1000 units
## Min 1Q Median 3Q Max
## -216.000 -56.042 1.667 56.750 268.333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## d1 2675.79 118.54 22.57 8.48e-08 ***
## d2 2570.75 127.82 20.11 1.88e-07 ***
## d3 2480.71 137.97 17.98 4.07e-07 ***
## d4 2887.33 148.83 19.40 2.41e-07 ***
## Time 26.37 14.41 1.83 0.11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 163 on 7 degrees of freedom
## Multiple R-squared: 0.9981, Adjusted R-squared: 0.9967
## F-statistic: 723.7 on 5 and 7 DF, p-value: 2.449e-09
plot(ts(mydatc2t, frequency = 1),
main = "Time-series of chocolate sales",
cex.main = 1,
cex.lab = 1,
cex.axis = 1)