What predict() achieves is rather simple: it applies a previously fitted model to a set of observations and returns the model's predicted values for those observations.
Let’s delve right in with a few examples of this application.
Linear Regression
# Model Creation #
# Simple linear regression of y on x, using the two vectors defined here.
x <- c(27, 34, 22, 30, 17, 32, 25, 34, 46, 37)
y <- c(70, 80, 73, 77, 60, 93, 85, 72, 90, 85)
linregress <- lm(y ~ x)
# Build Predictive Structure #
# predict() looks up the model's predictor (x) by column name in this frame.
predictdataframe <- data.frame(x)
# Store Predicted Values in Variable #
predictedvalues <- predict(linregress, newdata = predictdataframe)
# Print Predicted Values to Console #
predictedvalues
# Organize predicted values in a variable column adjacent to observed values #
# Add Variables to Data Frame #
predictdataframe$y <- y
predictdataframe$predictedvalues <- predictedvalues
# View Results #
predictdataframe
# Console Output #
x y predictedvalues
1 27 70 75.60686
2 34 80 81.56332
3 22 73 71.35224
4 30 77 78.15963
5 17 60 67.09763
6 32 93 79.86148
7 25 85 73.90501
8 34 72 81.56332
9 46 90 91.77441
10 37 85 84.11609
Loglinear Analysis
# Model Creation #
# Poisson (log-linear) model of the cell counts on the two categorical factors.
Obese <- c("Yes", "Yes", "No", "No")
Smoking <- c("Yes", "No", "Yes", "No")
Count <- c(5, 1, 2, 2)
DataModel <- glm(Count ~ Obese + Smoking, family = poisson)
# Build Predictive Structure #
predictdataframe <- data.frame(Obese, Smoking)
# Store Predicted Values in Variable #
# predict() is called with its default type here, so the stored values are
# log-counts; exp() converts them back to expected counts.
predictedvalues <- predict(DataModel, newdata = predictdataframe)
# Print Predicted Values to Console #
exp(predictedvalues)
# Organize predicted values in a variable column adjacent to observed values #
# Add Variables to Data Frame #
predictdataframe$Obese <- Obese
predictdataframe$Smoking <- Smoking
predictdataframe$Count <- Count
predictdataframe$predictedvalues <- exp(predictedvalues)
# View Results #
predictdataframe
# Console Output #
Obese Smoking Count predictedvalues
1 Yes Yes 5 4.2
2 Yes No 1 1.8
3 No Yes 2 2.8
4 No No 2 1.2
Probit Regression
# Create data vectors #
age <- c(55, 45, 33, 22, 34, 56, 78, 47, 38, 68, 49, 34, 28, 61, 26)
obese <- c(1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0)
smoking <- c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1)
cancer <- c(1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0)
# Combine data vectors into a single data frame #
cancerdata <- data.frame(cancer, smoking, obese, age)
# Create Probit Model #
probitmodel <- glm(cancer ~ smoking + obese + age,
                   family = binomial(link = "probit"), data = cancerdata)
# Build Predictive Structure #
predictdataframe <- data.frame(smoking, obese, age)
# Print Predicted Values to Console #
# predict() returns the linear predictor (link scale) by default. For a probit
# model the inverse link is the standard normal CDF, pnorm() -- not plogis(),
# which is the inverse of the logit link and yields incorrect probabilities.
# predict(probitmodel, predictdataframe, type = "response") is equivalent.
pnorm(predict(probitmodel, predictdataframe))
# Organize predicted values in a variable column adjacent to observed values #
# Store Predicted Values in Variable #
predictedvalues <- predict(probitmodel, predictdataframe)
# Add Variables to Data Frame #
predictdataframe$cancer <- cancer
predictdataframe$predictedvalues <- pnorm(predictedvalues)
# View Results #
predictdataframe
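# Console Output #
# NOTE: the predictedvalues column below was produced by applying plogis() (the inverse logit) to the probit model's linear predictor. The probabilities implied by a probit model are pnorm() of the linear predictor, so they differ from the figures shown here. #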
smoking obese age cancer predictedvalues
1 1 1 55 1 0.7098209
2 0 0 45 0 0.3552599
3 0 0 33 0 0.3076726
4 1 0 22 1 0.6338307
5 1 1 34 0 0.6267316
6 1 1 56 1 0.7134978
7 0 0 78 0 0.4988303
8 0 1 47 0 0.3088181
9 1 1 38 1 0.6433412
10 0 0 68 1 0.4541625
11 0 1 49 0 0.3165195
12 1 1 34 1 0.6267316
13 0 0 28 1 0.2889239
14 1 1 61 1 0.7314569
15 1 0 26 0 0.6503007
Logistic Regression Analysis (Non-Binary Categorical Variables)
# Non-Binary Categorical Variables #
Age <- c(55, 45, 33, 22, 34, 56, 78, 47, 38, 68, 49, 34, 28, 61, 26)
Obese <- c(1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0)
Smoking <- c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1)
Cancer <- c(1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0)
# One 0/1 indicator per category; every row has exactly one indicator set.
White <- c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
African_American <- c(0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Asian <- c(0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0)
Indian <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0)
Native_American <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1)
# The five indicators sum to 1 in every row, so together with the intercept
# they are perfectly collinear. Including all of them leaves one coefficient
# inestimable (NA) and makes predict() operate on a rank-deficient fit.
# Native_American is therefore left out of the formula and serves as the
# reference category; the fitted probabilities are unchanged.
CancerModelLogII <- glm(
  Cancer ~ Age + Obese + Smoking + White + African_American + Asian + Indian,
  family = binomial
)
# Build Predictive Structure #
# Native_American is kept as a column for display; predict() only reads the
# columns named in the model formula.
predictdataframe <- data.frame(Age, Obese, Smoking, White, African_American,
                               Asian, Indian, Native_American)
# Print Predicted Values to Console #
# predict() returns log-odds by default; plogis() converts them to probabilities.
plogis(predict(CancerModelLogII, predictdataframe))
# Organize predicted values in a variable column adjacent to observed values #
# Store Predicted Values in Variable #
predictedvalues <- predict(CancerModelLogII, predictdataframe)
# Add Variables to Data Frame #
predictdataframe$Cancer <- Cancer
predictdataframe$predictedvalues <- plogis(predictedvalues)
# View Results #
predictdataframe
# Console Output #
Age Obese Smoking White African_American Asian Indian Native_American Cancer
1 55 1 1 1 0 0 0 0 1
2 45 0 0 1 0 0 0 0 0
3 33 0 0 1 0 0 0 0 0
4 22 0 1 0 1 0 0 0 1
5 34 1 1 0 1 0 0 0 0
6 56 1 1 0 1 0 0 0 1
7 78 0 0 0 0 1 0 0 0
8 47 1 0 0 0 1 0 0 0
9 38 1 1 0 0 1 0 0 1
10 68 0 0 0 0 0 1 0 1
11 49 1 0 0 0 0 1 0 0
12 34 1 1 0 0 0 1 0 1
13 28 0 0 0 0 0 0 1 1
14 61 1 1 0 0 0 0 1 1
15 26 0 1 0 0 0 0 1 0
predictedvalues
1 0.74330743
2 0.15053796
3 0.10615461
4 0.64063327
5 0.60103365
6 0.75833308
7 0.32059004
8 0.08677812
9 0.59263184
10 0.69613463
11 0.40773029
12 0.89613509
13 0.23207436
14 0.91405050
15 0.85387513
Logistic Regression Analysis
# Model Creation #
# Logistic regression of the 0/1 Cancer outcome on Age, Obese and Smoking.
Age <- c(55, 45, 33, 22, 34, 56, 78, 47, 38, 68, 49, 34, 28, 61, 26)
Obese <- c(1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0)
Smoking <- c(1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1)
Cancer <- c(1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0)
CancerModelLog <- glm(Cancer ~ Age + Obese + Smoking, family = binomial)
# Build Predictive Structure #
# The observed outcome is carried along so it sits beside the predictions.
predictdataframe <- data.frame(Age, Obese, Smoking, Cancer)
# Store Predicted Values in Variable #
# Stored on the log-odds scale (predict()'s default for a glm).
predictedvalues <- predict(CancerModelLog, newdata = predictdataframe)
# Print Predicted Values to Console #
plogis(predictedvalues)
# Organize predicted values in a variable column adjacent to observed values #
# Add Variables to Data Frame #
predictdataframe$predictedvalues <- plogis(predictedvalues)
# View Results #
predictdataframe
# Console Output #
Age Obese Smoking Cancer predictedvalues
1 55 1 1 1 0.8102649
2 45 0 0 0 0.2686795
3 33 0 0 0 0.2043280
4 22 0 1 1 0.7018502
5 34 1 1 0 0.6952985
6 56 1 1 1 0.8148105
7 78 0 0 0 0.4958797
8 47 1 0 0 0.2090126
9 38 1 1 1 0.7199845
10 68 0 0 1 0.4219139
11 49 1 0 0 0.2190519
12 34 1 1 1 0.6952985
13 28 0 0 1 0.1811344
14 61 1 1 1 0.8362786
15 26 0 1 0 0.7262143
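For all of the time-saving capability that the predict() function provides, its usage is rather simple. All that is necessary is that the function be called with the fitted model and a data frame containing the independent variable data for which predictions are wanted. For models fitted with glm(), predict() returns values on the scale of the link function by default, which is why the examples above pass its output through the inverse link (for instance, exp() for the Poisson model and plogis() for the logistic models).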
This concept illustrated, would resemble the following:
predict(linearmodel, new_data_frame_containing_independent_variables)
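For more information pertaining to this function and its customizable options, please consult the link below. Note that this page documents the predict() method of the raster package; the methods used in the examples above are documented within R under ?predict.lm and ?predict.glm.
https://www.rdocumentation.org/packages/raster/versions/2.7-15/topics/predict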
That’s all for now. Stay ambitious, Data Heads!