-
Notifications
You must be signed in to change notification settings - Fork 0
/
class_example.R
107 lines (76 loc) · 2.6 KB
/
class_example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Ana Bravo
# Practice Tidy Models
# 2022-07-25
library(skimr)
library(PerformanceAnalytics)
library(splines)
library(tidyverse)
library(tidymodels)
skimr::skim(MASS::Boston)
PerformanceAnalytics::chart.Correlation(MASS::Boston, method = "spearman")
# The black proportion transformation is defined as the squared distance from
# 63%. I have no idea why. I want to "un-transform" this, but I can't find an
# inverse transformation that uniquely defines the neighborhood proportion of
# blacks from this scale variable (e.g., values between 0 and 136.9 map back
# to two different proportions of black inhabitants).
plot(
x = seq(0, 1, length.out = 50),
y = 1000 * (seq(0, 1, length.out = 50) - 0.63)^2
)
# In the 1970 Census, Boston had 16.3% Black population. I am assuming there
# are no homes from neighborhoods with more than 110% Black people (the other
# solution to 1000 * {x - 0.63}^2 = 218.089. Therefore, an "average" amount of
# Black people in a Boston neighbourhood would correspond to a score of ~218.
table(MASS::Boston$black > 218)
# https://en.wikipedia.org/wiki/Boston
bostonFull_df <-
MASS::Boston %>%
mutate(on_river = chas == 1) %>%
mutate(is_large_lot_zoned = zn > 0) %>%
# 96% of Boston was either white or black in 1970
mutate(in_white_nbd = black > 218) %>%
# 1970s had much higher crime rates than today, but 1/1000 is reasonable for
# a "low crime" indicator
mutate(has_low_crime = crim <= 1) %>%
mutate(rad_idx = factor(rad, ordered = TRUE)) %>%
rename(value = medv) %>%
mutate(log(value)) |>
select(-chas, -zn, -crim, -black, -rad) %>%
select(value, everything()) %>%
as_tibble()
PerformanceAnalytics::chart.Correlation(
# bostonFull_df,
bostonFull_df[, -14],
method = "spearman"
)
# split data
set.seed(1995)
housing_split <- initial_split(bostonFull_df)
housing_train <- training(housing_split)
housing_test <- testing(housing_split)
housing_split
# build the receipt
housing_receipt <-
recipe(formula = value ~ ., data = bostonFull_df) |>
step_normalize(all_numeric_predictors()) |>
step_zv(all_predictors()) |>
step_dummy(all_nominal_predictors())
# do specifications
kknn_spec <-
nearest_neighbor(neighbors = 5) |>
set_mode("regression") |>
set_engine("kknn")
# do workflow for kknn
kknn_workflow <-
workflow() |>
add_recipe(housing_receipt) |>
add_model(kknn_spec)
kknn_workflow
# fit the model in the training set
housing_fit <- fit(kknn_workflow, housing_train)
housing_fit
# check predictions
housing_pred <-
augment(housing_fit, housing_train) |>
select(value, .pred)
housing_pred