# This script uses support vector machines to predict the winners of NFL games straight up; the point spread is ignored
# Libs
require(purrr)
require(tidyr)
require(dplyr)
# I like this SVM package a lot. It takes minimal effort to get SVMs up and running
require(e1071)
# Caret for pre-processing stuff
require(caret)
# We'll take the features from nfl_cleane.R and work them over a bit more so they play nicely with an SVM
source("nfl_cleane.R")
SVMFeatures <- features
# Game identifying columns
game_cols <- c("Home_Win", "Home_Team", "Home_Score", "Away_Score", "Year", "Week")
ratios <- as.data.frame(SVMFeatures[,game_cols])
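# svm() only treats this as a classification problem if the response is a
# factor; coercing here is a no-op if nfl_cleane.R already returns it as one
ratios$Home_Win <- as.factor(ratios$Home_Win)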
# Convert game stats into ratios with the home stat in the numerator and the
# away stat in the denominator. I think this should offer a small performance
# improvement over keeping the home and away columns separate, but I haven't
# definitively confirmed that. These ratio columns represent the performance
# of the home and away teams relative to each other, which I think simplifies
# the number crunching for the SVM (again, not confirmed). One thing to keep
# in mind is that by organizing the data this way, we lose the ability to
# feature-select on the home and away columns individually. Certain features
# could be relevant for the away team but not the home team, and vice versa.
# I won't chase that down here because this is just a proof of concept that we
# can in fact predict games with some level of accuracy. We can feature-select
# in more advanced models (that incorporate spreads) later on if need be.
ratios$Points <- SVMFeatures$Trail_Home_Score / SVMFeatures$Trail_Away_Score
ratios$Allowed <- SVMFeatures$Home_Allowed / SVMFeatures$Away_Allowed
ratios$TO <- SVMFeatures$Home_TO / SVMFeatures$Away_TO
ratios$FTO <- SVMFeatures$Home_FTO / SVMFeatures$Away_FTO
ratios$Pass_Eff <- SVMFeatures$Home_Pass_Eff / SVMFeatures$Away_Pass_Eff
ratios$Pass_Def <- SVMFeatures$Home_Pass_Def / SVMFeatures$Away_Pass_Def
ratios$Rush_Eff <- SVMFeatures$Home_Rush_Eff / SVMFeatures$Away_Rush_Eff
ratios$Rush_Def <- SVMFeatures$Home_Rush_Def / SVMFeatures$Away_Rush_Def
ratios$PenYds <- SVMFeatures$Home_Pen_Yds / SVMFeatures$Away_Pen_Yds
ratios$PenYdsAgg <- SVMFeatures$Home_Pen_Yds_Agg / SVMFeatures$Away_Pen_Yds_Agg
ratios$ThirdEff <- SVMFeatures$Home_Third_Eff / SVMFeatures$Away_Third_Eff
ratios$ThirdDef <- SVMFeatures$Home_Third_Def / SVMFeatures$Away_Third_Def
ratios$FourthEff <- SVMFeatures$Home_Fourth_Eff / SVMFeatures$Away_Fourth_Eff
ratios$FourthDef <- SVMFeatures$Home_Fourth_Def / SVMFeatures$Away_Fourth_Def
# Replace Infs (from division by zero) with 1. Not ideal; Laplace-style
# smoothing would be worth looking into. Note that 0/0 yields NaN rather
# than Inf, so use is.finite() to catch both cases
ratios$FourthEff[!is.finite(ratios$FourthEff)] <- 1
ratios$FourthDef[!is.finite(ratios$FourthDef)] <- 1
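# A cleaner fix would be Laplace-style (additive) smoothing, which keeps the
# ratios finite in the first place. Sketch only; smooth_ratio and k = 1 are
# illustrative choices, not part of the pipeline above:
# smooth_ratio <- function(num, den, k = 1) (num + k) / (den + k)
# ratios$FourthEff <- smooth_ratio(SVMFeatures$Home_Fourth_Eff, SVMFeatures$Away_Fourth_Eff)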
# Every game appears twice in the data (once per team), so drop one copy of each. It doesn't matter which one
ratios <- ratios[!duplicated(ratios[, -(1:8)]), ]
# Remove Home_Score and Away_Score; the final score would give away the outcome we're trying to predict
ratios <- ratios[, -c(3:4)]
# Set seed for reproducibility
set.seed(42)
# Create the training set from seasons before 2018; 2018 is held out as the test set
train <- ratios %>%
  filter(Year < 2018)
test <- ratios %>%
  filter(Year == 2018)
# Do some basic pre-processing. I tried lots of other pre-processing
# combinations but didn't see any difference in performance.
# Fit the pre-processor on the training data only (so no test-set statistics
# leak into the transform), then apply the same preProcValues to every data set
preProcValues <- preProcess(train, method = c("center", "scale"))
trainTransformed <- predict(preProcValues, train)
testTransformed <- predict(preProcValues, test)
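# Quick sanity check: a training column should now have mean ~0 and sd ~1
# (Points is picked arbitrarily here)
round(c(mean = mean(trainTransformed$Points), sd = sd(trainTransformed$Points)), 3)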
# Hold out 10% of the training data as a validation set
# (without the seed set above, this split would differ on every run)
# Home wins happen more often than home losses, so we stratify the partition
# on Home_Win: each partition keeps roughly the same proportion of home wins.
# Otherwise we open up the possibility of the model over-favoring one outcome
# more than it should
train_ind <- createDataPartition(trainTransformed$Home_Win, p = 0.9, list = FALSE)
trainSample <- trainTransformed[train_ind, ]
validSample <- trainTransformed[-train_ind, ]
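# Sanity check that the stratified split preserved the home-win rate:
# the two proportion tables should be nearly identical
prop.table(table(trainSample$Home_Win))
prop.table(table(validSample$Home_Win))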
# Use a grid search to find the best parameters, kernel, etc.
# This can take some time if the grid increments are small
# Start with the linear kernel because it is the least time-consuming to tune
# The home team won just under 57% of the time over the 2009-2017 seasons;
# that's the benchmark the SVM needs to beat
# Note: gamma has no effect on the linear kernel, so we tune cost instead
# (grid chosen to bracket the cost = 0.1 used below)
tuned <- tune.svm(Home_Win~., data = trainSample, cost = 10^seq(-3, 1, 1), kernel = "linear")
summary(tuned)
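# Read the winner off the tune object directly instead of eyeballing the
# summary; best.performance is the cross-validated error rate, so accuracy
# is 1 minus that value
tuned$best.parameters
1 - tuned$best.performance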
# ~64.7% accuracy with this. Beats our benchmark, but nothing to get too excited about
svmLinear <- svm(Home_Win~., data = trainSample, cost = 0.1, kernel = "linear")
# Using the svm, predict on the validation set
pred <- predict(svmLinear, validSample)
# Fraction of games predicted correctly
record <- pred == validSample$Home_Win
mean(record)
# Functionalize the above performance check for convenience
# Takes a model and a data set and returns the model's accuracy on it
svmPerformance <- function(model, data) {
  pred <- predict(model, data)
  mean(pred == data$Home_Win)
}
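# A confusion matrix is a useful companion to raw accuracy: it shows whether
# the model is just over-predicting the more common home-win outcome
table(predicted = predict(svmLinear, validSample), actual = validSample$Home_Win)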
# According to this paper: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.141.880&rep=rep1&type=pdf
# a radial kernel should do at least as well as a linear one, so if we want to
# squeeze more performance out of our SVM this seems like the natural next step
# I don't want to waste too much time tuning parameters. It can be helpful to
# visualize the error curves to figure out the parameter range where the error
# is minimized; this gives us good bounds for the grid search
# Grid search with both hyper-params
tuned <- tune.svm(Home_Win~., data = trainSample, cost = 2^seq(-3, 1, 1), gamma = 2^seq(-6, 0, 1), kernel = "radial")
plot(tuned)
# Based on the above grid search we haven't narrowed down cost much, but we
# know gamma < 0.4 looks good. You can repeat the search with more granular
# increments over a narrower grid (sketched below), which should save time
# versus brute-forcing every possible combination
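# An illustrative refinement pass over the promising region; commented out
# because every tune.svm call retrains many models and this one is slow.
# The grid bounds here are assumptions, not tuned values:
# tuned_fine <- tune.svm(Home_Win~., data = trainSample,
#                        cost = 2^seq(-2, 1, 0.25), gamma = 2^seq(-6, -3, 0.5),
#                        kernel = "radial")
# tuned_fine$best.parameters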
# With these params we can match the linear kernel's ~64.7% accuracy but not
# do any better. It may still be possible for radial to out-perform linear,
# but I don't want to spend too much time on the SVMs
svmRadial <- svm(Home_Win~., data = trainSample, gamma = 0.03, cost = 0.5, kernel = "radial")
svmPerformance(svmRadial, validSample)
# Sigmoid kernel is also worth a shot
tuned <- tune.svm(Home_Win~., data = trainSample, cost = 2^seq(-3, 3, 1), gamma = 2^seq(-6, 0, 1), kernel = "sigmoid")
plot(tuned)
# From the above plot, gamma < 0.4 and cost < 8 look like good boundaries
# With these params, the sigmoid kernel gives us a slight accuracy boost on the validation set: ~65.7%
svmSigmoid <- svm(Home_Win~., data = trainSample, cost = 6.5, gamma = 0.01, kernel = "sigmoid")
svmPerformance(svmSigmoid, validSample)
# Finally, compare the performance of all three SVMs on the test set
svmPerformance(svmLinear, testTransformed)
svmPerformance(svmRadial, testTransformed)
svmPerformance(svmSigmoid, testTransformed)
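# The same three numbers collected side by side for easier comparison
data.frame(kernel = c("linear", "radial", "sigmoid"),
           test_accuracy = c(svmPerformance(svmLinear, testTransformed),
                             svmPerformance(svmRadial, testTransformed),
                             svmPerformance(svmSigmoid, testTransformed)))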
# The linear kernel performs best on the test set at ~64.2%, then radial at ~63.1%, with sigmoid last at ~60.2%
# The linear kernel is the easiest to tune and performs the best overall