# Workspace reset is deliberately left commented out so it never runs by accident.
# rm(list = ls())

# The data ship with the corrgram package; normally you would import a file
# with read.csv() instead.
# install.packages("corrgram")
library(corrgram)
BaseballData <- corrgram::baseball
# Per the original notes: 322 Major League Baseball regular and substitute
# hitters from 1986, along with some statistics about their careers.
# Expect a data frame with 322 rows (people) and 22 columns (variables).
dim(BaseballData)

# Look at the available columns, then keep only columns 2 through 21.
colnames(BaseballData)
BaseballData <- BaseballData[, 2:21]

# Rename the retained columns to be more interpretable.
names(BaseballData) <- c(
  "League", "Team", "Position",
  "SeasonAtBats", "SeasonHits", "SeasonHomeRuns",
  "SeasonRuns", "SeasonRBIs", "SeasonWalks",
  "CareerYears", "CareerAtBats", "CareerHits", "CareerHomeRuns",
  "CareerRuns", "CareerRBIs", "CareerWalks",
  "SeasonPutouts", "SeasonAssists", "SeasonErrors",
  "SeasonSalary"
)

# Keep only players with more than 100 season at-bats (some of the rest are outliers).
BaseballData <- BaseballData[BaseballData$SeasonAtBats > 100, ]
# Drop utility players, i.e. those who play multiple positions.
BaseballData <- BaseballData[BaseballData$Position != "UT", ]

# Derived variables computed from existing columns.
BaseballData$SeasonBattingAvg <- with(BaseballData, SeasonHits / SeasonAtBats)
BaseballData$CareerHitsPerYear <- with(BaseballData, CareerHits / CareerYears)

# Remove every row that has a missing value, then check the remaining size.
BaseballData <- na.omit(BaseballData)
dim(BaseballData)
# Regression with one predictor. The model is fitted once and reused both for
# the line of best fit on the scatterplot and for the summary output.
SimpleModel <- lm(SeasonBattingAvg ~ SeasonSalary, data = BaseballData)

# Scatterplot with the regression line drawn on top.
plot(
  BaseballData$SeasonSalary, BaseballData$SeasonBattingAvg,
  main = "Salary and Season Batting Average",
  xlab = "Salary (thousands of $ per year)",
  ylab = "Season Batting Average"
)
abline(SimpleModel, col = "red", lwd = 3)

# Most of the important information about the regression model.
summary(SimpleModel)

# Standardized coefficients (betas) come from QuantPsyc.
# install.packages("QuantPsyc")
library(QuantPsyc)
lm.beta(SimpleModel)

# Correlation between the same two variables.
cor.test(BaseballData$SeasonSalary, BaseballData$SeasonBattingAvg)

# Regression with two predictors.
TwoPredModel <- lm(
  SeasonBattingAvg ~ CareerYears + CareerHitsPerYear,
  data = BaseballData
)
summary(TwoPredModel)
lm.beta(TwoPredModel)

# Same model with SeasonSalary added as a third predictor.
ThreePredModel <- lm(
  SeasonBattingAvg ~ CareerYears + CareerHitsPerYear + SeasonSalary,
  data = BaseballData
)
summary(ThreePredModel)
lm.beta(ThreePredModel)

# Increase in R-squared from adding the predictor ...
RSquaredIncrease <- summary(ThreePredModel)$r.squared - summary(TwoPredModel)$r.squared
RSquaredIncrease
# ... and its square root, the semi-partial correlation, which may be more interpretable.
sqrt(RSquaredIncrease)

# Model comparison; the p-value matches that of the added predictor.
anova(TwoPredModel, ThreePredModel)
# One dichotomous predictor: equivalent to a t-test.
# Plotting a factor against a numeric variable gives a boxplot.
plot(
  BaseballData$League, BaseballData$SeasonBattingAvg,
  xlab = "League", ylab = "Season Batting Average"
)
# Regression model with League as the only predictor.
DichotPredModel <- lm(SeasonBattingAvg ~ League, data = BaseballData)
summary(DichotPredModel)
lm.beta(DichotPredModel)
# The beta here is the correlation, which is a measure of effect size/consistency.

# Multi-categorical predictor.
# Which values can Position take?
levels(BaseballData$Position)
# "UT" is still listed: the rows were deleted but the factor keeps the level.
# Rebuilding the factor from its character values drops the unused level.
BaseballData$Position <- factor(as.character(BaseballData$Position))
levels(BaseballData$Position)

# How many people play each position?
# install.packages("plyr")
library(plyr)
count(BaseballData$Position)

# Show the coding scheme currently in use, then switch to dummy codes with
# OF (the 6th of the 7 levels) as the reference group.
contrasts(BaseballData$Position)
contrasts(BaseballData$Position) <- contr.treatment(7, base = 6)

# Boxplot of batting average by position.
plot(
  BaseballData$Position, BaseballData$SeasonBattingAvg,
  xlab = "Position", ylab = "Season Batting Average"
)
# Regression model; betas are not very useful with multi-categorical predictors.
CategoricalModel <- lm(SeasonBattingAvg ~ Position, data = BaseballData)
summary(CategoricalModel)
lm.beta(CategoricalModel)

# The same analysis expressed as an ANOVA, with eta squared from sjstats.
CategoricalANOVA <- aov(SeasonBattingAvg ~ Position, data = BaseballData)
summary(CategoricalANOVA)
# install.packages("sjstats")
library(sjstats)
eta_sq(CategoricalANOVA)
# They are the same!

# ANCOVA: continuous and categorical predictors in one model.
ANCOVAModel <- lm(
  SeasonBattingAvg ~ CareerYears + CareerHitsPerYear + SeasonSalary + League + Position,
  data = BaseballData
)
summary(ANCOVAModel)
lm.beta(ANCOVAModel)
# Non-linear relationships: scatterplot with a lowess smoother.
plot(
  BaseballData$CareerYears, BaseballData$SeasonBattingAvg,
  xlab = "Career Years", ylab = "Season Batting Average"
)
lines(lowess(BaseballData$CareerYears, BaseballData$SeasonBattingAvg), col = "blue", lwd = 3)

# Quadratic model on the mean-centered predictor and its square.
BaseballData$CareerYearsCentered <- scale(BaseballData$CareerYears, scale = FALSE)
BaseballData$CareerYearsCenteredSq <- BaseballData$CareerYearsCentered^2
QuadraticModel <- lm(
  SeasonBattingAvg ~ CareerYearsCentered + CareerYearsCenteredSq,
  data = BaseballData
)
summary(QuadraticModel)
lm.beta(QuadraticModel)
# Raw data again, with the quadratic model's fitted values overlaid.
plot(
  BaseballData$CareerYears, BaseballData$SeasonBattingAvg,
  xlab = "Career Years", ylab = "Season Batting Average"
)
points(BaseballData$CareerYears, QuadraticModel$fitted.values, col = "blue", lwd = 3)

# Interactions: career years by (centered) career hits per year.
BaseballData$CareerHitsPerYearCentered <- scale(BaseballData$CareerHitsPerYear, scale = FALSE)
InteractionModel <- lm(
  SeasonBattingAvg ~ CareerYears * CareerHitsPerYearCentered,
  data = BaseballData
)
summary(InteractionModel)
lm.beta(InteractionModel)

# Interaction plot: split CareerYears into low / medium / high, where low and
# high mean more than one standard deviation below / above the mean.
InteractionData <- BaseballData[, c("SeasonBattingAvg", "CareerYears", "CareerHitsPerYear")]
CareerYearsMean <- mean(InteractionData$CareerYears)
CareerYearsSD <- sd(InteractionData$CareerYears)
InteractionData$CareerYearsCat <- "medium"
InteractionData$CareerYearsCat[InteractionData$CareerYears < CareerYearsMean - CareerYearsSD] <- "low"
InteractionData$CareerYearsCat[InteractionData$CareerYears > CareerYearsMean + CareerYearsSD] <- "high"

# One colour per group. The first group opens the plot (and so fixes the axis
# ranges); the others are added as points. Each group gets its own fitted line.
InteractionColors <- c(low = "green3", medium = "blue3", high = "brown")
for (Category in names(InteractionColors)) {
  InCategory <- InteractionData$CareerYearsCat == Category
  GroupHitsPerYear <- InteractionData$CareerHitsPerYear[InCategory]
  GroupBattingAvg <- InteractionData$SeasonBattingAvg[InCategory]
  GroupColor <- InteractionColors[[Category]]
  if (Category == "low") {
    plot(
      GroupHitsPerYear, GroupBattingAvg, col = GroupColor, lwd = 3,
      xlab = "Career Hits Per Year", ylab = "Season Batting Average"
    )
  } else {
    points(GroupHitsPerYear, GroupBattingAvg, col = GroupColor, lwd = 3)
  }
  abline(lm(GroupBattingAvg ~ GroupHitsPerYear), col = GroupColor, lwd = 3)
}
legend(
  "bottomright",
  legend = c("Low Career Years", "Medium Career Years", "High Career Years"),
  col = c("green3", "blue3", "brown"), lty = 1, lwd = 3
)
# Spline regression on simulated data.
# The knot location is defined once so the simulation and the spline terms
# cannot drift apart.
SplineKnot <- 1

# Simulate 500 points; observations above the knot get an extra 2 * X added to Y.
set.seed(pi)
SplineData <- data.frame(matrix(nrow = 500, ncol = 0))
SplineData$X <- rnorm(500)
SplineData$Y <- SplineData$X + rnorm(500)
AboveKnot <- SplineData$X > SplineKnot
SplineData$Y[AboveKnot] <- SplineData$Y[AboveKnot] + 2 * SplineData$X[AboveKnot]

# Plot the data with a lowess curve.
plot(SplineData$X, SplineData$Y)
lines(lowess(SplineData$X, SplineData$Y), col = "blue", lwd = 3)

# Two variables, one per spline segment:
#   X1 is X capped at the knot value;
#   X2 is the distance above the knot, and 0 at or below it.
# pmin()/pmax() also give X2 = 0 when X equals the knot exactly, a case the
# separate "< knot" and "> knot" assignments left at the raw value.
SplineData$X1 <- pmin(SplineData$X, SplineKnot)
SplineData$X2 <- pmax(SplineData$X - SplineKnot, 0)

# Regression model; betas are not very useful here.
SplineModel <- lm(Y ~ X1 + X2, data = SplineData)
summary(SplineModel)
lm.beta(SplineModel)
# Count outcome - negative binomial regression.
# install.packages("psych")
library(psych)
describe(BaseballData)
# Poisson regression assumes the variance of the count equals its mean.
# Rather than rely on an assumption that can never really be true,
# we use negative binomial regression.
# install.packages("MASS")
library(MASS)
NegBinomModel <- glm.nb(CareerAtBats ~ CareerYears, data = BaseballData)
summary(NegBinomModel)
plot(
  BaseballData$CareerYears, BaseballData$CareerAtBats,
  xlab = "Career Years", ylab = "Career At-bats"
)
points(BaseballData$CareerYears, NegBinomModel$fitted.values, col = "red", lwd = 3)

# The outcome is modelled on the log scale, so if the raw relationship looks
# linear the predictor should be logged as well.
NegBinomModel2 <- glm.nb(CareerAtBats ~ log(CareerYears), data = BaseballData)
summary(NegBinomModel2)
plot(
  BaseballData$CareerYears, BaseballData$CareerAtBats,
  xlab = "Career Years", ylab = "Career At-bats"
)
points(BaseballData$CareerYears, NegBinomModel2$fitted.values, col = "red", lwd = 3)
anova(NegBinomModel, NegBinomModel2)

# Zero-inflated model.
# Build an outcome with many zeros: season hits minus 100, floored at 0.
BaseballData$SeasonHitsZeroInflated <- pmax(BaseballData$SeasonHits - 100, 0)
sum(BaseballData$SeasonHitsZeroInflated == 0)
plot(BaseballData$SeasonSalary, BaseballData$SeasonHitsZeroInflated)
# install.packages("pscl")
library(pscl)
ZeroInflatedModel <- zeroinfl(
  SeasonHitsZeroInflated ~ SeasonSalary,
  data = BaseballData, dist = "negbin"
)
summary(ZeroInflatedModel)
# Raw data with the model's fitted values overlaid.
plot(BaseballData$SeasonSalary, BaseballData$SeasonHitsZeroInflated)
points(BaseballData$SeasonSalary, ZeroInflatedModel$fitted.values, col = "red", lwd = 3)
# Proportional outcome - beta regression.
# We probably should have been modeling season batting average as a proportion
# this whole time: the beta distribution is bounded by 0 and 1, just as a
# proportion is.
# install.packages("betareg")
library(betareg)
BetaModel <- betareg(SeasonBattingAvg ~ SeasonSalary, data = BaseballData)
summary(BetaModel)

# Fitted curves are drawn left to right by ordering BOTH salary and the fitted
# values by salary. Sorting the two vectors independently would pair the
# smallest salary with the smallest fitted value, which draws the wrong curve
# whenever the fitted relationship is decreasing.
SalaryOrder <- order(BaseballData$SeasonSalary)
SortedSalary <- BaseballData$SeasonSalary[SalaryOrder]

plot(
  BaseballData$SeasonSalary, BaseballData$SeasonBattingAvg,
  xlab = "Season Salary", ylab = "Season Batting Average"
)
lines(SortedSalary, BetaModel$fitted.values[SalaryOrder], lwd = 3, col = "red")

# Quantile (percentile) regression.
# More robust to outliers and heteroscedasticity.
# install.packages("quantreg")
library(quantreg)
# Start by predicting the median of Y at each X (rather than the mean, as in OLS).
QuantileModel.5 <- rq(SeasonBattingAvg ~ SeasonSalary, data = BaseballData, tau = 0.5)
summary(QuantileModel.5)
plot(
  BaseballData$SeasonSalary, BaseballData$SeasonBattingAvg,
  xlab = "Season Salary", ylab = "Season Batting Average"
)
lines(SortedSalary, QuantileModel.5$fitted.values[SalaryOrder], lwd = 3, col = "green3")
legend("bottomright", legend = "50th percentile", col = "green3", lty = 1, lwd = 3)

# 80th percentile of Y at each X; the legend is redrawn to include the new line.
QuantileModel.8 <- rq(SeasonBattingAvg ~ SeasonSalary, data = BaseballData, tau = 0.8)
summary(QuantileModel.8)
lines(SortedSalary, QuantileModel.8$fitted.values[SalaryOrder], lwd = 3, col = "blue3")
legend(
  "bottomright", legend = c("80th percentile", "50th percentile"),
  col = c("blue3", "green3"), lty = 1, lwd = 3
)

# 20th percentile of Y at each X.
QuantileModel.2 <- rq(SeasonBattingAvg ~ SeasonSalary, data = BaseballData, tau = 0.2)
summary(QuantileModel.2)
lines(SortedSalary, QuantileModel.2$fitted.values[SalaryOrder], lwd = 3, col = "brown")
legend(
  "bottomright", legend = c("80th percentile", "50th percentile", "20th percentile"),
  col = c("blue3", "green3", "brown"), lty = 1, lwd = 3
)
# Dichotomous outcome - logistic regression.
# The outcome is infielder vs. outfielder.
# Designated hitters have no fielding position, so they are removed.
# NOTE: this permanently drops the DH rows from BaseballData.
BaseballData <- BaseballData[BaseballData$Position != "DH", ]
# Convert the Position factor to a character vector so it can take different values.
BaseballData$Position <- as.character(BaseballData$Position)

# New 0/1 variable: outfielders are coded 1, everyone else 0.
BaseballData$DichotomousPosition <- 0
BaseballData$DichotomousPosition[BaseballData$Position == "OF"] <- 1

# Add .05 to every outfielder's batting average before fitting the model.
# NOTE: this overwrites SeasonBattingAvg in place, so every later section that
# uses SeasonBattingAvg sees the shifted values.
IsOutfielder <- BaseballData$DichotomousPosition == 1
BaseballData$SeasonBattingAvg[IsOutfielder] <- BaseballData$SeasonBattingAvg[IsOutfielder] + .05

LogisticModel <- glm(DichotomousPosition ~ SeasonBattingAvg, data = BaseballData, family = binomial)
summary(LogisticModel)

plot(
  BaseballData$SeasonBattingAvg, BaseballData$DichotomousPosition, cex = 2,
  xlab = "Season Batting Average", ylab = "Probability of Outfielder"
)
# Draw the fitted probability curve by ordering both vectors by batting average.
# Sorting them independently would mis-pair x and fitted values if the slope
# were negative.
BattingAvgOrder <- order(BaseballData$SeasonBattingAvg)
lines(
  BaseballData$SeasonBattingAvg[BattingAvgOrder],
  LogisticModel$fitted.values[BattingAvgOrder],
  lwd = 3, col = "red"
)
# Ordinal outcome - ordered logistic regression.
# Make an ordinal variable out of season hits by keeping only 1 significant digit.
BaseballData$SeasonHits1SignificantDigit <- as.factor(signif(BaseballData$SeasonHits, 1))
count(BaseballData$SeasonHits1SignificantDigit)
library(MASS)
OrdinalLogisticModel <- polr(
  SeasonHits1SignificantDigit ~ SeasonWalks,
  data = BaseballData, Hess = TRUE
)
summary(OrdinalLogisticModel)
# Intercepts are not very useful; the exponentiated slope is an odds multiplier.
exp(coef(OrdinalLogisticModel))
# For each extra Season Walk, the odds of Season Hits being in a category vs.
# the category below it are multiplied by this value (about 1.06 per the
# original author's notes).

# polr gives no p-values, but the t value can be converted into one.
# abs() makes the two-sided p-value valid for negative t values too; without it
# a negative t would produce a "p-value" greater than 1.
OrdinalTValue <- summary(OrdinalLogisticModel)$coefficients[1, 3]
pt(abs(OrdinalTValue), nrow(BaseballData) - 2, lower.tail = FALSE) * 2

# Predicted probability of each ordinal category at every distinct SeasonWalks value.
# The column names below assume the nine categories 30, 40, ..., 100, 200.
HitsProbabilityColumns <- c(
  "P30Hits", "P40Hits", "P50Hits", "P60Hits",
  "P70Hits", "P80Hits", "P90Hits", "P100Hits", "P200Hits"
)
UniqueWalkRows <- !duplicated(BaseballData$SeasonWalks)
OrdinalLogisticProbabilities <- data.frame(
  BaseballData$SeasonWalks[UniqueWalkRows],
  predict(OrdinalLogisticModel, BaseballData[UniqueWalkRows, ], type = "probs")
)
colnames(OrdinalLogisticProbabilities) <- c("SeasonWalks", HitsProbabilityColumns)
OrdinalLogisticProbabilities <- OrdinalLogisticProbabilities[order(OrdinalLogisticProbabilities$SeasonWalks), ]
OrdinalLogisticProbabilities

# One probability curve per category, coloured in category order.
HitsColors <- rainbow(9)
plot(
  OrdinalLogisticProbabilities$SeasonWalks, OrdinalLogisticProbabilities$P30Hits,
  xlab = "Season Walks", ylab = "Probability",
  lwd = 3, type = "l", col = HitsColors[1], ylim = c(0, 1)
)
for (i in seq_along(HitsProbabilityColumns)[-1]) {
  lines(
    OrdinalLogisticProbabilities$SeasonWalks,
    OrdinalLogisticProbabilities[[HitsProbabilityColumns[i]]],
    lwd = 3, col = HitsColors[i]
  )
}
legend(
  "topleft",
  legend = c(
    "30 Hits", "40 Hits", "50 Hits", "60 Hits", "70 Hits",
    "80 Hits", "90 Hits", "100 Hits", "200 Hits"
  ),
  col = HitsColors, lty = 1, lwd = 3
)
# Categorical outcome - multinomial logistic regression.
# We will predict position from batting average.
# install.packages("nnet")
library(nnet)
# The outcome needs to be a factor.
BaseballData$Position <- as.factor(BaseballData$Position)
# Make catchers the baseline group by moving their level to the front.
# relevel() reorders the levels without touching the data. Assigning a new
# vector to levels() would instead RENAME the existing (alphabetically sorted)
# levels, e.g. relabelling first basemen as catchers.
# trimws() locates the catcher level whether or not it carries trailing
# whitespace ("C" or "C "); relevel() errors if no such level exists.
PositionLevels <- levels(BaseballData$Position)
CatcherLevel <- PositionLevels[trimws(PositionLevels) == "C"]
BaseballData$Position <- relevel(BaseballData$Position, ref = CatcherLevel)

MultinomialLogisticModel <- multinom(Position ~ SeasonBattingAvg, data = BaseballData)
# Compute the summary once and reuse it below.
MultinomialSummary <- summary(MultinomialLogisticModel)
MultinomialSummary
# Intercepts are not very useful. Each exponentiated slope is the factor by
# which the odds of that position vs. catcher are multiplied when batting
# average increases by 1 (a lot!).
exp(MultinomialSummary$coefficients)

# No significance testing is provided, but the coefficients divided by their
# standard errors give t values, which can be converted to p values.
ts <- MultinomialSummary$coefficients / MultinomialSummary$standard.errors
ts
# abs() keeps the two-sided p-values valid (<= 1) for negative t values.
pt(abs(ts), nrow(BaseballData) - 2, lower.tail = FALSE) * 2

# Predicted probability of each position at every distinct SeasonBattingAvg value.
# predict() returns one column per factor level, in level order:
# catcher, 1B, 2B, 3B, OF, SS.
UniqueBattingAvgRows <- !duplicated(BaseballData$SeasonBattingAvg)
MultinomialLogisticProbabilities <- data.frame(
  BaseballData$SeasonBattingAvg[UniqueBattingAvgRows],
  predict(MultinomialLogisticModel, BaseballData[UniqueBattingAvgRows, ], type = "probs")
)
colnames(MultinomialLogisticProbabilities) <- c(
  "SeasonBattingAvg", "Catcher", "FirstBaseman", "SecondBaseman",
  "ThirdBaseman", "Outfielder", "Shortstop"
)
MultinomialLogisticProbabilities <- MultinomialLogisticProbabilities[order(MultinomialLogisticProbabilities$SeasonBattingAvg), ]
MultinomialLogisticProbabilities[100:234, ]

# One probability curve per position; the drawing order matches the legend.
PositionPlotColumns <- c(
  "Catcher", "FirstBaseman", "SecondBaseman",
  "ThirdBaseman", "Shortstop", "Outfielder"
)
PositionColors <- rainbow(6)
plot(
  MultinomialLogisticProbabilities$SeasonBattingAvg,
  MultinomialLogisticProbabilities$Catcher,
  xlab = "Season Batting Average", ylab = "Probability",
  lwd = 3, type = "l", col = PositionColors[1], ylim = c(0, 1)
)
for (i in seq_along(PositionPlotColumns)[-1]) {
  lines(
    MultinomialLogisticProbabilities$SeasonBattingAvg,
    MultinomialLogisticProbabilities[[PositionPlotColumns[i]]],
    lwd = 3, col = PositionColors[i]
  )
}
legend(
  "topleft",
  legend = c(
    "Catcher", "1st Baseman", "2nd Baseman",
    "3rd Baseman", "Shortstop", "Outfielder"
  ),
  col = PositionColors, lty = 1, lwd = 3
)
# Multilevel modeling.
# install.packages("lme4")
library(lme4)
# Players are treated as nested within teams.
# Predicting season salary from season home runs, with a random intercept and
# a random home-run slope for each team.
MultilevelModel <- lmer(
  SeasonSalary ~ SeasonHomeRuns + (SeasonHomeRuns | Team),
  data = BaseballData, REML = FALSE
)
summary(MultilevelModel)
# No p values - what are the degrees of freedom?
# Rough estimates of degrees of freedom and p-values come from the lmerTest
# package, which uses the Satterthwaite approximation.
# install.packages("lmerTest")
library(lmerTest)
# Refit after loading lmerTest so the summary includes those estimates.
MultilevelModel <- lmer(
  SeasonSalary ~ SeasonHomeRuns + (SeasonHomeRuns | Team),
  data = BaseballData, REML = FALSE
)
summary(MultilevelModel)

# Model comparison (using likelihood ratios for more accurate results):
# the base model drops the SeasonHomeRuns fixed effect.
MultilevelBaseModel <- lmer(
  SeasonSalary ~ 1 + (SeasonHomeRuns | Team),
  data = BaseballData, REML = FALSE
)
summary(MultilevelBaseModel)
anova(MultilevelBaseModel, MultilevelModel)

# A correlation-like value: the proportional reduction in error (residual)
# variance is a pseudo-R-squared. The variances are taken from the fitted
# models rather than typed in by hand, so they always match the models above.
BaseResidualVariance <- sigma(MultilevelBaseModel)^2
FullResidualVariance <- sigma(MultilevelModel)^2
PseudoRSquared <- (BaseResidualVariance - FullResidualVariance) / BaseResidualVariance
PseudoRSquared
# NaN here means the residual variance did not decrease.
PseudoR <- sqrt(PseudoRSquared)
PseudoR
#bootstrapping
#install.packages("boot")
library(boot)
# Statistic function for boot(): refits the regression on one resample and
# returns its coefficients.
#   formula - model formula handed to lm()
#   data    - the full data frame
#   indices - row indices of the resample
bs <- function(formula, data, indices) {
  resampled <- data[indices, ]
  coef(lm(formula, data = resampled))
}
# Bootstrap the regression coefficients with 10000 resamples; the formula is
# forwarded to bs() through boot()'s extra arguments.
BootResults <- boot(
  data = BaseballData,
  statistic = bs,
  R = 10000,
  formula = SeasonBattingAvg ~ CareerYears + CareerHitsPerYear + SeasonSalary
)
BootResults
# Distribution and basic bootstrap confidence interval for the second
# coefficient (index 1 is the intercept).
plot(BootResults, index = 2)
boot.ci(BootResults, type = "basic", index = 2)
# Regularization.
# Baseline: an ordinary multiple regression model.
# The outcome is batting average, but plain linear regression is used anyway.
MultipleRegressionModel <- lm(
  SeasonBattingAvg ~ SeasonAtBats + SeasonHits + SeasonHomeRuns + SeasonRuns +
    SeasonRBIs + SeasonWalks + SeasonPutouts + SeasonAssists + SeasonErrors +
    SeasonSalary,
  data = BaseballData
)
summary(MultipleRegressionModel)
lm.beta(MultipleRegressionModel)
# Fitted vs. actual values, and their correlation.
plot(MultipleRegressionModel$fitted.values, BaseballData$SeasonBattingAvg)
cor.test(MultipleRegressionModel$fitted.values, BaseballData$SeasonBattingAvg)

# Variance inflation factors.
# install.packages("car")
library(car)
VarianceInflation <- vif(MultipleRegressionModel)
VarianceInflation
# The square root of the variance inflation factor indicates how much larger
# the standard error is, compared with what it would be if that variable were
# uncorrelated with the other predictor variables in the model.
sqrt(VarianceInflation)
# Ridge, lasso and elastic net regression.
# The ten season-level predictors are named explicitly instead of being
# selected by position among the columns matching "^Season", which would
# silently pick different columns if the column order ever changed.
SeasonPredictorNames <- c(
  "SeasonAtBats", "SeasonHits", "SeasonHomeRuns", "SeasonRuns", "SeasonRBIs",
  "SeasonWalks", "SeasonPutouts", "SeasonAssists", "SeasonErrors", "SeasonSalary"
)

# Standardize the predictors and the outcome.
BaseballData[, SeasonPredictorNames] <- scale(BaseballData[, SeasonPredictorNames])
BaseballData$SeasonBattingAvg <- scale(BaseballData$SeasonBattingAvg)

# install.packages("glmnet")
library(glmnet)
# The standardized predictor matrix is built once and shared by every model,
# so all fits and predictions use exactly the same inputs.
SeasonPredictorMatrix <- as.matrix(BaseballData[, SeasonPredictorNames])

# Ridge regression (alpha = 0).
# Cross validation (cv.glmnet's default folds) chooses the best lambda, i.e.
# how much of a penalty to put on the coefficients.
RidgeCV <- cv.glmnet(SeasonPredictorMatrix, BaseballData$SeasonBattingAvg, alpha = 0)
plot(RidgeCV)
# Refit with the best lambda penalty.
RidgeModel <- glmnet(
  SeasonPredictorMatrix, BaseballData$SeasonBattingAvg,
  alpha = 0, lambda = RidgeCV$lambda.min
)
coef(RidgeModel)
RidgePredictions <- predict(RidgeModel, newx = SeasonPredictorMatrix)
plot(
  RidgePredictions, BaseballData$SeasonBattingAvg,
  xlab = "Predicted Season Batting Average",
  ylab = "Actual Season Batting Average"
)
cor.test(RidgePredictions, BaseballData$SeasonBattingAvg)
# No standard errors, so no confidence intervals and p values :(

# Lasso regression (alpha = 1).
LassoCV <- cv.glmnet(SeasonPredictorMatrix, BaseballData$SeasonBattingAvg, alpha = 1)
plot(LassoCV)
# Refit with the best lambda penalty.
LassoModel <- glmnet(
  SeasonPredictorMatrix, BaseballData$SeasonBattingAvg,
  alpha = 1, lambda = LassoCV$lambda.min
)
coef(LassoModel)
LassoPredictions <- predict(LassoModel, newx = SeasonPredictorMatrix)
plot(
  LassoPredictions, BaseballData$SeasonBattingAvg,
  xlab = "Predicted Season Batting Average",
  ylab = "Actual Season Batting Average"
)
cor.test(LassoPredictions, BaseballData$SeasonBattingAvg)
# No standard errors, so no confidence intervals and p values :(

# Elastic net regression (alpha = .5, a mix of the ridge and lasso penalties).
ElasticNetCV <- cv.glmnet(SeasonPredictorMatrix, BaseballData$SeasonBattingAvg, alpha = .5)
plot(ElasticNetCV)
# Refit with the best lambda penalty.
ElasticNetModel <- glmnet(
  SeasonPredictorMatrix, BaseballData$SeasonBattingAvg,
  alpha = .5, lambda = ElasticNetCV$lambda.min
)
coef(ElasticNetModel)
ElasticNetPredictions <- predict(ElasticNetModel, newx = SeasonPredictorMatrix)
plot(
  ElasticNetPredictions, BaseballData$SeasonBattingAvg,
  xlab = "Predicted Season Batting Average",
  ylab = "Actual Season Batting Average"
)
cor.test(ElasticNetPredictions, BaseballData$SeasonBattingAvg)
# No standard errors, so no confidence intervals and p values :(