#############################################################
## Introduction to R                                       ##
## GradQuant workshop supplementary code                   ##
## Summer 2018                                             ##
#############################################################

# Welcome to R! 

## Installing and Using R

#Recommended software (you need R installed to use RStudio)
# R (https://www.r-project.org/)
# RStudio (https://www.rstudio.com/)
# RMarkdown: install.packages("rmarkdown")

## Mathematical Operations
# Basic arithmetic: R can be used as a calculator.
2 + 2
2^3

# The usual order of operations applies.
3 + 4 * (4^2)

# factorial() is our first function call.
factorial(6)

# Open the help page for the function "factorial".
?factorial

# Comparisons evaluate to TRUE or FALSE.
2 == 3
2 < 3
2^2 == 4

# Equality is tested with "==", not "=".

# Floating-point rounding error: this comparison is not TRUE.
sqrt(2)^2 == 2

# all.equal() tests "near equality" (equality up to a small tolerance).
all.equal(sqrt(2)^2, 2)

## Data Storage and Manipulation

# Assign the result of a calculation to the name x.
x <- 2 * 3

# Display the stored value explicitly...
print(x)
# ...or simply by typing its name.
x

# c() combines values into a vector.
y <- c(2, 4, 7)
y

# Square brackets pull out individual elements.
y[1]
y[3]
# (R numbers its vectors and matrices starting at 1.)

# Arithmetic is applied to every element of a vector.
y + 2
x * y

# Multiplying two vectors with `*` is element-wise, not matrix algebra.
# Here y (length 3) is shorter than z (length 6), so y is recycled to match.
z <- c(0.5, 2, 1, 3, 2.5, 6)
y
y * z
# This is not a dot- or a cross-product! Those are separate functions in R.

# List every object we have saved so far.
ls()

# Assigning to an existing name overwrites the old value.
x <- 10
x <- 20
x

# Build a sequence of consecutive integers...
x <- 1:10
x
# ...or use seq() to spell out the start, end and step.
x <- seq(from = 1, to = 10, by = 1)
x

# Different boundaries and a fractional increment.
x <- seq(from = 1, to = 12, by = 0.5)
x

# rep() repeats a single value a given number of times...
x <- rep(1, times = 10)
x

# ...or repeats an entire vector.
x <- rep(c(1, 4), times = 10)
x

# Combining all of this: two length-100 vectors added element by element.
rep(1:10, times = 10) + seq(from = 1, to = 50.5, by = 0.5)

# Delete a single named object from the working environment.
rm(x)

# Delete everything that ls() reports, i.e. clear the working environment.
rm(list = ls())

## Matrices

# matrix() fills column by column by default: a 2 x 3 matrix of 1 to 6.
mymatrix <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 2, ncol = 3)
mymatrix

# Dimensions are reported as (rows, columns).
dim(mymatrix)
# Transpose: rows become columns.
t(mymatrix)

# Indexing is [row, column]: numbers before the comma are rows, numbers
# after the comma are columns, and a blank side selects everything.
mymatrix[2, 3]  # a single element
mymatrix[2, ]   # the whole second row
mymatrix[, 3]   # the whole third column

# solve() returns the inverse of a square matrix.
mymatrix2 <- matrix(c(1, 2, 3, 4, 5, 6, 0, 1, 0), nrow = 3, ncol = 3)
solve(mymatrix2)

# Totals down each column, then across each row.
colSums(mymatrix)
rowSums(mymatrix)

# Arithmetic with a single number is applied to every element.
x <- 2
mymatrix + x
mymatrix * x

# `*` is element-wise; matrix multiplication uses `%*%` instead.
mymatrix %*% mymatrix2

## Data

# The working directory is where R looks for files to read, and where it
# writes files, whenever a file name is given without a full path.
# (Installed packages are not stored here; .libPaths() shows that location.)
getwd()
setwd("C:\\Users\\GradQuant\\Desktop") #you'll need to change this filepath!
getwd()


# Import different types of data.
# Without a full filepath, R will look in the working directory.
data <- read.table("hmnrghts.txt", header = TRUE)
data <- read.csv("hmnrghts.csv", header = TRUE)

#In RStudio, we can do this using "Import Dataset" in the "Environment" tab. 

# R also has a huge number of built-in datasets; data() loads one by name.
data(mtcars)

# Find out about the mtcars dataset.
?mtcars

# Copy it to the name "data" so that it's easier to work with.
data <- mtcars

# View the first few rows...
head(data)
# ...and the last few rows.
tail(data)

# Say we wanted to view just one column. We might be tempted to type the
# column name on its own, but at this point R has no object called `mpg`,
# so that is an error. Wrapping the call in try() prints the error message
# without stopping the script, so the file can still be run top to bottom.
try(mpg)

# We need to be specific about where our column comes from.
mtcars$mpg

# The `$` operator tells R that we want to extract the named element `mpg`
# out of `mtcars`.

# We can also "attach" our dataset.
# Now we can refer to items in our dataset directly.
attach(data)
mpg

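#Note: more than one dataset can be attached at once, but columns that share a name mask one another,
#and an attached copy does not change if you later modify the original data frame.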

#can also detach it when it's no longer needed
#detach(data) #this line is commented out because we still want our data attached!

# Pick columns by position...
newdata <- data[, c(1, 2, 3)]
# ...or, equivalently, with a range.
newdata <- data[, 1:3]
head(newdata)

# The positions do not have to be adjacent to each other.
newdata <- data[, c(1, 3:6)]
head(newdata)

# Negative positions drop columns instead of keeping them.
droppeddata <- data[, -c(3, 5)]
head(droppeddata)

# Now drop columns three through five, instead of three and five.
newdata <- data[, -(3:5)]

# Rows are selected the same way, but before the comma.
newdata <- data[1:5, ]

# Keep only the cars with mpg greater than 30.
subset(data, mpg > 30)

##Exercise 

#Select all the cars with an mpg greater than 20.0 and engine displacement over 200, and name it `exercisedata`. 
#Then output the `mpg`, `disp`, and `am` columns.




#Exercise solution is in workshop slides PDF

## Using R Packages
# Install the package only if it is missing, so that re-running the script
# does not download and reinstall it every time.
if (!requireNamespace("matrixStats", quietly = TRUE)) {
  install.packages("matrixStats")
}
# Load the installed package for this session.
library(matrixStats)

# With a package loaded, we can use our help function to examine its documentation
?matrixStats

## Statistics in R

# Mean.
mean(data$mpg)
# Standard deviation.
sd(data$mpg)

# Summary of the data.
summary(data$mpg)

# The next summary uses the psych package, which must be installed (once)
# before any of its functions can be called.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
# Another summary.
psych::describe(data$mpg)

# Notice how we were able to use a function from a package without loading
# it with library(): the `psych::` prefix names the installed package that
# `describe` comes from.

## Statistics in R: A Simple t-Test

# The V/S variable tells us whether the car is a V-engine or a straight engine.
# We will compare mpg based on the V/S variable.

# Make sure that `vs` is a factor.
is.factor(data$vs)

# It's not, so we need to convert it.
data$vs <- as.factor(data$vs)

# Try it again.
is.factor(data$vs)

# Examine the data broken down by group.
# If `data` was attached before `vs` was converted, a bare `vs` would still
# be the old unconverted column, so refer to `data$vs` (and `data$mpg`)
# explicitly to use the factor we just created.
psych::describeBy(data$mpg, group = data$vs)

# t-test
t.test(mpg ~ vs, data = data)

## Statistics in R: Linear Models
# The same comparison expressed as a linear model of mpg on vs.
model <- lm(formula = mpg ~ vs, data = data)
summary(model)

## Statistics in R: ANOVA
# We can run an ANOVA for more than two groups.
# We have three cylinder groups; they aren't saved as factors, but as a
# shortcut we can wrap `cyl` in as.factor() inside the formula.
# (The result is named `cyl_anova` rather than `anova` so that it does not
# mask R's built-in anova() function.)
cyl_anova <- aov(mpg ~ as.factor(cyl), data = data)
summary(cyl_anova)

# We can also extract an ANOVA from a fitted model:
model <- lm(mpg ~ as.factor(cyl), data = data)
cyl_anova <- aov(model)
summary(cyl_anova)

# These two ANOVA tables are exactly the same! So why might we want the model?
# Let's take a look at all the elements in our model.
names(model)

# Then we can check the residuals.
model$residuals

## Statistics in R: Linear Models
# Normal Q-Q plot of the model residuals, to check them for normality.
qqnorm(residuals(model))
# Add the reference line to the plot just drawn.
qqline(residuals(model))

## Statistics in R: Correlations
# Correlation between mpg and engine displacement.
cor(x = mpg, y = disp)
# cor() returns only the coefficient; cor.test() gives fuller output.
cor.test(x = mpg, y = disp)

## Plots
# A bare-bones scatterplot of mpg against displacement.
plot(mpg ~ disp)
# The same plot with better axis labels...
plot(mpg ~ disp, ylab = "Miles per gallon", xlab = "Engine Displacement")
# ...and with a main title as well.
plot(
  mpg ~ disp,
  main = "Sample Scatterplot",
  xlab = "Engine Displacement",
  ylab = "Miles per gallon"
)
# Draw the best-fit regression line on top of the last plot.
abline(lm(mpg ~ disp))

##Exercise 

#Find the correlation between weight (`wt`) and quarter mile time (`qsec`).
#Then run a linear model and create a scatterplot.




#Solution can be found in workshop slides PDF

## Statistics in R: Categorical Variables
# Create a contingency table of `vs` by `am`.
# The columns are taken from `data` explicitly rather than from attached
# copies, and the result is named `vs_am_table` so that it does not mask
# R's built-in table() function.
vs_am_table <- xtabs(~ vs + am, data = data)
vs_am_table
# Chi-squared test on the table.
chisq.test(vs_am_table)
  
## Functions
# Anatomy of a function: the help page lists the arguments it accepts.
?seq
# Options associated with a particular function are called "arguments".
# Some arguments have defaults that we can override.
# Others don't have a default and we may need to specify some input.

# If we have the right order, we don't need to name the arguments.
lm(mpg ~ vs, data)

# If we get the order wrong, it returns an error (or does the wrong thing!).
# try() prints the error message without stopping the script, so the
# remaining lines still run when the whole file is executed.
try(lm(data, mpg ~ vs))

# We can use any order we like if we name the arguments.
lm(data = data, formula = mpg ~ vs)

#Thanks for reading along!

#Want more R? We'll be having more workshops during the school year. 
