#Welcome to Data manipulation in R!
#There are many different ways to manipulate data, and there are far more real world issues
#that you will encounter when working with your own. This workshop is less about how to solve particular
#data management issues and more about giving you the tools to deal with any issues that might arise.
#This also means that we will use a lot of different datasets, because many functions and operations are meant to be used
#with only certain shapes, sizes, and structures of data.

# First, a quick tour of the data types in R.
# A scalar is a single data point.
# For example...

x <- 5 * 5
x # x is a scalar
# A vector is just a collection of scalars
y <- 5 * (1:4)
y # y is a vector
z <- cbind(1:10, 10:1) # cbind() efficiently combines vectors as columns
z # rbind() does the same thing for rows
# A matrix is a vector that has been given two dimensions.
# Consider the following code...
y <- matrix(c(5, 10, 15, 20), nrow = 2) # the input starts as a plain vector, then we add dimensions
# An array is like a matrix, but with more than two dimensions; there is no easy way to build one by hand
?array() # for more help on arrays
# A data frame generalizes a matrix: its columns may hold different data types
iris
str(iris) # information about the structure of a data frame (note the different column types)
# We can ask R what kind of object we are holding...
is.vector(iris) # iris is NOT a vector
class(iris) # iris IS a data frame
class(iris$Species) # the Species variable is a factor
levels(iris$Species) # we see three different species
# There are also lists. A list can contain any combination of all of the above
thelist <- list(x, y, z)
str(thelist)
# Lists don't care about the dimensions or types of their contents. They are the most general type of object.
# To index specific locations within a data frame, we use square brackets
iris[5, ] # the fifth row
iris[, 5] # the fifth column
iris[5, 5] # the fifth row of the fifth column
iris[5:10, ] # several rows at once
# To subset on specific values
iris[iris$Species == "setosa", ] # the condition goes before the comma because we are selecting rows
iris[iris$Sepal.Length >= 5.0, ] # rows with a sepal length greater than or equal to 5.0
library(dplyr) #dplyr is one of the most helpful data manipulation packages
# Let's start with some basic dplyr functions.
# We deliberately do NOT attach(iris): attaching puts the columns on the search path, where they
# can silently mask (or be masked by) other objects. dplyr verbs already look column names up
# inside the data frame they are given, and outside of dplyr we can write iris$column.
selected <- select(iris, starts_with("Sepal"))
selected # We have selected those columns that begin with "Sepal"
cummean(iris$Sepal.Length) # The cumulative mean for sepal length
shorter <- select(iris, Species) # select just the Species column
selected1 <- selected # the same selection as above, kept under this name as well
selected1
selected2 <- select(iris, ends_with("Width"))
selected2 # We have selected those columns that END with "Width"
renamed <- rename(iris, Whaaaaaaa = Species) # rename the Species column (new name = old name)
filtered <- filter(iris, Species == "virginica" | Sepal.Length > 5.0)
filtered # Rows that are virginica OR have a sepal length greater than 5
filtered <- filter(iris, Species == "virginica" & Sepal.Length > 5.0)
filtered # Now only those observations that are virginica AND have sepals longer than 5
sorted <- arrange(iris, Sepal.Length) # sort the rows by sepal length, smallest first
sorted

# Now let's get more complicated
iris %>%
  group_by(Species) %>%
  summarize(mean = mean(Sepal.Length))
# There are several new things here. The first is the '%>%' function.
# This is the pipe. We can read it as saying "then":
# we have data, THEN we group it by the different species,
# THEN we summarize those groups, here by taking the mean sepal length.
iris %>%
  transmute(centered_sepal_length = Sepal.Length - mean(Sepal.Length)) %>%
  summarize(centered_mean = mean(centered_sepal_length))
# We create a new variable holding the centered sepal length, then check that
# we did it correctly by confirming its mean is tiny tiny tiny
half <- sample_n(iris, size = 75, replace = FALSE)
half # a random sample of 75 rows, drawn without replacement
half <- sample_frac(iris, size = 0.5, replace = TRUE)
row.names(half)
# mutate() is also good for creating new variables, and the new variables
# can be used right away to create even newer ones
iris %>%
  as.data.frame() %>%
  mutate(
    total_length = Sepal.Length + Petal.Length,
    total_width = Sepal.Width + Petal.Width,
    length_to_width = total_length / total_width
  ) %>%
  filter(length_to_width >= 2.0) %>%
  print() %>%
  summarise(mean = mean(length_to_width))

# Let's work through an example using all of the skills we've learned so far.
# Only install Lahman when it is missing, so that re-running this script does not
# download and reinstall the package every time.
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}
batting <- Lahman::Batting
batting
# The first thing we want to do is create a slugging percentage variable. Slugging is just a weighted
# average of hits per at-bat: a single counts 1, a double 2, a triple 3, a home run 4.
# We don't have a number for singles, only for hits, so we have to do some subtracting.
# Rows with zero at-bats divide by zero here, so their slugging is not a usable number.
batting <- batting %>%
  mutate(slugging = (1 * (H - X2B - X3B - HR) + 2 * X2B + 3 * X3B + 4 * HR) / AB)
# Okay, now we want to group by team, and then sort
slug <- batting %>%
  na.omit() %>% # drops every row that has a missing value in ANY column
  group_by(teamID) %>%
  summarize(mean = mean(slugging)) %>%
  arrange(desc(mean))
slug
# So what did we do here? First, we took our new batting data, with slugging, then we got rid of missing cases,
# then we grouped players by their team, then took the mean of each team's slugging percentage, then arranged
# them in descending order
# Career slugging per player: total bases divided by total at-bats.
# Both sums skip missing values so that `slug` and `ab` are computed from the same rows.
playerslugging <- batting %>%
  group_by(playerID) %>%
  summarize(
    slug = sum(1 * (H - X2B - X3B - HR) + 2 * X2B + 3 * X3B + 4 * HR, na.rm = TRUE) /
      sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  ) %>%
  arrange(desc(slug))
playerslugging
# What's the problem here? Well, the players with the highest slugging are ones that only had one or two career
# at-bats. We can see that if we graph it: something odd is happening
# Only install ggplot2 when it is missing, so re-running the script does not reinstall it.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# No attach() is needed: ggplot() looks up `ab` and `slug` in the data frame it is given.
ggplot(data = playerslugging, aes(x = ab, y = slug)) + geom_point() + geom_smooth()
# Let's examine only those players with fewer than 50 career at-bats
playerslugging %>%
  filter(ab < 50) %>%
  ggplot(aes(x = ab, y = slug)) + geom_point() + geom_smooth()
# ...and then those with more than 50
playerslugging %>%
  filter(ab > 50) %>%
  ggplot(aes(x = ab, y = slug)) + geom_point() + geom_smooth()
# So there appear to be some weird things going on when a player has really few at-bats. Therefore,
# let's rearrange our data to find the highest slugging percentage among players with at least 100 career at-bats.
# Both sums skip missing values so that `slug` and `ab` are computed from the same rows.
playerslugging <- batting %>%
  group_by(playerID) %>%
  summarize(
    slug = sum(1 * (H - X2B - X3B - HR) + 2 * X2B + 3 * X3B + 4 * HR, na.rm = TRUE) /
      sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  ) %>%
  filter(ab >= 100) %>%
  arrange(desc(slug))
playerslugging
# Sure enough, now Babe Ruth is tops on our list. Just as he should be: http://www.baseball-almanac.com/hitting/hislug1.shtml
# The last thing we want to do is examine how slugging has changed by year; this process should be familiar now
yearslug <- batting %>%
  filter(AB > 0) %>% # keep only rows with at least one at-bat, so slugging is well defined
  group_by(yearID) %>%
  summarise(mean = mean(slugging)) %>%
  arrange(desc(mean))
yearslug
# So we see here that 1894 was the best year for slugging (back before pitchers could throw)
# A negative n asks for the lowest values. We name the ranking column explicitly instead of
# letting top_n() fall back to whichever column happens to be last.
yearslug %>% top_n(-10, mean)
# And 1917 was the worst. This was during the "dead ball era", just a couple years before Babe Ruth became an established hitter.
yearslug %>%
  ggplot(aes(x = yearID, y = mean)) + geom_point() + geom_smooth(span = .4)
# Don't worry about the graphing thus far, you can learn more at the upcoming workshop!

###### The apply family #######
# NOTE(review): `nasa` ships with older dplyr releases; newer releases appear to have moved it
# to the cubelyr package -- confirm against the installed dplyr version.
nasa <- as.data.frame(dplyr::nasa) # NASA dataset
if (interactive()) {
  View(nasa) # the spreadsheet viewer is only useful in an interactive session
}
# There are a few arguments to pay attention to. 1. We have to specify the dataset
# 2. We need to apply across a dimension (2 = column, 1 = row) 3. We need a function to apply
apply(nasa, 2, FUN = mean) # What happened with cloudlow?
sum(is.na(nasa$cloudlow)) # mean() returns NA as soon as its input contains a missing value
apply(nasa, 2, FUN = mean, na.rm = TRUE) # extra arguments are passed on to the applied function
apply(nasa, 1, FUN = mean) # Here we are applying the mean to every row
apply(nasa[7:9], 2, mean) # Now just applying to the selected columns
apply(nasa, 2, median) # Any function can go here
# apply() first converts the data frame to a matrix, so every column would report the same
# type. sapply() looks at each column as it is, which is what we want when asking for classes.
sapply(nasa, class)

#While this is okay, there isn't much new functionality here yet. The real power of the apply family
#comes with user-written functions. This is a bit beyond what we are going into here, but here's a sample...
#' Coefficient of variation
#'
#' Computes the standard deviation of `x` divided by its mean.
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be removed before computing? Defaults to
#'   `FALSE`, in which case any missing value in `x` makes the result `NA`.
#' @return A single numeric value: `sd(x) / mean(x)`.
coefficientofvar <- function(x, na.rm = FALSE) {
  sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)
}
y <- as.numeric(1:9)
coefficientofvar(y)
sd(y) / mean(y) # the same calculation by hand, to make sure the function works right
apply(nasa, 2, coefficientofvar) # column by column
apply(nasa, 1, coefficientofvar) # row by row
# There was no standard way to do that before
nasa <- na.omit(nasa) # Get rid of those pesky NAs
apply(nasa, 2, coefficientofvar) # nasa is already free of missing values at this point
# apply() is the "godfather" of the apply family. Several variants and extensions of this basic
# function have been added. Most of them differ in the form of their input and output.
sapply(X = nasa, FUN = coefficientofvar) # sapply is simpler, in that we don't need a dimension
listofnasa <- lapply(X = nasa, FUN = coefficientofvar) # lapply returns a list
str(listofnasa)
?mapply # This is a multivariate extension of the sapply function
sums <- mapply(FUN = sum, nasa[["ozone"]], nasa[["pressure"]])
sums # We've created a vector holding the element-by-element sum of the two columns
?tapply # this is used for ragged arrays (I personally have never needed to use it)
?vapply # A stricter kind of sapply, where we declare what each result looks like. Sometimes that's what we want, sometimes not
# The aggregate function is similar to the apply family...
?aggregate
aggregate(nasa[["ozone"]], by = list(nasa[["year"]]), FUN = mean)
# Here we have applied our function 'mean' to the variable ozone, across levels of year
# Don't worry about tibbles right now; for our purposes today they behave like data frames
x <- tribble(
  ~key, ~val_x, # column names go on the first row, which keeps small tables easy to type
  1, "x1",
  2, "x2",
  3, "x3"
)
x
y <- tribble(
  ~key, ~val_y,
  1, "y1",
  2, "y2",
  4, "y4"
)
y
inner_join(x, y, by = "key") # The inner join keeps the key values found in both tables
x <- tribble(
  ~key, ~val_x,
  1, "x1",
  2, "x2",
  2, "x3",
  8, "x4"
)
left_join(x, y, by = "key") # The left join is based on the key values found in x
right_join(x, y, by = "key") # The right join is based on the key values found in y
anti_join(x, y, by = "key") # The anti join keeps the values found in x, but not in y
anti_join(y, x, by = "key") # Now those found in y, but not in x
full_join(x, y, by = "key") # The full join keeps key values from either dataset, so both val_x and val_y can have NAs
# We'll get our datasets from nycflights13. Only install it when it is missing,
# so re-running the script does not reinstall the package every time.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13) # Several different datasets, with dramatically different dimensions
airports
airlines
planes
flights
# Join each flight to the aircraft that flew it, matching on tail number,
# then drop every row that has a missing value in any column.
flights_and_planes <- flights %>%
  left_join(planes, by = "tailnum") %>%
  na.omit()
cor.test(flights_and_planes$distance, flights_and_planes$engines) # Couldn't answer this question before the join
# Flights whose tail number does not appear in the planes data
flights_anti_planes <- flights %>%
  anti_join(planes, by = "tailnum")
flights_anti_planes
flights_anti_planes$tailnum # Take a look at the tail numbers left
planes[planes$tailnum == "N628MQ", ] # an empty result confirms that tail number N628MQ is not in planes, all is good
# Flights whose carrier does not appear in the airlines data
flights_anti_airlines <- flights %>%
  anti_join(airlines, by = "carrier")
flights_anti_airlines # an empty result means every flight's carrier is listed in our airlines dataset
