library(tidyverse)
library(janitor)
library(dplyr)

############
## I. Read data
############

# Point the session at the course folder, then echo the resulting directory
setwd("Desktop/Einstein R Course/")
getwd()

# Load the athlete_events.csv file into events_data
events_data <- read_csv(file = "athlete_events.csv")

#################
## II. A quick check on the data
#################
# Print the data and get a column-by-column overview
events_data
glimpse(events_data)
# view() will pop out the data (may take some time)
# view(events_data)

# First rows, last 10 rows, and per-column summaries
head(events_data)
tail(events_data, n = 10)
summary(events_data)

# Number of records per value of Sex (column names are not cleaned yet)
events_data %>% count(Sex)

# Install skimr only when it is missing, so re-running the script does not
# reinstall the package (and does not need network access) every time.
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}

library(skimr)
# The direct call and the piped call are equivalent
skim(events_data)
events_data %>% skim()

## Column names before cleaning
names(events_data)

# Clean the variable names with janitor:
#   names become unique and consist only of letters, numbers, and _
#   by default, upper case is turned into lower case
events_data <- janitor::clean_names(events_data)

## Column names after cleaning
names(events_data)

## Write the cleaned data under a new file name (olympic.csv)
write_csv(events_data, file = "olympic.csv")


## Practice
# Read noc_regions.csv, clean variable names, and save it as 'olympic_noc_code.csv'

code_data <- read_csv("noc_regions.csv")
code_data
glimpse(code_data)
head(code_data)
code_data <- code_data %>% janitor::clean_names()
glimpse(code_data)
skim(code_data)
# Number of rows per NOC code
code_data %>% count(noc)
# Number of distinct NOC codes. n_distinct() works on vectors, so it has to be
# evaluated inside summarize() for `noc` to resolve to the column; piping the
# data frame straight into n_distinct(noc) fails because `noc` is not an object.
code_data %>% summarize(n_noc = n_distinct(noc))
write_csv(code_data, file = "olympic_noc_code.csv")


## Explore the categorical variables of the Olympics data.
## Which games are in the data, and how many records does each game have?
# group_by(): split the data by the given variable(s)
# n(): number of rows in the current group
events_data %>%
  group_by(games) %>%
  summarise(n_participated_athletes = n())

## count() does the same in one step: number of rows per group
events_data %>% count(games)
season_year_data <- events_data %>% count(season, year)
events_data %>% count(season, year)
# print(n = Inf) prints all the rows
events_data %>%
  count(season, year) %>%
  print(n = Inf)

view(season_year_data)

write_csv(season_year_data, file = "season_year_olympics.csv")

## short cut for %>% is 
## Cmd + shift + M (mac)
## Ctrl + shift + M (windows)

## Practice
## How many sports are there?
## Sort by frequency (most to least) with arrange()
events_data %>% count(sport)
events_data %>%
  group_by(sport) %>%
  summarise(count = n())
events_data %>%
  count(sport) %>%
  arrange(desc(n))

# Number of unique sports, computed two ways
events_data %>%
  count(sport) %>%
  summarise(n_sport = n())
events_data %>% summarise(n_distinct(sport))

## Number of records by sex
events_data %>% count(sex)


##################
##  III. Data wrangling I (data organization)
## We will practice using arrange, filter, mutate, group_by, and summarize
##################
## 1. Sort the data by year; by season and year; by season, year, and noc
events_data %>% arrange(year)
events_data %>% arrange(season, year)
events_data %>% arrange(season, year, noc)

## 2. Create the data only including winter Olympics
# The filter value now matches the task: the earlier version kept 'Summer' rows.
events_data %>% filter(season == "Winter")
events_data %>%
  filter(season == "Winter") %>%
  arrange(year)

## Practice A. Create the data of two sports of your choice, then arrange by year
# Example choice of sports -- replace with any two values that occur in `sport`
# (NOTE: confirm the exact spellings with events_data %>% count(sport)).
events_data %>%
  filter(sport %in% c("Swimming", "Gymnastics")) %>%
  arrange(year)


## 3. mutate.
## 3-1. Calculate BMI as weight / (height / 100)^2
events_data %>% mutate(bmi = weight / (height / 100)^2)

# Summary statistics of BMI; na.rm = TRUE drops missing values, and the last
# two columns count how many BMI values are missing / present.
events_data %>%
  mutate(bmi = weight / (height / 100)^2) %>%
  summarize(
    min_bmi = min(bmi, na.rm = TRUE),
    max_bmi = max(bmi, na.rm = TRUE), # was mislabeled `max_country`
    mean_bmi = mean(bmi, na.rm = TRUE),
    sd_bmi = sd(bmi, na.rm = TRUE),
    na_bmi = sum(is.na(bmi)),
    not_na_bmi = sum(!is.na(bmi))
  )


## 3-2. Combine two columns (season, year) to create one variable
events_data %>% mutate(z = paste(season, year, sep = "_"))

# unite() does the same; remove = FALSE keeps the input columns
events_data %>% unite("z", c(season, year), sep = "_", remove = FALSE)
events_data %>%
  unite("z", c(season, year), sep = "_", remove = FALSE) %>%
  head(n = 10) %>%
  view()
events_data2 <- events_data %>%
  unite("z", c(season, year), sep = "_", remove = FALSE)

# rename() takes new_name = old_name pairs only; the stray unnamed 'z'
# argument made the call fail, so it is dropped.
events_data2 <- events_data2 %>% rename(games2 = z)


## 4. Keep only certain variables
# Either list the columns to keep ...
events_data %>%
  select(id, sex, age, noc, season, year, sport, medal)
# ... or list the columns to drop
events_data %>%
  select(-c(games, name, team, event))

## 5. group_by() and summarize().
##  Number of records by game (two equivalent ways)
events_data %>%
  group_by(games) %>%
  summarise(n = n())
events_data %>% count(games)

## Number of records by game and sex
events_data %>%
  group_by(games, sex) %>%
  summarise(n = n())

## Number of unique athletes (distinct id) at each Olympic game
events_data %>%
  group_by(games) %>%
  summarise(n = n_distinct(id))


## Practice B.
# (a) Build a per-game table with the number of records and the number of
#     unique athletes,
# then (b) take the mean of both counts across games.

game_data <- events_data %>%
  group_by(games) %>%
  summarise(n_participation = n(), n_id = n_distinct(id))
game_data %>%
  summarise(
    mean_participation = mean(n_participation),
    mean_n_athlete = mean(n_id)
  )


## Practice C.
## We want the number of countries that attended each Winter Olympic game,
## summarized as min, max, median, and SD
## 1. filter by season
## 2. group_by()
## 3. get the number of countries that attended each game (n_distinct())
## 4. compute the summary statistics (summarize())
n_country_winter_data <- events_data %>%
  filter(season == "Winter") %>%
  group_by(year) %>%
  summarize(number_country = n_distinct(noc))

n_country_winter_data %>%
  summarize(
    min_country = min(number_country),
    max_country = max(number_country),
    # the value is a median, so the column is named median_country
    # (it was previously mislabeled mean_country)
    median_country = median(number_country),
    sd_country = sd(number_country)
  )

## all together
events_data %>%
  filter(season == "Winter") %>%
  group_by(year) %>%
  summarize(number_country = n_distinct(noc)) %>%
  summarize(
    min_country = min(number_country),
    max_country = max(number_country),
    median_country = median(number_country),
    sd_country = sd(number_country)
  )



######################
## IV. Data wrangling II.
## Medals!
######################
## Question: which country has the most medals at each game?
# step 1. Create a number-of-medals variable
# step 2. Sum the medals, grouped by game and country
# step 3. Find the country with the most medals at each game

## Which medal values occur, and how often?
events_data %>% count(medal)

# Step 1. Flag whether a record has a medal (logical, then 0/1 integer)
events_data %>%
  mutate(have_medal = !is.na(medal))
events_data %>%
  mutate(have_medal = as.integer(!is.na(medal)))

# Gold indicator: without `missing`, records with no medal (NA) stay NA;
# with missing = 0 they become 0. The second form is run twice, as before.
events_data %>%
  mutate(point_gold = if_else(medal == "Gold", 1, 0))
events_data %>%
  mutate(point_gold_incl_na = if_else(medal == "Gold", 1, 0, missing = 0))
events_data %>%
  mutate(point_gold_incl_na = if_else(medal == "Gold", 1, 0, missing = 0))

## Point system by medal type:
## 3 for gold, 2 for silver, 1 for bronze, 0 when there is no medal
events_data %>%
  mutate(
    point3 = case_when(
      medal == "Gold" ~ 3,
      medal == "Silver" ~ 2,
      medal == "Bronze" ~ 1,
      is.na(medal) ~ 0
    )
  )


## events_point_data = events_data plus both new variables
## (medal flag and medal points)
events_point_data <- events_data %>%
  mutate(
    have_medal = as.integer(!is.na(medal)),
    point3 = case_when(
      medal == "Gold" ~ 3,
      medal == "Silver" ~ 2,
      medal == "Bronze" ~ 1,
      is.na(medal) ~ 0
    )
  )


# Step 2. Group by games and noc, then sum the medal flag
events_data %>%
  mutate(have_medal = as.integer(!is.na(medal))) %>%
  group_by(games, noc) %>%
  summarise(number_medals_noc = sum(have_medal))

# Same pipeline, stored for the next step
n_medal_data <- events_data %>%
  mutate(have_medal = as.integer(!is.na(medal))) %>%
  group_by(games, noc) %>%
  summarise(number_medals_noc = sum(have_medal))

## Step 3. Which country has the most medals at each game?
# Applied to the summarised data as it comes out of step 2
n_medal_data %>% slice_max(number_medals_noc)

# Drop the grouping first, then group explicitly by games
n_medal_data_ungrouped <- ungroup(n_medal_data)
n_medal_data_ungrouped %>%
  group_by(games) %>%
  slice_max(number_medals_noc)

# Grouping by noc instead: the best game(s) of each country, then USA only
n_medal_data_ungrouped %>%
  group_by(noc) %>%
  slice_max(number_medals_noc)
n_medal_data_ungrouped %>%
  group_by(noc) %>%
  slice_max(number_medals_noc) %>%
  filter(noc == "USA")


## The .groups argument of summarize(), one option at a time
# "drop_last": drop the last grouping variable (still grouped by games)
events_data %>%
  mutate(have_medal = as.integer(!is.na(medal))) %>%
  group_by(games, noc) %>%
  summarise(number_medals_noc = sum(have_medal), .groups = "drop_last")

# "keep": keep the grouping structure
events_data %>%
  mutate(have_medal = as.integer(!is.na(medal))) %>%
  group_by(games, noc) %>%
  summarise(number_medals_noc = sum(have_medal), .groups = "keep")

# "drop": drop the grouping structure
events_data %>%
  mutate(have_medal = as.integer(!is.na(medal))) %>%
  group_by(games, noc) %>%
  summarise(number_medals_noc = sum(have_medal), .groups = "drop")

# "rowwise": each row in the summary becomes one group
events_data %>%
  mutate(have_medal = as.integer(!is.na(medal))) %>%
  group_by(games, noc) %>%
  summarise(number_medals_noc = sum(have_medal), .groups = "rowwise")


## Practice A.
## In terms of point system (3 for gold, ...), 
## which country has the most points at each game?
## in which year did each country have the most points?

        

## 2. You are interested in athletes' demographic data (age, sex, height, weight) over Olympic games.  
## For example, trend of age over years, female rate over years, or height over years. 
## Note that some athletes participated in multiple events. 
## So, you want to extract only one data point for each athlete at a game.
## In other words, remove duplicated data.

## Here, ID is unique for each athlete. 
## So, you want to have one record per game-ID.

## step 1. select variables you want to keep.
## step 2. group by games and id
## step 3. remove duplicated data

# Keep the first record of every (games, id) group.
# duplicated() checks a single vector; a second positional argument is taken
# as `incomparables`, not as another key, so duplicated(games, id) only worked
# by accident. Inside each group, row_number() == 1 states the intent directly.
events_data %>%
  select(id, name, games, season, year, height, weight, age, sex) %>%
  group_by(games, id) %>%
  filter(row_number() == 1)

# How duplicated() behaves on a plain vector: TRUE marks repeated values
x <- c(1, 2, 3, 1, 1, 3)
duplicated(x)

x <- c("melissa", "angel", "chenxin", "kith", "chenxin", "angel", "mimi")
duplicated(x)

## simpler solution
## step 2. use distinct() to remove rows with duplicate values
## .keep_all = TRUE keeps all other variables. By default, .keep_all = FALSE
events_data %>%
  select(id, name, games, season, year, height, weight, age, sex) %>%
  distinct(games, id)

athlete_data <- events_data %>%
  select(id, name, games, season, year, height, weight, age, sex) %>%
  distinct(games, id, .keep_all = TRUE)

## Q. Add the number of participated events and the number of medals at each game
## step 1. per (games, id): count the rows and the non-missing medals,
##         attaching both counts to every row of the group
tmp_data <- events_data %>%
  group_by(games, id) %>%
  mutate(
    number_event = n(),
    number_medal = sum(!is.na(medal))
  ) %>%
  ungroup()

# width = Inf prints every column
print(tmp_data, width = Inf)

## step 2. same de-duplication as before, now carrying the two new counts
athlete_data2 <- tmp_data %>%
  select(
    id, name, games, season, year, height, weight, age, sex,
    number_event, number_medal
  ) %>%
  distinct(games, id, .keep_all = TRUE)

# Spot checks: athletes with at least one medal, and all rows of id 20
athlete_data2 %>% filter(number_medal > 0)
tmp_data %>% filter(id == 20)

## save as CSV and as RDS, then read the RDS back
write_csv(athlete_data2, file = "athlete.csv")
saveRDS(athlete_data2, file = "athlete.rds")
readRDS("athlete.rds")

## save() can store multiple R objects; load() restores them
save(athlete_data2, file = "athlete.RData")
load("athlete.RData", verbose = TRUE)


################
## V. Coding challenge!
##  Olympic Trivia
##################
## Each athlete has an ID in the data.

## A. Who took part in the most events (a specific event, e.g. swimming 100m)
##    in history (across games)?
## Hint. 1. Work by ID, 2. arrange by the number of participated events
events_data %>%
  group_by(id) %>%
  mutate(number_event = n()) %>%
  ungroup() %>%
  arrange(desc(number_event))


## B. Who got the most medals? What's the athlete's name?
# By id only: the id(s) with the highest medal count
events_data %>%
  group_by(id) %>%
  summarise(all_time_medal = sum(!is.na(medal))) %>%
  slice_max(all_time_medal)

# By id and name, sorted from most to fewest medals
events_data %>%
  group_by(id, name) %>%
  summarise(all_time_medal = sum(!is.na(medal))) %>%
  arrange(desc(all_time_medal))

# Attach the counts to every row, sort, then keep each athlete's first row
events_data %>%
  group_by(id) %>%
  mutate(
    number_event = n(),
    all_time_medal = sum(!is.na(medal))
  ) %>%
  ungroup() %>%
  arrange(desc(all_time_medal)) %>%
  filter(!duplicated(id)) %>%
  print(width = Inf)


