### Applied Session 2
### Olympics Dataset
### Angel Garcia de la Garza


### Set Libraries

library(tidyverse)

### Load Datasets

# NOTE(review): setwd() points at a directory on the author's machine; the
# relative file names below are resolved against it.
setwd("~/Desktop/Einstein R Course/")

# Athlete event records, with column names normalised by clean_names().
events_data <- janitor::clean_names(read_csv("athlete_events.csv"))

# NOC lookup table; `region` is renamed to `country_name` so it shares a
# column name with the GDP table.
code_data <- rename(
  janitor::clean_names(read_csv("noc_regions.csv")),
  country_name = region
)

# GDP / population table; the columns used later in this script are
# country_name, year, gdp and population.
gdp_data <- janitor::clean_names(
  read_csv("countries_gdp_population_data.csv")
)

### The country names in the NOC and GDP datasets don't all match. Find the names that do not match.

# Distinct country names in the GDP data, tagged with a marker column.
# After the left join below, `missing` is NA for the NOC rows whose
# country_name has no counterpart in the GDP data.
unique.gdp <- gdp_data %>%
  distinct(country_name) %>%
  mutate(missing = FALSE)

# NOC entries whose country name is not found in the GDP data. The filter uses
# the marker column: the joined table has no `gdp` column to test.
missing <- code_data %>%
  left_join(unique.gdp, by = "country_name") %>%
  filter(is.na(missing))


###### Look at the names of the variables
###### Country names do not match 100%
###### Country names have changed; New countries have appeared. 
###### See Russia vs. USSR vs. Russian Federation
###### This is code to manually standardize these names

# Manually map country names in the GDP data onto the spellings needed to join
# on `country_name`. Names without an entry below are left unchanged.
# NOTE(review): "Boliva" is presumably the spelling used in noc_regions.csv;
# confirm before "correcting" it.
gdp_data <- gdp_data %>%
  mutate(
    country_name = case_when(
      country_name == "Russian Federation" ~ "Russia",
      country_name == "Bahamas, The" ~ "Bahamas",
      country_name == "Bolivia" ~ "Boliva",
      country_name == "Brunei Darussalam" ~ "Brunei",
      country_name == "Cote d'Ivoire" ~ "Ivory Coast",
      country_name == "Czechia" ~ "Czech Republic",
      country_name == "Cabo Verde" ~ "Cape Verde",
      country_name == "Congo, Dem. Rep." ~ "Democratic Republic of the Congo",
      country_name == "Egypt, Arab Rep." ~ "Egypt",
      country_name == "Gambia, The" ~ "Gambia",
      country_name == "Iran, Islamic Rep." ~ "Iran",
      country_name == "United Kingdom" ~ "UK",
      country_name == "Kyrgyz Republic" ~ "Kyrgyzstan",
      country_name == "Lao PDR" ~ "Laos",
      # Was "numberrth Macedonia", a garbled literal that could never match.
      country_name == "North Macedonia" ~ "Macedonia",
      country_name == "Micronesia, Fed. Sts." ~ "Micronesia",
      country_name == "Congo, Rep." ~ "Republic of Congo",
      country_name == "Slovak Republic" ~ "Slovakia",
      country_name == "Korea, Rep." ~ "South Korea",
      country_name == "Eswatini" ~ "Swaziland",
      country_name == "Syrian Arab Republic" ~ "Syria",
      country_name == "Trinidad and Tobago" ~ "Trinidad",
      country_name == "Turkiye" ~ "Turkey",
      country_name == "United States" ~ "USA",
      country_name == "Venezuela, RB" ~ "Venezuela",
      country_name == "Viet Nam" ~ "Vietnam",
      country_name == "Virgin Islands (U.S.)" ~ "Virgin Islands US",
      country_name == "Yemen, Rep." ~ "Yemen",
      country_name == "St. Lucia" ~ "Saint Lucia",
      country_name == "St. Kitts and Nevis" ~ "Saint Kitts",
      country_name == "St. Vincent and the Grenadines" ~ "Saint Vincent",
      # Fallback: keep every other name as it is.
      TRUE ~ country_name
    )
  )


### Is GDP Related to Number of Gold Medals Using 2000 as an Example? 

# Count gold-medal rows per country for the 2000 Games, attach that year's GDP
# figures, and rank countries on both measures (rank 1 = largest value, ties
# share the average rank).
# NOTE(review): this counts rows of events_data; if the data has one row per
# athlete, a team-event gold is counted once per team member -- confirm that
# this is intended.
medals_summary <- events_data %>%
  left_join(code_data, by = "noc") %>%
  filter(year == 2000) %>%
  group_by(country_name, year) %>%
  summarize(number_gold = sum(medal == "Gold", na.rm = TRUE),
            .groups = "drop") %>%
  left_join(gdp_data, by = c("country_name", "year")) %>%
  # rank() keeps NA by default, so countries without a GDP match get NA here.
  mutate(rank_medals = rank(-number_gold),
         rank_gdp = rank(-gdp)) %>%
  select(country_name, rank_medals, rank_gdp) %>%
  arrange(rank_medals)

head(medals_summary)


### Is a country GDP proportional to the number of Gold Medals? 


# GDP per gold medal for the 2000 Games. Countries with zero golds are dropped
# before dividing, so ratio_gdp is never a division by zero.
medals_summary <- events_data %>%
  filter(year == 2000) %>%
  left_join(code_data, by = "noc") %>%
  group_by(country_name, year) %>%
  summarise(number_gold = sum(medal == "Gold", na.rm = TRUE)) %>%
  ungroup() %>%
  filter(number_gold > 0) %>%
  left_join(gdp_data, by = c("country_name", "year")) %>%
  mutate(ratio_gdp = gdp / number_gold)

###### Compare the returns in medals for each dollar of GDP. 
###### Cameroon's return is ~1000 times that of Mexico or Japan



### Try this yourself: calculate the ranks using all medals, weighted by a point system (gold = 3, silver = 2, bronze = 1)

# Rank countries for the 2000 Games by a weighted medal score
# (3 points per gold, 2 per silver, 1 per bronze) and by GDP.
medals_summary <- events_data %>%
  filter(year == 2000) %>%
  left_join(code_data, by = "noc") %>%
  group_by(country_name, year) %>%
  summarise(
    number_gold = sum(medal == "Gold", na.rm = TRUE),
    number_silver = sum(medal == "Silver", na.rm = TRUE),
    number_bronze = sum(medal == "Bronze", na.rm = TRUE)
  ) %>%
  ungroup() %>%
  # Weighted score, computed once the three tallies exist.
  mutate(
    number_medals = number_gold * 3 + number_silver * 2 + number_bronze
  ) %>%
  left_join(gdp_data, by = c("country_name", "year")) %>%
  mutate(
    rank_medals = rank(-number_medals),
    rank_gdp = rank(-gdp)
  ) %>%
  select(country_name, rank_medals, rank_gdp)

### What's the Average over all years of the ranks?

# For every year, rank countries by total (unweighted) medal count and by GDP,
# then average each country's ranks across all the years it appears in.
medals_summary <- events_data %>%
  left_join(code_data, by = "noc") %>%
  group_by(country_name, year) %>%
  summarize(number_gold = sum(medal == "Gold", na.rm = TRUE),
            number_silver = sum(medal == "Silver", na.rm = TRUE),
            number_bronze = sum(medal == "Bronze", na.rm = TRUE),
            number_medals = number_gold + number_silver + number_bronze,
            .groups = "drop") %>%
  left_join(gdp_data, by = c("country_name", "year")) %>%
  # Ranks are computed within each year.
  group_by(year) %>%
  mutate(rank_medals = rank(-number_medals),
         rank_gdp = rank(-gdp)) %>%
  group_by(country_name) %>%
  # rank() leaves NA where a country-year has no GDP match; without
  # na.rm = TRUE a single such year would make the country's mean GDP rank NA.
  # Countries with no GDP match in any year get NaN.
  summarize(mrank_medals = mean(rank_medals),
            mrank_gdp = mean(rank_gdp, na.rm = TRUE))


### What's the relationship between number of medals and GDP per capita using 2000 as an example?

# Rank countries for the 2000 Games by gold-medal count and by GDP per capita
# (gdp divided by population).
medals_summary <- events_data %>%
  filter(year == 2000) %>%
  left_join(code_data, by = "noc") %>%
  group_by(country_name, year) %>%
  summarise(number_gold = sum(medal == "Gold", na.rm = TRUE)) %>%
  ungroup() %>%
  left_join(gdp_data, by = c("country_name", "year")) %>%
  mutate(gdp_pc = gdp / population) %>%
  mutate(
    rank_medals = rank(-number_gold),
    rank_gdp_pc = rank(-gdp_pc)
  ) %>%
  select(country_name, rank_medals, rank_gdp_pc)


### What's the relationship between number of medals and population using 2000 as an example?

# Rank countries for the 2000 Games by gold-medal count and by population.
medals_summary <- events_data %>%
  filter(year == 2000) %>%
  left_join(code_data, by = "noc") %>%
  group_by(country_name, year) %>%
  summarise(number_gold = sum(medal == "Gold", na.rm = TRUE)) %>%
  ungroup() %>%
  left_join(gdp_data, by = c("country_name", "year")) %>%
  mutate(
    rank_medals = rank(-number_gold),
    rank_population = rank(-population)
  ) %>%
  select(country_name, rank_medals, rank_population)



