library(sqldf)
library(ggplot2)
library(dplyr)
library(tidyr)
library(magrittr)
library(rvest)
library(stringr)
library(ggrepel)
library(plotly)
library(DT)
library(ggalt)
#import data
athevent <- read.csv("data_visualization_git/assignment-1---winter-olympics-szaidman22/data/athletes_and_events.csv")
gdp <- read.csv("data_visualization_git/assignment-1---winter-olympics-szaidman22/data/gdp_pop.csv")
noc <- read.csv("data_visualization_git/assignment-1---winter-olympics-szaidman22/data/noc_regions.csv")
gdp <- gdp %>%
rename(NOC = Code)
base <- merge(x=athevent,y=gdp ,by="NOC", all.x=TRUE)
base <- merge(x=base,y=noc ,by="NOC", all.x=TRUE)
#for regions that non longer exist but have team names that match notes for existing countries, changed NOC to region that was in the notes column
#for other regions that had defunct NOC, used common sense to change to current NOC (ie east and west Germany, USSR)
sqldf(
'
with a as(
select distinct base.NOC, Team, noc.region
from base
join noc on noc.notes = base.Team
where base.Country is null)
,
b as (select * from noc
where notes = "")
select *
from a join b on a.region = b.region
'
)
base <- sqldf(
'select *,
case
when "NOC" in ("FRG","GDR") then "GER"
when "NOC" in ("BOH","TCH") then "CZE"
when "NOC" = "ANZ" then "AUS"
when "NOC" = "NFL" then "CAN"
when "NOC" = "CRT" then "GRE"
when "NOC" = "NBO" then "MAS"
when "NOC" = "YUG" then "SRB"
when "NOC" in ("YAR","YMD") then "YEM"
when "NOC" = "VNM" then "VIE"
else "NOC"
end as "NOC_edit"
from base'
)
base <- merge(x=base,y=gdp ,by.x="NOC_edit", by.y="NOC", all.x=TRUE)
base <- base %>%
rename(Country_edit = Country.y,
Population_edit = Population.y,
GDPpc_edit = GDP.per.Capita.y,
Country = Country.x,
Population = Population.x,
GDPpc = GDP.per.Capita.x
)
'%!in%' <- function(x,y)!('%in%'(x,y))
#distinct years they've competed with medal count by event (no duplicates)
f <- base %>%
filter(Season == "Winter") %>%
group_by(NOC_edit,NOC,region,Year, Games, Sport, Event, Medal, Population_edit, GDPpc_edit, Sex) %>%
mutate(region = case_when(NOC %in% c("EUN","URS") ~ "Soviet Union", NOC %!in% c("EUN","URS") ~ region)) %>%
summarize(medal_count = n_distinct(Medal)) %>%
mutate(medal_count = replace(medal_count, is.na(Medal) == TRUE, 0)) %>%
mutate(bronze_flag = case_when(Medal== "Bronze" ~ 1,Medal!="Bronze"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(silver_flag = case_when(Medal== "Silver" ~ 1,Medal!="Silver"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(gold_flag = case_when(Medal== "Gold" ~ 1,Medal!="Gold"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(bronze_flag = case_when(NOC %in% c("FRG","GDR") ~ bronze_flag*.5, NOC %!in% c("FRG","GDR") ~ bronze_flag)) %>%
mutate(medal_count = case_when(NOC %in% c("FRG","GDR") ~ medal_count*.5, NOC %!in% c("FRG","GDR") ~ medal_count)) %>%
mutate(silver_flag = case_when(NOC %in% c("FRG","GDR") ~ silver_flag*.5, NOC %!in% c("FRG","GDR") ~ silver_flag)) %>%
mutate(gold_flag = case_when(NOC %in% c("FRG","GDR") ~ gold_flag*.5, NOC %!in% c("FRG","GDR") ~ gold_flag)) %>%
group_by(NOC_edit,region,Year, Games, Population_edit, GDPpc_edit, Sex) %>%
summarize(total_medals = sum(medal_count), total_bronze = sum(bronze_flag),total_silver = sum(silver_flag),total_gold = sum(gold_flag))
Graph 1a displays medals earned over time by the top 15 medal-earning countries, divided by sex. Medal totals for East Germany and West Germany have been divided by 2 and added to Germany’s total. This method was chosen so as not to completely erase medals earned by these regions, and to mitigate over-counting for Germany. The USSR is counted as a separate entity from Russia, as it comprised many regions in addition to current-day Russia. Where appropriate, former countries or teams with names that correspond to existing countries have been combined.
#Get top 15 medal earning countries of all time
tops <- f %>%
group_by(region) %>%
summarize(total = sum(total_medals)) %>%
arrange(desc(total)) %>%
ungroup() %>%
slice(1:15)
countries <- c()
for (x in tops[1]){
countries <- c(countries, x)
}
topyears <- f %>%
filter(region %in% countries) %>%
group_by(region) %>%
mutate(totaloverall = sum(total_medals)) %>%
mutate(regiontotal = paste(region, ":", sum(total_medals)))
cc <- topyears %>%
group_by(region,totaloverall) %>%
summarize(xx = unique(regiontotal)) %>%
arrange(desc(totaloverall))
countries1 <- c()
for (x in cc$xx){
countries1 <- c(countries1, x)
}
topyears$regiontotal = factor(topyears$regiontotal, levels=countries1)
cols <- c("#ed806b","#84c3f0")
ggplot(topyears, aes(x= Year,y=total_medals, fill = Sex)) +
geom_bar(position="stack", stat="identity") +
theme_minimal() +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
strip.background =element_rect(fill="#f5f5f5", color = "#f5f5f5"),
panel.background = element_rect(fill = "#f5f5f5", color = "#f5f5f5"),
legend.title=element_blank(),
legend.position="bottom",
text=element_text(family="Helvetica",size = 20,color = "black"),
strip.text.x = element_text(size = 18))+
facet_wrap(~ regiontotal,
labeller = labeller(group = label_wrap_gen(width = 25)),
ncol = 3) +
ggtitle("Medals earned over time by 15 highest medal earning countries") +
labs(y="", x = "") +
scale_fill_manual(values = cols) +
scale_x_continuous(breaks = round(seq(min(topyears$Year), max(topyears$Year), by = 20),1))
Graph 1b displays all-time total medals for the top 15 medal-earning countries, broken down by medal type.
topyears$region = factor(topyears$region, levels=countries)
medals_pivot <- topyears %>%
ungroup() %>%
select(region, Year, total_bronze, total_silver, total_gold) %>%
pivot_longer(c(total_bronze, total_silver, total_gold), names_to = "medal_type", values_to = "count") %>%
group_by(region, medal_type) %>%
summarize(count = sum(count))
totals <- medals_pivot %>%
group_by(region) %>%
summarize(total = sum(count))
medals_pivot$medal_type = factor(medals_pivot$medal_type, levels=c("total_bronze","total_silver","total_gold"))
cols <- c("#db7d42","#d2d6d6","#d9bc02")
labs <- c("Bronze", "Silver", "Gold")
ggplot(medals_pivot, aes(x= region,y = count, fill = medal_type, label = count)) +
theme_minimal() +
#scale_fill_discrete() +
labs(y="", x = "", legend="") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.title=element_blank(),
text=element_text(family="Helvetica",size = 18),
legend.position="bottom",
axis.text.x = element_blank())+
scale_fill_manual(values = cols,labels=labs) +
geom_bar(position="stack", stat= "identity") +
geom_text(size = 4, position = position_stack(vjust = 0.5), color = "gray30") +
geom_text(aes(region, total + 20, label = total, fill = NULL),
data = totals, size = 6) +
theme(axis.text.y = element_text(size = 18, color = "black")) +
ggtitle("15 highest medal earning countries of all time") +
coord_flip()
Graph 2 is a visualization of 2014 Winter Olympic performance rankings of any medal-earning country using 3 measures: medal count, population per medal count, and per capita GDP per medal count. Countries are plotted in the coordinate plane based on their population per medal count and per capita GDP per medal count. Point size and color reflects unadjusted medal count. Patterns can be detected in where certain countries fall on the plane. Some countries group together as having high GDP-adjusted rank but low population-adjust ranking (China and Ukraine), while the Nordic countries tend to have the opposite ranking pattern. The Netherlands pulls ahead as being the most balanced in rankings while remaining high in all 3.
cols <- c("#d9bc02","#d2d6d6","#9e9e9e")
adj <- f %>%
filter(Year == 2014) %>%
ungroup() %>%
select(region, Population_edit, GDPpc_edit, Year, total_bronze, total_silver, total_gold) %>%
pivot_longer(c(total_bronze, total_silver, total_gold), names_to = "medal_type", values_to = "count") %>%
group_by(region, Population_edit, GDPpc_edit) %>%
summarize(count = sum(count)) %>%
arrange(desc(count)) %>%
ungroup() %>%
mutate(medal_rank = rank(-count, ties.method = "min")) %>%
group_by(region, Population_edit, GDPpc_edit) %>%
mutate(pop_adj_count = (round(Population_edit/count,0))) %>%
arrange(pop_adj_count) %>%
ungroup() %>%
mutate(pop_adj_rank=row_number()) %>%
group_by(region, Population_edit, GDPpc_edit) %>%
mutate(gdp_adj_count = (round(GDPpc_edit/count,0))) %>%
arrange(gdp_adj_count) %>%
ungroup() %>%
mutate(gdp_adj_rank=row_number()) %>%
arrange(desc(count)) %>%
mutate(totalrank = medal_rank + pop_adj_rank + gdp_adj_rank) %>%
mutate(GDPpc_edit = round(GDPpc_edit,0)) %>%
slice(1:26)
p<-ggplot(adj, aes(x=pop_adj_rank, y=gdp_adj_rank, label=region)) +
theme_bw() +
theme(panel.grid.major = element_line(color = "#f5f5f5",
size = 0.25,
linetype = 1),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
text=element_text(family="Helvetica")) +
geom_point(aes(color = medal_rank,
size = count,
text = paste('Region:', region,
'<br>Population:', Population_edit,
'<br>GDP per capita:', round(GDPpc_edit,0),
'<br>Medal count:', count,
'<br>Medal rank:', medal_rank,
'<br>Pop. adj. rank:', pop_adj_rank,
'<br>GDP adj. rank: ', gdp_adj_rank))) +
scale_size_continuous(range = c(2,8)) +
geom_text(aes(label=region, alpha = 4)) +
labs(y="Performed better for GDP per capita", x = "Performed better for population") +
scale_colour_gradientn(colours = cols) +
scale_x_reverse(breaks=seq(0,26,1)) +
scale_y_reverse(breaks=seq(0,26,1)) +
ggtitle("Ranking performance in the 2014 Winter Olympics using medal count, \npopulation, and GDP per capita") +
theme(legend.position = "none")
ggplotly(p,tooltip = c("text")) %>%
config(displayModeBar = F) %>%
add_annotations(text = '',
showarrow = TRUE,
xref = "paper", axref = "paper",
yref = "paper", ayref = "paper",
x = .98,
ax = 0,
y = 0,
ay = 0,
arrowhead=2,
arrowsize=1,
arrowwidth=3,
arrowcolor='black') %>%
add_annotations(text = '',
showarrow = TRUE,
xref = "paper", axref = "paper",
yref = "paper", ayref = "paper",
x = 0,
ax = 0,
y = .95,
ay = 0,
arrowhead=2,
arrowsize=1,
arrowwidth=3,
arrowcolor='black')
tabledata <- adj %>%
rename(
Region = region,
Population = Population_edit,
"GDP per capita" = GDPpc_edit,
"Medals won" = count,
"Medal rank" = medal_rank,
"Population per medal" = pop_adj_count,
"Rank adjusted by population" = pop_adj_rank,
"GDP per medal" = gdp_adj_count,
"Rank adjusted by GDP"= gdp_adj_rank,
"Sum of all rankings" = totalrank
)
datatable(tabledata, options = list(pageLength = 5, scrollX = T),class = 'cell-border stripe') %>%
formatStyle('Medal rank', backgroundColor = '#d9bc02') %>%
formatStyle('Rank adjusted by population', backgroundColor = '#d9bc02') %>%
formatStyle('Rank adjusted by GDP', backgroundColor = '#d9bc02')
Graph 3 charts the absence or presence of a host country’s advantage in total medals earned. Medal totals for each country that has hosted the winter games are plotted over time, with year(s) the country has hosted highlighted in red. Certain countries jump out as having a potential advantage when hosting, while others do not appear to have had an advantage. Host country advantage was graphed over time in this way so that each Olympics year can be compared to the years directly before and after it.
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/List_of_Olympic_Games_host_cities")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[2]], fill=TRUE)[-1]
hosts <- hosts %>% filter(Winter != "") %>%
select(City, Country, Year)
one <- c('Sapporo','Japan',1940)
two <- c('Garmisch-Partenkirchen','Germany',1940)
three <- c('Sochi','Russia',2014)
four <- c('Salt Lake City','USA',2002)
five <- c('Squaw Valley','USA',1960)
six <- c('Lake Placid','USA',1932)
seven <- c('Lake Placid','USA',1980)
hosts <- rbind(hosts, one,two,three,four,five,six,seven)
h <- base %>%
filter(Season == "Winter") %>%
group_by(NOC_edit,region,Year, notes, Games, Sport, Event, Medal, Population_edit, GDPpc_edit, Sex) %>%
summarize(medal_count = n_distinct(Medal)) %>%
mutate(medal_count = replace(medal_count, is.na(Medal) == TRUE, 0)) %>%
mutate(bronze_flag = case_when(Medal== "Bronze" ~ 1,Medal!="Bronze"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(silver_flag = case_when(Medal== "Silver" ~ 1,Medal!="Silver"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(gold_flag = case_when(Medal== "Gold" ~ 1,Medal!="Gold"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
group_by(NOC_edit,region,Year, notes, Games, Population_edit, GDPpc_edit, Sex) %>%
summarize(total_medals = sum(medal_count), total_bronze = sum(bronze_flag),total_silver = sum(silver_flag),total_gold = sum(gold_flag))
hosted <- sqldf(
'
with medals as (
select
case when region = "Serbia" and notes = "Yugoslavia" then "Yugoslavia"
else region end as region,
year, sum(total_medals) as medals,
sum(total_bronze) as bronze,
sum(total_silver) as silver,
sum(total_gold) as gold
from h
group by
case when region = "Serbia" and notes = "Yugoslavia" then "Yugoslavia"
else region end,
year),
stage as (
select m.*,
case when h.Country is null then "N" else "Y" end as host_flag
from medals m
left join hosts h on m.region = h.Country and m.Year = h.Year)
select stage.* from stage
inner join
(select distinct
region
from stage
where host_flag = "Y") s on s.region = stage.region
'
)
hostyears <- hosted %>%
filter(host_flag == "Y")
cols <- c("#cfa7a7","red")
ggplot(hosted, aes(x= year,y=medals, fill = host_flag)) +
geom_bar(position="stack", stat="identity") +
theme_minimal() +
theme(
strip.background =element_rect(fill="#f5f5f5", color = "#f5f5f5"),
panel.background = element_rect(fill = "#f5f5f5", color = "#f5f5f5"),
legend.title=element_blank(),
legend.position="bottom",
plot.title = element_text(size=14),
text=element_text(family="Helvetica",size = 14),
axis.text.x = element_blank())+
facet_wrap(~ region, labeller = labeller(group = label_wrap_gen(width = 25)), ncol = 4) +
ggtitle("Medals won by host countries over time - host years highlighted in red") +
labs(y="", x = "") +
geom_text(aes(label = year), vjust = -0.2,data = hostyears, alpha = .7) +
expand_limits(y = c(0, 45)) +
expand_limits(x = c(1924, 2020)) +
scale_fill_manual(values = cols) +
theme(legend.position = "none")
athlete_total8 <- base %>%
filter(Season == "Winter") %>%
group_by(NOC_edit,region,Year, Team, Name, notes, Games, Sport, Event, Medal, Population_edit, GDPpc_edit, Sex, Age, Height, Weight) %>%
summarize(medal_count = n_distinct(Medal)) %>%
mutate(medal_count = replace(medal_count, is.na(Medal) == TRUE, 0)) %>%
mutate(bronze_flag = case_when(Medal== "Bronze" ~ 1,Medal!="Bronze"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(silver_flag = case_when(Medal== "Silver" ~ 1,Medal!="Silver"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(gold_flag = case_when(Medal== "Gold" ~ 1,Medal!="Gold"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
group_by(region, Name, Sex, Height, Weight) %>%
summarize(total_medals = sum(medal_count), total_bronze = sum(bronze_flag),total_silver = sum(silver_flag),total_gold = sum(gold_flag),
min_competed = min(Year), max_competed = max(Year), min_age = min(Age), max_age = max(Age),sports = toString(unique(Sport))) %>%
arrange(desc(total_medals)) %>%
subset(total_medals > 7)
athlete_year <- base %>%
filter(Season == "Winter") %>%
group_by(NOC_edit,region,Year, Team, Name, notes, Games, Sport, Event, Medal, Population_edit, GDPpc_edit, Sex, Age, Height, Weight) %>%
summarize(medal_count = n_distinct(Medal)) %>%
mutate(medal_count = replace(medal_count, is.na(Medal) == TRUE, 0)) %>%
mutate(bronze_flag = case_when(Medal== "Bronze" ~ 1,Medal!="Bronze"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(silver_flag = case_when(Medal== "Silver" ~ 1,Medal!="Silver"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(gold_flag = case_when(Medal== "Gold" ~ 1,Medal!="Gold"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
group_by(region, Name, Sex, Year, Age, Height, Weight) %>%
summarize(total_medals = sum(medal_count), total_bronze = sum(bronze_flag),total_silver = sum(silver_flag),total_gold = sum(gold_flag),
sports = toString(unique(Sport))) %>%
arrange(desc(total_medals))
yearly <- sqldf(
'select a.*,
ay.Year,
ay.Age as year_age,
ay.total_medals as year_total,
ay.total_bronze as year_bronze,
ay.total_silver as year_silver,
ay.total_gold as year_gold
from athlete_total8 a
join athlete_year ay on ay.Name = a.Name
'
)
Graph 4 displays detailed medal information over the Olympics careers of the top 16 (anyone who earned 8 medals or more) winter Olympic athletes of all time. Point color is determined by the number of gold medals that an athlete won that year. Point size is based on the total number of medals an athlete won that year. Demographic and sport information is displayed upon hover.
yearly$text1 <- paste(yearly$region, "\n", yearly$sports,"\nSex:", yearly$Sex, "\nAge:", yearly$year_age,"\nTotal Medals:",yearly$total_medals, ",", yearly$Year, "Total:", yearly$year_total,"\nGold:", yearly$year_gold, " Silver:", yearly$year_silver," Bronze:", yearly$year_bronze)
fig <- plot_ly(yearly, color = I("black"), colors = c("#9e9e9e","#d9bc02"))
fig %>%
add_segments(x = ~min_competed,
xend = ~max_competed,
y = ~Name,
yend = ~Name,
showlegend = FALSE) %>%
add_markers(x = ~Year,
y = ~Name,
name = " ",
color = ~year_gold,
opacity = .99,
text = ~text1,
size = ~year_total,
showlegend = FALSE,
opacity = 1,
hovertemplate = "%{x}, %{y}
<br>%{text}") %>%
layout(showlegend = FALSE,
title = "Medal earning history for all-time top performing winter athletes",
xaxis = list(title = ""),
font = list(family = "Helvetica"),
margin = list(l = 65),
yaxis = list(title = '',
categoryorder = "array",
categoryarray = as.array(rev(athlete_total8$Name)))
) %>%
config(displayModeBar = F) %>%
hide_colorbar()
sport_age <- base %>%
filter(Season == "Winter") %>%
group_by(NOC_edit,region,Year, Team, Name, notes, Games, Sport, Event, Medal, Population_edit, GDPpc_edit, Sex, Age, Height, Weight) %>%
summarize(medal_count = n_distinct(Medal)) %>%
mutate(medal_count = replace(medal_count, is.na(Medal) == TRUE, 0)) %>%
mutate(bronze_flag = case_when(Medal== "Bronze" ~ 1,Medal!="Bronze"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(silver_flag = case_when(Medal== "Silver" ~ 1,Medal!="Silver"~ 0, is.na(Medal)==TRUE ~ 0)) %>%
mutate(gold_flag = case_when(Medal== "Gold" ~ 1,Medal!="Gold"~ 0, is.na(Medal)==TRUE ~ 0))
golds <- sport_age %>%
filter(gold_flag > 0) %>%
group_by(Sport) %>%
summarize(avg_gold = mean(Age, na.rm=TRUE))
nogold <- sport_age %>%
filter(gold_flag == 0) %>%
group_by(Sport) %>%
summarize(avg_age = mean(Age, na.rm=TRUE))
agecompare <- merge(golds, nogold, by = "Sport") %>%
mutate(difference = avg_gold - avg_age)
Graph 5 charts age differences between gold medal winners and non-gold medal winners for each sport. We can see that for most sports, gold medal winners are older on average. Interesting exceptions are skeleton and short track speed skating. We can also see the general age differences for each sport.
hjust <- c(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,2,2,-1,2,-1)
col <- c("#e4fce1","#e4fce1","#e4fce1","#e4fce1","#e4fce1","#e4fce1","#e4fce1","#e4fce1","#e4fce1",
"#e4fce1","#e4fce1","#ffc7cb","#ffc7cb","#e4fce1","#ffc7cb","#e4fce1")
agecompare %>% arrange(desc(avg_age)) %>%
mutate(Sport = factor(Sport, unique(Sport))) %>%
ggplot() +
ggtitle("Average age of gold medal winners vs. \nnon-gold medal winners by sport") +
theme_minimal() +
theme(text=element_text(family="Helvetica",size = 14),
plot.title = element_text(size=14)) +
geom_segment(aes(x = avg_age,
y = Sport,
xend = avg_gold,
yend = Sport),
# arrow = arrow(),
linetype = "dotted") +
geom_point(aes(x=avg_age,y=Sport),
color = "black",
size = 1) +
geom_point(aes(x=avg_gold,y=Sport),
color = "goldenrod",
size = 6) +
expand_limits(x = c(20, 37)) +
labs(y="", x = "") +
theme(#panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
strip.background =element_rect(fill="white", color = "white"),
panel.background = element_rect(fill = "white", color = "white"))+
scale_x_continuous(breaks = seq(20, 37, by = 1)) +
geom_label(data=agecompare, fill=col, size=3,hjust = hjust,
aes(x=avg_gold, y=Sport, label=round(difference,2)))