In this video tutorial I show you how to make so called sankey diagrams or sankey networks in R. Sankey diagrams are a nice way to visualize links between certain stats and how they “flow”.

You can find the code for the sankey diagram at the bottom of this page and the data here to reproduce the whole diagram.

The video:

The code:

setwd("C:/Users/bs/Nextcloud/Firma/Projects/baseball_dashboard")

library(tidyverse)
library(data.table)
library(networkD3)


#read in complete data
batting_all_sankeys_data <- data.table::fread("tables/batting_all_sankeys_data.csv")

#filter for specific player or team
bat <- dplyr::filter(batting_all_sankeys_data, Name == "Christian Yelich* (MIL)")[1,]

##Make links and nodes Table##
#The order of the nodes in the final visualization are determined here
links <- data.table(
  source = c(rep("PA", 15), "1B", "2B", "3B", "UBB", "IBB", "HBP", "ROE", "XI", 
             rep("OnBase", 4), "CS", "PO", "OOB", "OnBase"),
  
  target = c("SH", "SF", "SO", "FO", "GO", "GDP", "HR", "1B", "2B", "3B",
             "UBB", "IBB", "HBP", "ROE", "XI", rep("OnBase", 8),
             "R", "CS", "PO", "OOB", "Out", "Out", "Out", "LOB"),
  
  group = c("Sac", "Sac", rep("Out", 4), "Run", rep("OnBase", 16), "Run", 
            rep("Out", 6), "LOB"),
  
  value = c(bat$SH, bat$SF, bat$SO, bat$FO, bat$GO, bat$GDP, bat$HR, bat$`1B`,
            bat$`2B`, bat$`3B`, bat$UBB, bat$IBB, bat$HBP, bat$ROE,
            bat$XI, bat$`1B`, bat$`2B`, bat$`3B`, bat$UBB, bat$IBB,
            bat$HBP, bat$ROE, bat$XI, bat$R, bat$CS, bat$PO, bat$OOB,
            bat$CS, bat$PO, bat$OOB, bat$LOB)
  )

##Nodes##

nodes <- data.table(name=c(as.character(links$source), 
                               as.character(links$target)) %>% unique())

#Order Nodes

nodes_order <- c("PA", "SH", "SF", "SO", "FO", "GO", "GDP", "HR", "1B", "2B", "3B",
                 "UBB", "IBB", "HBP", "ROE", "XI", "OnBase", "R", "CS", "PO",
                 "OOB", "Out", "LOB")

nodes <- nodes[match(nodes_order, nodes$name),]

#Add groups to nodes
nodes$group <- as.factor(c("PA", "Sac", "Sac", rep("Out",4), "Run", 
                           rep("OnBase", 9), "Run", rep("Out", 4), "LOB"))

#Add Values to nodes
nodes$value <- c(bat$PA, bat$SH, bat$SF, bat$SO, bat$FO, bat$GO, bat$GDP, bat$HR,
                 bat$`1B`, bat$`2B`, bat$`3B`, bat$UBB, bat$IBB, bat$HBP, bat$ROE,
                 bat$XI, bat$OnBase, bat$R, bat$CS, bat$PO, bat$OOB, bat$Out,
                 bat$LOB)

##Delete Links and Nodes with no value##

missing_links <- which(links$value < 1 | is.na(links$value))

links <- links[!missing_links]

missing_nodes <- which(nodes$value < 1 | is.na(nodes$value))

nodes <- nodes[!missing_nodes]

#to order our sankey network we have to make two additional variables
links$IDsource <- match(links$source, nodes$name) - 1
links$IDtarget <- match(links$target, nodes$name) - 1

##Write Total PAs into Label##
nodes[1,1] <- paste(nodes[1,1], " Total (", bat$PA, ")", sep ="")

##Calculate Percentages##

nodes_to_percentize <- c(nodes[2:length(nodes$name),1])[[1]]

#percentize function#
percentize <- function(string) {
  nodes[name == string, name := paste(string, " (", 
                                round(bat[,string] / bat$PA * 100, digits = 2), 
                                "%)", sep = "")] 
}

#apply percentize function to all lines in nodes
for (i in nodes_to_percentize) {
  percentize(as.character(i))
}


##Make Sankey Diagram##
sankeyNetwork(Links = links, Nodes = nodes, Source = "IDsource", Target = "IDtarget",
              Value = "value", NodeID = "name", NodeGroup = "group", 
              LinkGroup = "group", fontSize = 15, 
              colourScale = JS("d3.scaleOrdinal(d3.schemeCategory20);"), 
              nodePadding = 8.5)