Interactive Figures for Sport Science in R

Interactive figures have great use in sport science, from visualising training reports to communicating test scores. One R package that allows the creation of interactive figures, using R code, is plotly.

To demonstrate how plotly works and how it may be of use to visualise your own dataset, consider the example below.

First, we create dummy data:

# To create interactive figures - declare netball playing position
PlayingPosition = c("C", "WA", "WD", "GA", "GD", "GS", "GK")
# Declare four separate training sessions
TrainingSession = c("Main Training", "Courtwork", "Match Simulation", "Long Court")
# Set the seed, to reuse the same set of distance variables
set.seed(28)
# Create a summary data.frame containing dummy total and sprint distance data, across all training sessions
PhysicalOutputData = data.frame(PlayingPosition = rep(PlayingPosition, each = 4),
                                TrainingSession = rep(TrainingSession, each = 1),
                                TotalDistance = runif(28, 450, 950),
                                SprintDistance = runif(28, 0, 200))
# Round the total and sprint distance columns to 1 decimal place
PhysicalOutputData$TotalDistance <- round(PhysicalOutputData$TotalDistance, digits = 1)
PhysicalOutputData$SprintDistance <- round(PhysicalOutputData$SprintDistance, digits = 1)
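
To sanity-check the dummy data before plotting, you can inspect the first few rows:

# Inspect the first rows of the dummy dataset
head(PhysicalOutputData)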

Next, we load a package called dplyr, which is very useful once you start to use R regularly! This package allows you to manipulate data.frames to quickly calculate summary statistics such as means, SDs and totals. You can see an example of this in our next step below:

# As an example, create a summary column looking at weekly totals
require(dplyr)
WeekSummaryData <- PhysicalOutputData %>%
  group_by(PlayingPosition) %>%
  summarise(TD = sum(TotalDistance),
            SD = sum(SprintDistance))

Now we load the plotly package (you may need to install it first) and, using the code below, create our first interactive figure!

# Load required package
require(plotly)
# Plot - basic plotly image
plot_ly(data = WeekSummaryData, x = ~TD, y = ~PlayingPosition, type = 'bar', orientation = 'h',
        color = ~PlayingPosition) %>%
  layout(title = "Weekly Total Distance Covered",
         yaxis = list(title = ""),
         xaxis = list(title = "Total Distance (m)"),
         showlegend = FALSE)

[Figure: basic interactive plotly bar chart of weekly total distance by playing position]

The image above is a static export, but in the “Viewer” pane of RStudio you should now be able to hover over the plot, with each position and its weekly total distance displayed.
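
If you would like to share the interactive version outside of RStudio, one option (a sketch, assuming the htmlwidgets package is installed) is to save the figure as a standalone HTML file:

# Load the required package
require(htmlwidgets)
# Assign the figure, then save it as a standalone HTML file in the working directory
WeeklyPlot <- plot_ly(data = WeekSummaryData, x = ~TD, y = ~PlayingPosition,
                      type = 'bar', orientation = 'h', color = ~PlayingPosition)
saveWidget(WeeklyPlot, "WeeklyTotalDistance.html")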

To change the hover information, use the code below to display not only the total distance but also the total sprint distance covered.

# To alter the hover text - show the sprint distance too ('<br>' adds a line break)
plot_ly(WeekSummaryData, x = ~TD, y = ~PlayingPosition,
        type = 'bar',
        orientation = 'h',
        hoverinfo = 'text',
        text = ~paste('Total Distance:', TD, 'm',
                      '<br>',
                      'Sprint Distance:', SD, 'm'),
        color = ~PlayingPosition) %>%
  layout(title = "Weekly Total Distance Covered",
         yaxis = list(title = ""),
         xaxis = list(title = "Total Distance (m)"),
         showlegend = FALSE)

Use the above code as a template for your own reporting; you can now visualise a great deal of information in just one figure!

Creating publication-worthy figures in R

Apologies for my lengthy delay in updating this blog – a great deal has occurred in the past 12 months!

A popular question I am asked is: “How can I use R to create figures for my thesis and/or publications?” I used R to create figures for my (now submitted!) PhD thesis and a recent publication in the Journal of Sports Sciences. The code below will give you high-resolution figures that you can use in your own scientific work!

First, create a dummy dataset so we can get plotting:

# Create a list of netball athlete positions
PlayingPosition = c("C", "WA", "WD", "GA", "GD", "GS", "GK")
# Declare two playing standards, elite and junior-elite
PlayingStandard = c("Elite", "Junior Elite")
# Set the seed, to reuse the same set of random variables
set.seed(14)
# Create a summary data.frame containing dummy total distance data, across both playing standards
SummaryData = data.frame(PlayingPosition = rep(PlayingPosition, each = 2),
                         PlayingStandard = rep(PlayingStandard, each = 1),
                         TotalDistance = runif(14, 450, 950))
# Round the total distance column to 0 decimal places
SummaryData$TotalDistance <- round(SummaryData$TotalDistance, digits = 0)

Next, load the required package and run the code to create the figure:

# Load required package
require(ggplot2)
# Creating a publication worthy figure
ggplot(data = SummaryData, aes(x = PlayingPosition, y = TotalDistance, fill = PlayingStandard)) +
 geom_bar(width=.7, stat="identity", position="dodge", colour="Black") +
 scale_fill_grey() +
 theme_bw() +
 ylab("Total Distance (m) \n") +
 scale_y_continuous(limits = c(0, 1000), expand = c(0, 0), breaks = seq(0, 1000, by = 250)) +
 theme(legend.position = "bottom",
 legend.title = element_blank(),
 axis.text.x = element_text(size = 12, colour = "black"),
 axis.text.y = element_text(size = 12, colour = "black"),
 axis.ticks.x = element_blank(),
 plot.background = element_blank(),
 plot.title = element_blank(),
 axis.title.x = element_blank(),
 axis.title.y = element_text(face = "bold", size = 14),
 axis.line.x = element_line(color = "black", size = .5),
 axis.line.y = element_line(color = "black", size = .5),
 panel.grid.major = element_blank(),
 panel.grid.minor = element_blank(),
 panel.border = element_blank())

Although this figure looks OK, the legend is plotted. The legend is important so readers know which colours correspond to the elite and junior-elite levels; however, legends are typically not included in scientific figures. Rather, the colours and levels are described in the accompanying figure caption. Run the code below to remove the legend completely. Alternatively, if you wish to retain the legend, you can alter its position by replacing legend.position = "bottom" with legend.position = "right".

# Remove the legend
ggplot(data = SummaryData, aes(x = PlayingPosition, y = TotalDistance, fill = PlayingStandard)) +
  geom_bar(width = .7, stat = "identity", position = "dodge", colour = "Black") +
  scale_fill_grey() +
  theme_bw() +
  ylab("Total Distance (m) \n") +
  scale_y_continuous(limits = c(0, 1000), expand = c(0, 0), breaks = seq(0, 1000, by = 250)) +
  theme(legend.position = "none",
        legend.title = element_blank(),
        axis.text.x = element_text(size = 12, colour = "black"),
        axis.text.y = element_text(size = 12, colour = "black"),
        axis.ticks.x = element_blank(),
        plot.background = element_blank(),
        plot.title = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_text(face = "bold", size = 14),
        axis.line.x = element_line(color = "black", size = .5),
        axis.line.y = element_line(color = "black", size = .5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

Although this figure will look great in the “Plots” pane of RStudio, it does not export as a high-resolution figure. Most journals require figures in either .png or .tiff format and thankfully, we can produce these in R!

To export a high-resolution figure to your Desktop, we must first assign the plot to an object, then use another package called gridExtra. You may need to install this package before running the code below. If you work on a Windows machine, you will also need to change the file location. Mac users – the code below should work as-is!

# To export a high-resolution figure
Figure1 <- ggplot(data = SummaryData, aes(x = PlayingPosition, y = TotalDistance, fill = PlayingStandard)) +
  geom_bar(width = .7, stat = "identity", position = "dodge", colour = "Black") +
  scale_fill_grey() +
  theme_bw() +
  ylab("Total Distance (m) \n") +
  scale_y_continuous(limits = c(0, 1000), expand = c(0, 0), breaks = seq(0, 1000, by = 250)) +
  theme(legend.position = "none",
        legend.title = element_blank(),
        axis.text.x = element_text(size = 12, colour = "black"),
        axis.text.y = element_text(size = 12, colour = "black"),
        axis.ticks.x = element_blank(),
        plot.background = element_blank(),
        plot.title = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_text(face = "bold", size = 14),
        axis.line.x = element_line(color = "black", size = .5),
        axis.line.y = element_line(color = "black", size = .5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

# Load required package
require(gridExtra)
# Save this figure to your desktop
png("~/Desktop/Figure1.png", width = 7, height = 6, units = 'in', res = 300)
grid.arrange(Figure1) # Make plot
dev.off()
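
An alternative, sketched below, is ggplot2's ggsave(), which writes a plot object straight to file; the dpi argument controls the resolution:

# Alternative export via ggsave - no extra package required beyond ggplot2
ggsave("~/Desktop/Figure1.png", plot = Figure1, width = 7, height = 6, dpi = 300)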

For a .tiff file, don't simply rename the extension (the file would still be PNG-formatted); instead, use the tiff() device, which accepts the same arguments as png(). You can also alter the width and height of the respective figure, should you require a landscape image.
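
A minimal sketch of the .tiff export, using the tiff() device from base R's grDevices:

# Save the figure as a TIFF instead of a PNG
tiff("~/Desktop/Figure1.tiff", width = 7, height = 6, units = 'in', res = 300)
grid.arrange(Figure1) # Make plot
dev.off()

The high-resolution image is below: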

[Figure: exported high-resolution grouped bar chart of total distance by playing position and standard]

Happy plotting!

Validity by Linear Regression in R

I am currently analysing the data for one of my PhD studies, specifically, assessing the validity of an athlete tracking system against the current gold standard. Naturally, I wanted to run the analysis in R, so I set about compiling code to do so.

Below is an example of a simple linear relationship between a practical measure, such as a global positioning system (GPS), and a criterion measure, the VICON motion analysis system. The dependent variable of interest is peak velocity attained during an agility circuit. I am interested in the typical error of the estimate (TEE) and have included code for log-transforming the data, to obtain an estimation of the errors as percentages of the mean. The code is designed to complement, and should replicate, the spreadsheet for calculating validity by Professor Will Hopkins. For further information on the TEE obtained by these spreadsheets, click here to view Will’s informative website.

To start with, construct a list of 45 random athlete names:

Athletes = c("Gus", "Hudson", "Bobby", "Tom", "Jessie", "Brent", "Sue", "Cam", "Steve", "Andrew", "Tony", "Jessica", 
"Heather", "Ritchie", "Joe", "Blair", "Elliott", "Sean", "Jarrad", "Dave", "Michele", "Kristine", "Lee", 
"Sharyn", "Ollie", "Marni", "Aaron", "Thomas", "Alan", "Wayan", "Valeryia", "Daria", "Fay", "Patricia", 
"Nancy", "Arthur", "John", "Isobel", "Molly", "Amanda", "Wendy", "Helen", "Bart", "Peter", "Stu")

Now, create a data.frame of dummy peak velocity data from our practical and criterion measures for each athlete. It is important to run the set.seed function to ensure the same data are generated each time.

 
# Set the seed
set.seed(90)
# Create a data.frame of dummy peak velocity data from two different tracking systems
ValidityDataset <- data.frame(Name = rep(Athletes, each = 1),
                              Criterion = runif(45, 4.5, 6.7),
                              Practical = runif(45, 4.1, 8))

Run the code below to obtain basic summary statistics for each measure of peak velocity.

 
# Return basic summary statistics 
Criterion_Mean <- mean(ValidityDataset$Criterion)
Criterion_SD <- sd(ValidityDataset$Criterion)
Practical_Mean <- mean(ValidityDataset$Practical)
Practical_SD <- sd(ValidityDataset$Practical)
SampleSize <- length(ValidityDataset$Name)

You will see that the mean peak velocity derived from GPS is 6.1 ± 1.1 m·s⁻¹, whilst the VICON gives 5.7 ± 0.6 m·s⁻¹, with n = 45. To view these data as a figure in our favourite plotting package, ggplot2, we first have to “wrangle” the data into long, or repeated measures, format. This is quite straightforward and can be done by simply running the code below. The reshape2 package gives the flexibility to quickly turn data from wide to long format. You may need to install it into R before running the first line.

# Load the required package
require(reshape2)
ValidityDatasetLong <- melt(ValidityDataset, id.vars = "Name",
                            value.name = "PeakVelocity", variable.name = "Measure")

Now that our data is in long format, we can easily calculate means and SDs. There is the option to calculate other statistics, including the SEM; however, I advise against using that one!

# Load the required package
require(plyr)
ValiditySummaryData <- ddply(ValidityDatasetLong, c("Measure"), summarise,
                             Mean = mean(PeakVelocity),
                             SD = sd(PeakVelocity))

The code below will create a lovely barplot of the peak velocity attained from our two tracking systems.

require(ggplot2)
ggplot(ValiditySummaryData, aes(x = Measure, y = Mean, fill = Measure)) +
  geom_bar(stat = "identity", color = "black",
           position = position_dodge()) +
  geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD), width = .2,
                position = position_dodge(.9)) +
  coord_cartesian(ylim = c(0, 8)) +
  xlab("\n Tracking System") +
  ylab(expression(Peak ~ Velocity ~ (m.s^-1))) +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_text(vjust = 1.5))

[Figure: barplot of mean peak velocity per tracking system with upper and lower SD bars]

If you prefer to present only the upper limits of the SD bars, use the code below:

ggplot(ValiditySummaryData, aes(x = Measure, y = Mean, fill = Measure)) +
  geom_bar(stat = "identity", color = "black",
           position = position_dodge()) +
  geom_errorbar(aes(ymin = Mean, ymax = Mean + SD), width = .2,
                position = position_dodge(.9)) +
  coord_cartesian(ylim = c(0, 8)) +
  xlab("\n Tracking System") +
  ylab(expression(Peak ~ Velocity ~ (m.s^-1))) +
  theme_classic() +
  theme(legend.position = "none",
        axis.title.y = element_text(vjust = 1.5))

[Figure: barplot of mean peak velocity per tracking system with upper SD bars only]

Now, we create a basic linear regression model of the criterion and practical values.

# Linear model of Criterion as a function of the Practical measure
PeakVelocityLM <- lm(Criterion ~ Practical, data = ValidityDataset)
# Summary of the linear regression model
summary(PeakVelocityLM)
# Return upper and lower confidence limits at the 90% level
confint(PeakVelocityLM, level = 0.90)
# Extract the Intercept
Intercept_PeakVelocityLM <- coef(PeakVelocityLM)["(Intercept)"]
# Extract the Slope
Slope_PeakVelocityLM <- coef(PeakVelocityLM)["Practical"]

By running summary(PeakVelocityLM), we can see that the intercept is 4.75 whilst the slope is 0.15. If you wish to present the 95% CI, simply change level = 0.90 to level = 0.95.
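
For example:

# Return upper and lower confidence limits at the 95% level
confint(PeakVelocityLM, level = 0.95)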

To obtain a plot of the regression model, run the following code:

ggplot(ValidityDataset, aes(x = Criterion, y = Practical)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red", se = FALSE) +
  xlab("\n Criterion") +
  ylab("Practical \n") +
  theme_classic()

[Figure: scatterplot of criterion versus practical with fitted regression line]

I found a cool piece of code from another R blog that plots the above figure with the model summary data as a title. I made a few tweaks to give a clear background and remove the SE shading! You can obtain this figure by running the following code:

# Function to extract values out of linear regression model
ggplotRegression <- function(fit) {
  ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
    geom_point() +
    stat_smooth(method = "lm", col = "red", se = FALSE) +
    xlab("\n Criterion") +
    ylab("Practical \n") +
    theme_classic() +
    labs(title = paste("Adj R2 = ", signif(summary(fit)$adj.r.squared, 5),
                       "Intercept =", signif(fit$coef[[1]], 5),
                       " Slope =", signif(fit$coef[[2]], 5),
                       " P =", signif(summary(fit)$coef[2, 4], 5)))
}
# Run the following line to obtain the figure
ggplotRegression(PeakVelocityLM)

[Figure: regression plot with the model summary displayed as the title]

Now we move on to calculating the predicted and residual values, plus the difference in raw units between the practical and criterion measures.

# Calculate the predicted values
ValidityDataset$Predicted <- Intercept_PeakVelocityLM + Slope_PeakVelocityLM *
  ValidityDataset$Practical
# Calculate the residual value
ValidityDataset$Residual <- ValidityDataset$Criterion - ValidityDataset$Predicted
# Calculate the difference b/w Practical and Criterion
ValidityDataset$PracMinusCrit <- ValidityDataset$Practical - ValidityDataset$Criterion

To obtain a neat plot of the residuals versus the predicted values, run the following code:

# Plot the residuals versus predicted
ggplot(ValidityDataset, aes(x = Predicted, y = Residual)) +
  geom_point() +
  geom_hline(yintercept = 0, col = "red", linetype = "solid") +
  coord_cartesian(ylim = c(-1.5, 1.5), xlim = c(5.25, 6)) +
  xlab("\n Predicted") +
  ylab("Residual \n") +
  theme_classic()

[Figure: residuals versus predicted values with a horizontal reference line at zero]

Now, for the fun stuff! The code below calculates the mean bias in raw and standardised units, plus the SD of these.

# Calculate the mean bias in raw units
MeanBias_RawUnits <- mean(ValidityDataset$PracMinusCrit)
# Calculate the mean bias, standardised
MeanBias_Standardised <- MeanBias_RawUnits/Criterion_SD
# Calculate the standard deviation of the Practical minus Criterion
SD_PracMinusCrit <- sd(ValidityDataset$PracMinusCrit)
# Calculate the SD of bias, standardised
SDBias_Standardised <- SD_PracMinusCrit/Criterion_SD

The smallest important difference in raw units is also calculated.

# Set the smallest important difference, in standardised units
SmallestImportantDiff <- 0.2
# Convert to raw units
RawUnits_SmallestImportantDiff <- SmallestImportantDiff*Criterion_SD

The smallest important difference in raw units is 0.12. Standardised effects can be interpreted using Hopkins’s modified Cohen scale (encoded as a small helper function after the list below):

  • < 0.2 = Trivial
  • 0.2 to 0.6 = Small
  • 0.6 to 1.2 = Moderate
  • 1.2 to 2.0 = Large
  • > 2.0 = Very large
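
For convenience, the scale can be encoded as a small helper function; this is a hypothetical sketch of my own (the function name is illustrative), not part of Hopkins's spreadsheets:

# Hypothetical helper: interpret a standardised effect on the modified Cohen scale
InterpretEffect <- function(x) {
  x <- abs(x)
  if (x < 0.2) "Trivial"
  else if (x < 0.6) "Small"
  else if (x < 1.2) "Moderate"
  else if (x < 2.0) "Large"
  else "Very large"
}
InterpretEffect(0.97) # returns "Moderate"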

Below is the code for calculating the typical error of the estimate (TEE) in raw units and standardised. The standardisation is the typical error divided by the SD of the criterion.

# Extract the typical error of the estimate - raw units
TEE_Raw <- summary(PeakVelocityLM)$sigma
# Standardised
TEE_Standardised <- TEE_Raw/Criterion_SD
# Calculate the pearson correlation
PearsonCorrelation <- cor(ValidityDataset$Criterion, ValidityDataset$Practical, method="pearson")
# Calculate Bland-Altman 95% LOA
BlandAltman <- SD_PracMinusCrit*1.96

The TEE in raw units is 0.60 whilst the standardised value is 0.97. The Pearson correlation is calculated at 0.28.

The Bland-Altman ± 95% limits of agreement (LOA) are also given, at 2.20. A proposed practical application is to subtract the bias from the practical measure; if the resulting value falls outside the ± limits, you would consider there to be a significant difference. However, this interpretation does not take into account the smallest important difference.
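
A minimal sketch of that check, using the objects calculated above (the column name BiasAdjusted is my own):

# Subtract the mean bias, then flag trials outside the +/- 95% LOA
ValidityDataset$BiasAdjusted <- ValidityDataset$PracMinusCrit - MeanBias_RawUnits
sum(abs(ValidityDataset$BiasAdjusted) > BlandAltman) # number of trials outside the limits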

Now we log-transform the values, so we can express the errors as percentages. The following code gives the back-transformed mean and the SD as a coefficient of variation (CV %).

# Log transform the values
ValidityDataset$Criterion_LogTransform <- log(ValidityDataset$Criterion)*100
ValidityDataset$Practical_LogTransform <- log(ValidityDataset$Practical)*100
# Back transform the mean - Criterion
Criterion_Mean_BackTransform <- exp(mean(ValidityDataset$Criterion_LogTransform)/100)
# SD as a factor - Criterion
Criterion_SD_Factor <- exp(sd(ValidityDataset$Criterion_LogTransform)/100)
# SD as a CV (%) - Criterion
Criterion_SD_CV <- (Criterion_SD_Factor - 1)*100
# Back transform the mean - Practical
Practical_Mean_BackTransform <- exp(mean(ValidityDataset$Practical_LogTransform)/100)
# SD as a factor - Practical
Practical_SD_Factor <- exp(sd(ValidityDataset$Practical_LogTransform)/100)
# SD as a CV (%) - Practical
Practical_SD_CV <- (Practical_SD_Factor - 1)*100

The criterion SD as a CV is 11.5%.

We will need to run another linear regression, using the log-transformed values.

# Run a linear model of the log transformed data
LogModel_Validity <- lm(Criterion_LogTransform ~ Practical_LogTransform, data = ValidityDataset)
# Extract the coefficients
Intercept_CriterionLogTransform <- LogModel_Validity$coef[1]
Intercept_CriterionLogTransform <- exp(Intercept_CriterionLogTransform/100)
Slope_PracticalLogTransform <- LogModel_Validity$coef[2]
# Extract the upper and lower CL's @ 90%
CI_CriterionLogTransform <- confint(LogModel_Validity, "(Intercept)", level = .90)
CI_CriterionLogTransform <- exp(CI_CriterionLogTransform/100)
CI_PracticalLogTransform <- confint(LogModel_Validity, "Practical_LogTransform", level = .90)
# Calculate predicted values
ValidityDataset$Predicted_LogTransform <- 100 * log(Intercept_CriterionLogTransform) +
  Slope_PracticalLogTransform * ValidityDataset$Practical_LogTransform
# Calculate the residuals
ValidityDataset$Residual_LogTransform <- ValidityDataset$Criterion_LogTransform -
  ValidityDataset$Predicted_LogTransform
ValidityDataset$PracMinusCrit_LogTransform <- ValidityDataset$Practical_LogTransform -
  ValidityDataset$Criterion_LogTransform
# Calculate the mean bias in raw units
MeanBias_RawUnits_LogTransform <- 100 * (exp(mean(ValidityDataset$PracMinusCrit_LogTransform) / 100) - 1)
# Mean bias as a %
MeanBias_Percentage_LogTransform <- 100 * exp(MeanBias_RawUnits_LogTransform / 100) - 100
# Mean bias as a factor
MeanBias_Factor_LogTransform <- MeanBias_Percentage_LogTransform / 100 + 1
# Mean bias, standardised
MeanBias_Standardized_LogTransform <- log(1 + MeanBias_RawUnits_LogTransform / 100) /
  log(1 + Criterion_SD_CV / 100)

The overall mean bias comes out at 6.5%. Now, extract the TEE as a CV (%) and the resulting Pearson correlation.

# Extract the typical error of the estimate from the log-transformed model
TEE_CV <- summary(LogModel_Validity)$sigma
TEE_CVPercentage <- 100 * (exp(TEE_CV / 100) - 1)
TEE_Factor <- exp(TEE_CV / 100)
TEE_Standardised <- log(TEE_CVPercentage / 100 + 1) / log(Criterion_SD_CV / 100 + 1)
# Calculate the Pearson correlation
PearsonCorrelation_LogTransform <- cor(ValidityDataset$Criterion_LogTransform,
                                       ValidityDataset$Practical_LogTransform,
                                       method = "pearson")

The TEE as a CV is 11.1% and, when standardised, 0.97, which can be interpreted as moderate on the Hopkins modified Cohen scale. A Pearson correlation of 0.30 is also achieved.

After all our coding, plotting and analysis, would you use the GPS system to measure peak velocity?

Introduction to R and A Basic Analysis of Athlete Load

R is a free, open-source programming language and software environment for statistical computing and graphics. Learning how to use R for data analysis and visualisation purposes can be a daunting task. However, there are a number of free online resources to guide basic analysis and troubleshoot where possible. These include:

  • Cookbook for R: An online guide to provide solutions for common tasks and problems when analysing data in R
  • R Tutorial: An introduction to statistics that explains basic concepts in R
  • Quick-R: A website that assists with data input, management and statistics in R
  • R-bloggers: A news and information site that pulls together blog posts on R
  • Stack Overflow: A question and answer forum on all things code, statistics and plotting

The above websites are a fantastic resource for getting started with basic analysis in R. To complement them, I have constructed a basic guide for sport science and physiology users, using athlete load as an example. Personally, I prefer to work in RStudio, which provides a free, friendly interface to run code and view plots.

To start, run the following code to load a .csv or .txt file into R and name the file as “RawLoadData”. You will need to substitute the file location below with your own.

# Read a .csv file into R (read.csv assumes a header row by default)
RawLoadData <- read.csv("/Volumes/Research/Thesis/Manuscripts/AthleteLoadData.csv")
# Read a .txt file into R (header = TRUE assumes the first row contains column names)
RawLoadData <- read.table("/Volumes/Research/Thesis/Manuscripts/AthleteLoadData.txt", header = TRUE)

R uses data.frames and matrices to store data. The difference between the two is that a matrix requires all elements to be of the same class (numeric or character, for example), whereas a data.frame allows a mixture. You can switch between the two using as.data.frame and as.matrix, although be aware that if you convert a data.frame with columns of different classes, all elements will be coerced to character in the matrix. To create a data.frame or matrix, use the following code:

# Create a data.frame
RawLoadData <- as.data.frame(RawLoadData)
# Create a matrix
RawLoadData <- as.matrix(RawLoadData)
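
A quick demonstration of the coercion described above, using a toy data.frame (the object names are illustrative):

# A mixed-class data.frame coerced to a matrix becomes all character
ToyFrame <- data.frame(Athlete = c("Mia", "Alfie"), Load = c(55.2, 63.8))
ToyMatrix <- as.matrix(ToyFrame)
class(ToyMatrix[1, "Load"]) # "character" - the numeric column was coerced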

To create a data.frame of dummy athlete load data collected over seven days, use the code below.

# Create a list of athlete names
Athletes <- c("Charles", "Mia", "Alfie", "Sophie")
# Call out the constants
NumberOfAthletes <- 4
DaysOfLoad <- 7
set.seed(28)
# Create the data.frame
RawLoadData <- data.frame(Athletes = rep(Athletes, DaysOfLoad),
                          Day = rep(1:DaysOfLoad, each = NumberOfAthletes),
                          Load = runif(NumberOfAthletes * DaysOfLoad, min = 0, max = 100))

The structure of a data.frame can be accessed by typing and running the code below. This allows us to see that the dataset consists of athlete names (a factor), days (integers) and load (a numeric variable).

str(RawLoadData)

Columns of a data.frame can be viewed by typing and running RawLoadData$Athletes; however, this command will not work for a matrix (see the bracket-indexing sketch after the code below). To create a new column and add it to our existing data.frame, such as the playing position of each athlete, type and run the following code:

# To create a new column
RawLoadData$PlayingPosition <- c("Midcourt")
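
For a matrix, columns are instead accessed with bracket indexing; a quick sketch, assuming a matrix copy of the data:

# Bracket indexing retrieves a named column from a matrix
RawLoadMatrix <- as.matrix(RawLoadData)
RawLoadMatrix[, "Athletes"]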

Summary statistics of grouped data can easily be calculated with assistance from the plyr package, which will need to be installed into R prior to first use. To calculate the mean and SD of load for each day, purely as an example, use the following:

# Load the required package
require(plyr)
# To calculate the mean and SD for load, across each day
SummaryLoadData = ddply(RawLoadData, c("Day"), summarise,
                        Mean = mean(Load),
                        SD = sd(Load))

The above data can then be plotted with the ggplot2 package, which first needs to be installed into R. Use the code below to visualise the mean data:

# Load the required package
require(ggplot2)
# Basic plot of the mean and SD of load over a seven day period
ggplot(SummaryLoadData, aes(x = Day, y = Mean)) +
geom_bar(stat = "identity")

[Figure: basic barplot of mean load across the seven days]

A few tweaks will deliver a much more visually pleasing plot. To have a white background, coloured bars, clearer axis labels and bold tick marks and text, use the following code:

# A few tweaks to create a neater plot
ggplot(SummaryLoadData, aes(x = Day, y = Mean, fill = factor(Day))) +
  geom_bar(stat = "identity") +
  ylab("Average Load (AU)\n") +
  xlab("\nDay") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 80)) +
  scale_x_continuous(breaks = c(1:7)) +
  theme_classic() +
  theme(legend.position = "none",
        axis.line = element_line(colour = "black", size = .75, linetype = "solid"),
        axis.title.x = element_text(face = "bold", size = 15),
        axis.title.y = element_text(face = "bold", size = 15),
        axis.text.y = element_text(face = "bold"),
        axis.text.x = element_text(face = "bold"),
        axis.ticks = element_line(size = .5))

[Figure: neater barplot of mean load with white background and formatted axes]

To plot individual load data, use the following code:

# Individual responses
ggplot(RawLoadData, aes(x = Day, y = Load, fill = factor(Athletes))) +
  geom_bar(stat = "identity") +
  ylab("Load (AU)\n") +
  xlab("\nDay") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  scale_x_continuous(breaks = c(1:7)) +
  theme_classic() +
  theme(strip.text.x = element_text(size = 12, face = "bold"),
        strip.background = element_rect(colour = "black", size = 1.5),
        legend.position = "none",
        axis.line = element_line(colour = "black", size = .75, linetype = "solid"),
        axis.title.x = element_text(face = "bold", size = 12),
        axis.title.y = element_text(face = "bold", size = 12),
        axis.text.y = element_text(face = "bold"),
        axis.text.x = element_text(face = "bold"),
        axis.ticks = element_line(size = .5)) +
  facet_wrap(~ Athletes)

[Figure: individual athlete load data, faceted by athlete]

To overlay the average load over each individual’s data, use the following code:

# Extend the summary data.frame so each day's mean is repeated for all four athletes
# (this relies on both data.frames being ordered by Day)
SummaryLoadData <- SummaryLoadData[rep(seq_len(nrow(SummaryLoadData)), each = 4), ]
# Add the mean column to the RawLoadData frame
RawLoadData$Mean <- SummaryLoadData$Mean
# To plot individual athlete load data
ggplot(RawLoadData, aes(x = Day, y = Load, fill = factor(Athletes))) +
  geom_bar(stat = "identity") +
  geom_line(aes(y = Mean, x = Day), color = "Black", size = 1) +
  geom_point(aes(y = Mean, x = Day), color = "Black", size = 1.5) +
  ylab("Load (AU)\n") +
  xlab("\nDay") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  scale_x_continuous(breaks = c(1:7)) +
  theme_classic() +
  theme(strip.text.x = element_text(size = 12, face = "bold"),
        strip.background = element_rect(colour = "black", size = 1.5),
        legend.position = "none",
        axis.line = element_line(colour = "black", size = .75, linetype = "solid"),
        axis.title.x = element_text(face = "bold", size = 12),
        axis.title.y = element_text(face = "bold", size = 12),
        axis.text.y = element_text(face = "bold"),
        axis.text.x = element_text(face = "bold"),
        axis.ticks = element_line(size = .5)) +
  facet_wrap(~ Athletes)

[Figure: individual athlete load data with the group mean overlaid as a line]

The above is only a small introduction to R’s analysis and visualising capabilities. How do you analyse and present athlete load data?

Analysing and Visualising Repeated Measures Data

Scientists working in exercise physiology often design experiments containing repeated measurements on different athletes or participants over the course of time.

One example, in a sport science setting, is the monitoring of a team-sport athlete’s response to training and competition loads. The countermovement jump (CMJ) is used to monitor an athlete’s neuromuscular status. A CMJ will often be performed by an athlete prior to training or a match, and monitored over the course of a week, tournament or season.

To show how repeated measures data can be analysed and visualised in R, I have created a (hypothetical) example of different athletes performing two trials of a CMJ at two different times of the day, monitored over a three-day period. I have chosen Peak Velocity as my dependent variable purely for display purposes. A useful variable for monitoring the neuromuscular status of Australian Rules athletes appears to be Flight Time:Contraction Time.

Athletes = c("Gus", "Hudson", "Bobby", "Tom", "Jessie")
# CMJ performed at two different time points
TimeOfDay = c("AM", "PM")
# Set the start date of data collection
StartDate = as.Date("2016-02-01")
# Set the seed, to reuse the same set of random variables
set.seed(60)
# Create a data.frame containing dummy raw CMJ data
CMJRawData = data.frame(Name = rep(Athletes, each = 4),
                        Day = rep(weekdays(StartDate + 0:2), each = 20),
                        Trial = as.numeric(rep(1:2, each = 1)),
                        TimeOfDay = rep(TimeOfDay, each = 2),
                        PeakVelocity = runif(60, 1.5, 2.8))

Plots can be created in R using the base packages; however, I prefer to use ggplot2 due to its easy-to-follow syntax and ability to create complex figures in a visually pleasing manner. This package will need to be installed into R prior to loading.

# Load the required ggplot2 package
require(ggplot2)
# Create a basic box and whisker plot to visualise Peak Velocity
ggplot(data = CMJRawData, aes(x = Day, y = PeakVelocity)) +
 geom_boxplot(aes(fill = TimeOfDay))

[Figure: basic box and whisker plot of peak velocity by day and time of day]

The above plot is OK; however, I personally prefer a cleaner plotting background, emphasised ticks and correct axis labels. The code below creates a much more visually pleasing plot that can be used for presentation or scientific purposes.

# Create a neater looking plot with correct scientific notation
ggplot(data = CMJRawData, aes(x = Day, y = PeakVelocity)) +
  geom_boxplot(aes(fill = TimeOfDay)) +
  ylab(expression(Peak ~ Velocity ~ (m.s^-1))) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 4)) +
  theme_classic() +
  theme(legend.title = element_blank(),
        legend.text = element_text(size = 13),
        strip.text.x = element_text(size = 15, face = "bold"),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_text(color = "black", size = 15, vjust = 1.5),
        axis.line.y = element_line(colour = "black"),
        legend.position = "bottom") +
  facet_wrap(~ Day, scales = "free_x")

[Figure: clearer box and whisker plot of peak velocity, faceted by day]

I find the above plot much easier to read. Labels or ticks can be emphasised by including face = "bold" where appropriate. The y-axis scale can also be adjusted to zoom in on the figure and lose the white space by simply configuring the line:

scale_y_continuous(expand = c(0, 0), limits = c(0, 4))
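
For example, to zoom in on the observed range of values (the limits here are an illustrative choice):

scale_y_continuous(expand = c(0, 0), limits = c(1, 3.5))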

To obtain summary statistics for a repeated measures dataset, install the plyr package into R and run the following code. Other statistics, such as the SE, can also be obtained; base R has no built-in se() function, so the SE must be computed from the SD, as sketched after the code below.

# Load the required package
require(plyr)
# Obtain the Mean and SD for Peak Velocity, over each day and time of day
PeakVelocity = ddply(CMJRawData, c("Day", "TimeOfDay"), summarise,
                     Mean = mean(PeakVelocity),
                     SD = sd(PeakVelocity))
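
As a sketch, the SE can be computed from the SD within the same summarise call (the object name PeakVelocitySE is my own):

# Obtain the Mean and SE for Peak Velocity, over each day and time of day
PeakVelocitySE = ddply(CMJRawData, c("Day", "TimeOfDay"), summarise,
                       Mean = mean(PeakVelocity),
                       SE = sd(PeakVelocity) / sqrt(length(PeakVelocity)))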

Of interest to many scientists and practitioners is the individual response to training. This can be calculated and plotted using the code below, to track athletes over time. Note: I have calculated a mean Peak Velocity measure from the two trials at each time of day.

# Calculate the Mean and SD for Peak Velocity, for each athlete, over each day and time of day
PeakVelocityIR = ddply(CMJRawData, c("Day", "TimeOfDay", "Name"), summarise,
                       Mean = mean(PeakVelocity),
                       SD = sd(PeakVelocity))
# Plot individual responses across time for each day
ggplot(data = PeakVelocityIR, aes(x = TimeOfDay, y = Mean, colour = Name, group = Name)) +
  geom_point() +
  geom_line() +
  ylab(expression(Mean ~ Peak ~ Velocity ~ (m.s^-1))) +
  xlab("\nTime of Day") +
  scale_y_continuous(expand = c(0, 0), limits = c(1.5, 2.75)) +
  theme_classic() +
  theme(legend.text = element_text(size = 13),
        strip.text.x = element_text(size = 15, face = "bold"),
        axis.title.y = element_text(color = "black", size = 15, vjust = 1.5),
        axis.title.x = element_text(color = "black", size = 15),
        axis.line.y = element_line(colour = "black")) +
  facet_wrap(~ Day, scales = "free_x")

[Figure: individual responses across time of day, faceted by day]

Individual responses can also be displayed with a mean or group average overlay. This is displayed below.

# Create a new column, called Name, for plotting the group average
PeakVelocity$Name = c("Average")
# Plot the individual responses plus the mean
ggplot(PeakVelocity, aes(x = TimeOfDay, y = Mean, group = Name)) +
  geom_point(colour = "black", size = 4) +
  geom_line(colour = "black") +
  geom_point(data = PeakVelocityIR, aes(x = TimeOfDay, y = Mean,
                                        group = Name, colour = Name)) +
  geom_line(data = PeakVelocityIR, aes(x = TimeOfDay, y = Mean,
                                       group = Name, colour = Name)) +
  ylab(expression(Mean ~ Peak ~ Velocity ~ (m.s^-1))) +
  xlab("\nTime of Day") +
  scale_y_continuous(expand = c(0, 0), limits = c(1.5, 2.75)) +
  theme_classic() +
  theme(legend.text = element_text(size = 13),
        strip.text.x = element_text(size = 15, face = "bold"),
        axis.title.y = element_text(color = "black", size = 15, vjust = 1.5),
        axis.title.x = element_text(color = "black", size = 15),
        axis.line.y = element_line(colour = "black")) +
  facet_wrap(~ Day, scales = "free_x")

[Figure: individual responses with the group average overlaid, faceted by day]

The above is only a sample of how repeated measures data can be visualised. What methods do you use to display repeated measures data? How do you clearly communicate individual responses?