R Parallel with Genomic Prediction Application
Material developed for a workshop on R Parallel with Genomic Selection Application for the Genomic Selection Studies group at the University of Florida, Gainesville, FL. If you find any issues with it or have suggestions, please let me know. Here, we show how to run a cross-validation analysis for genomic prediction purposes in parallel, with R and on an HPC. The dataset was adapted from Endelman, Jeffrey B., et al. "Genetic variance partitioning and genome-wide prediction with allele dosage information in autotetraploid potato." Genetics (2018): genetics-300685. The objective is to compare the predictive ability of two different methodologies (Ridge Regression and Bayesian Lasso) using the BGLR package. The SolData.Rdata file used in this tutorial can be found here.
Data Set
load("data/SolData.Rdata")
ls()
## [1] "clones" "M" "y"
str(clones); str(y); str(M)
## chr [1:549] "MSZ194-2" "W14184-5" "MSP239-1" "W9632-1" "AF4648-2" ...
## num [1:549] 52.3 48 52.3 54 62.8 ...
## int [1:549, 1:3895] 3 3 2 2 1 1 2 2 3 2 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:549] "MSZ194-2" "W14184-5" "MSP239-1" "W9632-1" ...
## ..$ : chr [1:3895] "solcap_snp_c1_1" "solcap_snp_c1_1000" "solcap_snp_c1_10000" "solcap_snp_c1_10001" ...
y is the response variable, M is the molecular marker matrix, and clones are the names of the genotypes.
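Before splitting the data, a quick sanity check (a suggested addition, not in the original material) confirms the three objects are aligned, assuming the marker matrix rows are named by clone as the str() output suggests:
stopifnot(length(y) == nrow(M))           # one phenotype per genotype
stopifnot(identical(rownames(M), clones)) # marker rows match the clone names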
Creating training and testing populations
Here we randomly split the dataset into 5 groups (1 to 5) to follow a 5-fold cross-validation scheme. Hereafter, trn refers to the training population and tst to the testing population.
tst.pop <- rep(1:5, length.out = 549)            # fold labels 1 to 5, one per genotype
set.seed(1234)                                   # for reproducibility
tst.pop <- sample(tst.pop, 549, replace = FALSE) # shuffle the fold assignments
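A quick check (a suggested addition) that the five folds are nearly balanced:
table(tst.pop)
## tst.pop
##   1   2   3   4   5
## 110 110 110 110 109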
Evaluating without a loop for k=1
Due to time constraints, we will run all the analyses with a very low number of iterations/burn-ins.
library(BGLR)
pa <- rep(NA,2) #vector to collect the predictive ability
rmse <- rep(NA,2) #vector to collect the error (as computed below, this is the MSE; take sqrt() for the RMSE)
start.time <- Sys.time()
y.trn <- y
y.trn[tst.pop==1] <- NA
## Ridge Regression
ETA_1 <- list(PED1=list(X=M, model="BRR",saveEffects=FALSE))
fit <- BGLR(y.trn, ETA=ETA_1,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_1"))
pa[1] <- cor(fit$yHat[tst.pop==1],y[tst.pop==1])
rmse[1] <- mean((fit$yHat[tst.pop==1]-y[tst.pop==1])^2)
## Bayesian Lasso
ETA_2 <- list(PED1=list(X=M, model="BL",saveEffects=FALSE))
fit <- BGLR(y.trn, ETA=ETA_2,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_2"))
pa[2] <- cor(fit$yHat[tst.pop==1],y[tst.pop==1])
rmse[2] <- mean((fit$yHat[tst.pop==1]-y[tst.pop==1])^2)
end.time <- Sys.time()
(time.taken <- end.time - start.time)
## Time difference of 5.254325 secs
Evaluating with a for loop for all k’s
pa <- matrix(NA,2,5) #matrix to collect the predictive ability
rmse <- matrix(NA,2,5) #matrix to collect the error (as computed below, this is the MSE; take sqrt() for the RMSE)
time <- rep(NA,5) #vector to collect time taken
## Building models outside the loop
ETA_1 <- list(PED1=list(X=M, model="BRR",saveEffects=FALSE))
ETA_2 <- list(PED1=list(X=M, model="BL",saveEffects=FALSE))
for(k in 1:5){
start.time <- Sys.time()
y.trn <- y
y.trn[tst.pop==k] <- NA
fit <- BGLR(y.trn, ETA=ETA_1,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_1_",k))
pa[1,k] <- cor(fit$yHat[tst.pop==k],y[tst.pop==k])
rmse[1,k] <- mean((fit$yHat[tst.pop==k]-y[tst.pop==k])^2)
## Bayesian Lasso
fit <- BGLR(y.trn, ETA=ETA_2,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_2_",k))
pa[2,k] <- cor(fit$yHat[tst.pop==k],y[tst.pop==k])
rmse[2,k] <- mean((fit$yHat[tst.pop==k]-y[tst.pop==k])^2)
end.time <- Sys.time()
time[k] <- as.numeric(difftime(end.time, start.time, units = "secs")) # force seconds
}
print(pa)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.3401508 0.3769742 0.2584141 0.1628996 0.1822812
## [2,] 0.3392227 0.3157449 0.1920118 0.2049281 0.1780244
print(rmse)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 142.5198 174.0918 156.8485 165.2986 138.5694
## [2,] 143.1469 176.6930 171.2460 166.5368 146.3603
print(time)
## [1] 3.341892 3.327272 3.142947 3.033428 3.065488
sum(time)
## [1] 15.91103
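To compare the two methods across folds (a suggested addition), average the per-fold statistics; row 1 is Ridge Regression, row 2 the Bayesian Lasso:
rowMeans(pa)   # mean predictive ability per model
## [1] 0.2641440 0.2459864
rowMeans(rmse) # mean squared error per model
## [1] 155.4656 160.7966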
Evaluating with a lapply loop for all k’s
## Building models outside the loop
ETA_1 <- list(PED1=list(X=M, model="BRR",saveEffects=FALSE))
ETA_2 <- list(PED1=list(X=M, model="BL",saveEffects=FALSE))
## Index data.frame to loop
index <- expand.grid(k=1:5,ETA=1:2)
x <- 1:nrow(index)
## Creating a function to put in the loop
fit.BGLR <- function(x){
## Collecting Loop Info
k=index$k[x]
ind.eta = index$ETA[x]
## Which ETA
if(index$ETA[x]==1){
ETA=ETA_1
}else{
ETA=ETA_2
}
## Which k-fold
y.trn <- y
y.trn[tst.pop==k] <- NA
fit <- BGLR(y.trn, ETA=ETA,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_",ind.eta,"_",k))
pa <- cor(fit$yHat[tst.pop==k],y[tst.pop==k])
rmse <- mean((fit$yHat[tst.pop==k]-y[tst.pop==k])^2)
return(data.frame(k,ind.eta,pa,rmse))
}
## With lapply function
system.time(
output <- lapply(x,fit.BGLR)
)
## user system elapsed
## 15.688 0.320 16.056
## Basically the same time as the for loop
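Since fit.BGLR() returns a one-row data.frame, the list can be stacked into a single results table (a suggested addition):
results <- do.call(rbind, output) # one row per model-by-fold combination
head(results)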
Evaluating with a parLapply loop for all k’s
Differences from lapply: you need to load the packages inside the function, export all pertinent objects to the worker nodes, and the function's return value (here, the printed data.frame) is what gets collected into the output list.
## Building models outside the loop
ETA_1 <- list(PED1=list(X=M, model="BRR",saveEffects=FALSE))
ETA_2 <- list(PED1=list(X=M, model="BL",saveEffects=FALSE))
## Index data.frame to loop
index <- expand.grid(k=1:5,ETA=1:2)
x <- 1:nrow(index)
## Creating a function to put in the loop
fit.BGLR <- function(x){
## You need to call the library inside the function
library(BGLR)
## Collecting Loop Info
k=index$k[x]
ind.eta = index$ETA[x]
## Which ETA
if(index$ETA[x]==1){
ETA=ETA_1
}else{
ETA=ETA_2
}
## Which k-fold
y.trn <- y
y.trn[tst.pop==k] <- NA
fit <- BGLR(y.trn, ETA=ETA,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_",ind.eta,"_",k))
pa <- cor(fit$yHat[tst.pop==k],y[tst.pop==k])
rmse <- mean((fit$yHat[tst.pop==k]-y[tst.pop==k])^2)
print(data.frame(k,ind.eta,pa,rmse))
}
## With parLapply function
library(parallel)
## 2 cores
no_cores <- 2
cl <- makeCluster(no_cores)
clusterExport(cl=cl,varlist = list("y","index","ETA_1","ETA_2","tst.pop"))
system.time(
out <- parLapply(cl, x, fit.BGLR)
)
## user system elapsed
## 0.005 0.000 11.456
stopCluster(cl)
If you want to save results to disk as the parallel workers finish, add this line before the end of the function (Linux/macOS only):
system(paste0("echo '",paste(k,ind.eta,pa,rmse),"' >> output.txt"))
It is good practice to always leave one CPU free; to do this automatically in a Slurm environment:
no_cores <- as.integer(Sys.getenv("SLURM_CPUS_ON_NODE"))-1
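If the same script may also run interactively (where SLURM_CPUS_ON_NODE is unset), a defensive variant is possible; this fallback is an assumption, not part of the original workshop material:
no_cores <- as.integer(Sys.getenv("SLURM_CPUS_ON_NODE", unset = NA)) # NA outside Slurm
if (is.na(no_cores)) no_cores <- parallel::detectCores()             # assumed fallback for interactive use
no_cores <- max(1, no_cores - 1)                                     # always leave one CPU free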
Slurm file example in the HPC (with HPG limits)
#!/bin/bash
#SBATCH --job-name=MyJob # Job name
#SBATCH --mail-type=END,FAIL # Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=myemail@domain.com # Where to send mail
#SBATCH --nodes=1 # Run on a single node
#SBATCH --ntasks=1 # Run on a single task (single R file)
#SBATCH --cpus-per-task=10 # number of threads per node (MAX=32)
#SBATCH --mem=128gb # Job memory request per node (MAX=128gb)
#SBATCH --time=96:00:00 # Time limit hrs:min:sec (MAX=96h)
#SBATCH --output=%j.log # Standard output and error log
#SBATCH --account=user
#SBATCH --qos=user-b
date
module load R/3.5.1
Rscript MyScript.R
date
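The script can then be submitted with sbatch MyJob.slurm (assuming it is saved under that name) and monitored with squeue -u <your_username>.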
A way to handle unexpected errors in the loop with try() without crashing the loop
Here, ETA_3 is a misspecified model that makes the BGLR() function throw an error. In this case, the try() function returns a try-error object instead of stopping execution. In the fit.BGLR() function below, we wrap the BGLR() call in try(): if it errors, we return NA values in our output; if not, we keep going and compute the statistics.
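As a generic illustration of the mechanism (a suggested addition, independent of BGLR):
res <- try(log("a"), silent = TRUE) # log() on a character throws an error
class(res)                          # "try-error"
inherits(res, "try-error")          # TRUE; a loop can branch on this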
## Building models outside the loop
ETA_1 <- list(PED1=list(X=M, model="BRR",saveEffects=FALSE))
ETA_2 <- list(PED1=list(X=M, model="BL",saveEffects=FALSE))
## Creating ETA_3 as an erroneous model
fit <- NULL
ETA_3 <- "error" # not a valid ETA specification
## Without try(), this call stops execution with an error:
fit <- BGLR(y.trn, ETA=ETA_3,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_3"))
## With try(), execution continues (the error message is still printed):
fit <- NULL
fit <- try(BGLR(y.trn, ETA=ETA_3,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_3")))
class(fit)
## Adding silent = TRUE suppresses the error message:
fit <- try(BGLR(y.trn, ETA=ETA_3,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_3")),silent = TRUE
)
class(fit)
## Index data.frame to loop
index <- expand.grid(k=1:5,ETA=1:3)
x <- 1:nrow(index)
## Creating a function to put in the loop
fit.BGLR <- function(x){
## You need to call the library inside the function
library(BGLR)
## Collecting Loop Info
k=index$k[x]
ind.eta = index$ETA[x]
pa=NA
rmse=NA
## Which ETA
if(index$ETA[x]==1)
ETA=ETA_1
if(index$ETA[x]==2)
ETA=ETA_2
if(index$ETA[x]==3)
ETA=ETA_3
## Which k-fold
y.trn <- y
y.trn[tst.pop==k] <- NA
fit <- try(BGLR(y.trn, ETA=ETA,
nIter = 100, burnIn = 50,thin = 5,
verbose = FALSE, saveAt = paste0("output/model_",ind.eta,"_",k)))
if(inherits(fit, "try-error")){ # the fit failed: return NA values
print(data.frame(k,ind.eta,pa,rmse))
}else{
pa <- cor(fit$yHat[tst.pop==k],y[tst.pop==k])
rmse <- mean((fit$yHat[tst.pop==k]-y[tst.pop==k])^2)
print(data.frame(k,ind.eta,pa,rmse))
}
}
## With parLapply function
library(parallel)
## 2 cores
no_cores <- 2
cl <- makeCluster(no_cores)
clusterExport(cl=cl,varlist = list("y","index","ETA_1","ETA_2","ETA_3","tst.pop"))
system.time(
out <- parLapply(cl, x, fit.BGLR)
)
stopCluster(cl)
print(out)
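To summarize the successful runs per model while dropping the NA rows from the failed ETA_3 fits (a suggested addition):
results <- do.call(rbind, out) # stack the per-run data.frames
aggregate(cbind(pa, rmse) ~ ind.eta, data = results, FUN = mean)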
RAM Memory Tips
- R is really bad at RAM memory management!
- In each loop, always try to load the lowest possible amount of data to save RAM.
- In the R environment, always prefer the RData format for outputs and inputs.
- In our example, if a big M is used with several models, one option is to save each model in an RData file and load the correct one with load(paste0(...)) in each loop iteration (see the sketch below). This saves memory and lets you use more CPUs in the process.
- When setting up your parallel process, debug the code! Check that everything is correct. In our example, a good procedure is to first run it with a partial dataset and a low number of iterations to check that everything works. It is really frustrating to send a process, wait days, and then realize there were bugs in the code.
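A minimal sketch of that save-one-model-per-file idea (the file names and the get() lookup are illustrative assumptions):
## Save each ETA once, outside the loop, then free its memory:
ETA_1 <- list(PED1=list(X=M, model="BRR", saveEffects=FALSE))
save(ETA_1, file = "output/ETA_1.RData")
rm(ETA_1)
ETA_2 <- list(PED1=list(X=M, model="BL", saveEffects=FALSE))
save(ETA_2, file = "output/ETA_2.RData")
rm(ETA_2)
## Inside fit.BGLR(), load only the model needed for the current run:
load(paste0("output/ETA_", ind.eta, ".RData")) # restores ETA_1 or ETA_2
ETA <- get(paste0("ETA_", ind.eta))            # bind it to the common name used by BGLR()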