Model-evaluation (ML)

A. SummarizedExperiment (SE)

Convert raw data from the SE S4 class to an mlr “task”

# Load the Golub_Merge ExpressionSet from the golubEsets package.
data(Golub_Merge, package = "golubEsets")
# Keep only features (rows) 200 to 259 to get a small example object.
smallG <- Golub_Merge[200:259, ]
smallG
#> ExpressionSet (storageMode: lockedEnvironment)
#> assayData: 60 features, 72 samples 
#>   element names: exprs 
#> protocolData: none
#> phenoData
#>   sampleNames: 39 40 ... 33 (72 total)
#>   varLabels: Samples ALL.AML ... Source (11 total)
#>   varMetadata: labelDescription
#> featureData: none
#> experimentData: use 'experimentData(object)'
#>   pubMedIds: 10521349 
#> Annotation: hu6800

library(SummarizedExperiment)
# Convert the ExpressionSet into a SummarizedExperiment.
smallG_SE <- makeSummarizedExperimentFromExpressionSet(smallG)

# Functional task: classification of ALL.AML with "ALL" as the positive class.
# Fun_SE_to_taskFunc() will work with either one or multiple assays.
task_SE_Functional <- Fun_SE_to_taskFunc(
  smallG_SE,
  param.Y.name = "ALL.AML",
  param.covariates = NULL,
  param_positive_y_level = "ALL",
  task_return_format = "functional",
  task_type = "classif"
)
task_SE_Functional
#> Supervised task: DF_functionals
#> Type: classif
#> Target: ALL.AML
#> Observations: 72
#> Features:
#>    numerics     factors     ordered functionals 
#>           0           0           0           1 
#> Missings: FALSE
#> Has weights: FALSE
#> Has blocking: FALSE
#> Has coordinates: FALSE
#> Classes: 2
#> ALL AML 
#>  47  25 
#> Positive class: ALL


# Non-functional alternatives:
## 1. Extract the data directly from the functional task, as a data.frame.
##    functionals.as = "dfcols" spreads the functional feature over separate
##    numeric columns (exprs.<feature>), as the str() output shows.
extracted_DF_from_task_SE <- getTaskData(
  task_SE_Functional,
  functionals.as = "dfcols"
)
str(extracted_DF_from_task_SE[, 1:10])
#> 'data.frame':    72 obs. of  10 variables:
#>  $ ALL.AML        : Factor w/ 2 levels "ALL","AML": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ exprs.D13627_at: num  330 544 978 1035 3895 ...
#>  $ exprs.D13628_at: num  229 147 110 237 106 256 144 84 -7 -3 ...
#>  $ exprs.D13630_at: num  359 289 609 485 866 663 673 401 480 273 ...
#>  $ exprs.D13633_at: num  -9 57 207 302 475 0 112 257 244 252 ...
#>  $ exprs.D13634_at: num  115 248 91 58 244 245 98 182 186 241 ...
#>  $ exprs.D13635_at: num  31 -43 40 31 84 -159 -7 -2 62 111 ...
#>  $ exprs.D13636_at: num  195 23 -60 317 449 -262 386 295 177 51 ...
#>  $ exprs.D13637_at: num  161 137 -94 -96 432 -535 136 86 99 143 ...
#>  $ exprs.D13639_at: num  456 3336 655 2771 3575 ...


## 2. Request the non-functional format up front:
##    Fun_SE_to_taskFunc(..., task_return_format = 'dfcols').
##    Will work with either one or multiple assays.
task_SE_NON_Functional <- Fun_SE_to_taskFunc(
  smallG_SE,
  param.Y.name = "ALL.AML",
  param.covariates = NULL,
  param_positive_y_level = "ALL",
  task_return_format = "dfcols",
  task_type = "classif"
)

## 3. Convert an already-built functional task:
##    functional_to_NonFunctional_task_function(task_functional)
task_SE_NON_Functional_alt <- functional_to_NonFunctional_task_function(
  task_SE_Functional
)



## 4. designated function ## TBA
# extracted = extractFDAFeatures(task_SE_Functional, feat.methods = list("exprs" = all))

Single assay ML demonstration

Direct

library(class)
# knn() expects samples in rows, so the expression matrix is transposed.
# Samples 1:40 form the training set; the remaining samples form the test set.
smallG_train <- t(exprs(smallG)[, 1:40])
smallG_test <- t(exprs(smallG)[, -(1:40)])
knn_pred <- knn(
  train = smallG_train,
  test = smallG_test,
  cl = smallG$ALL.AML[1:40],
  k = 1,
  prob = TRUE
)
# Cross-tabulate the true test labels (rows) against the predictions (columns).
table(smallG$ALL.AML[-(1:40)], knn_pred)
#>      knn_pred
#>       ALL AML
#>   ALL  18   3
#>   AML   4   7

MLInterfaces

library(MLInterfaces)
#> Warning: package 'MLInterfaces' was built under R version 3.5.1
#> Warning: package 'XML' was built under R version 3.5.2
# Same k = 1 nearest-neighbour fit through MLInterfaces, applied directly to
# the ExpressionSet; samples 1:40 are used as the training indices.
krun <- MLearn(
  formula = ALL.AML ~ .,
  data = smallG,
  .method = knnI(k = 1),
  trainInd = 1:40
)
krun
#> MLInterfaces classification output container
#> The call was:
#> MLearn(formula = ALL.AML ~ ., data = smallG, .method = knnI(k = 1), 
#>     trainInd = 1:40)
#> Predicted outcome distribution for test set:
#> 
#> ALL AML 
#>  22  10 
#> Summary of scores on test set (use testScores() method for details):
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>       1       1       1       1       1       1
# Confusion matrix of given vs. predicted classes for the MLearn result.
confuMat(krun)
#>      predicted
#> given ALL AML
#>   ALL  18   3
#>   AML   4   7

mlr

# Split the functional task by observation index: 1:40 train, 41:72 test.
task_train <- subsetTask(task_SE_Functional, subset = 1:40)
task_test <- subsetTask(task_SE_Functional, subset = 41:72)
# Fit mlr's knn classifier on the training task, then predict the test task.
classif.lrn <- makeLearner("classif.knn")
model <- train(classif.lrn, task_train)
Predict <- predict(model, task_test)
calculateConfusionMatrix(Predict)
#>         predicted
#> true     ALL AML -err.-
#>   ALL     18   3      3
#>   AML      4   7      4
#>   -err.-   4   3      7

B. MultiAssayExperiment (MAE)

Two data examples:
1. miniACC, balanced, without ‘dropouts’.
2. Customized, non-balanced, with ‘dropouts’.

Convert raw data from the MAE S4 class to an mlr “task”

1. miniACC

library(MultiAssayExperiment)
miniACC
#> A MultiAssayExperiment object of 5 listed
#>  experiments with user-defined names and respective classes. 
#>  Containing an ExperimentList class object of length 5: 
#>  [1] RNASeq2GeneNorm: SummarizedExperiment with 198 rows and 79 columns 
#>  [2] gistict: SummarizedExperiment with 198 rows and 90 columns 
#>  [3] RPPAArray: SummarizedExperiment with 33 rows and 46 columns 
#>  [4] Mutations: matrix with 97 rows and 90 columns 
#>  [5] miRNASeqGene: SummarizedExperiment with 471 rows and 80 columns 
#> Features: 
#>  experiments() - obtain the ExperimentList instance 
#>  colData() - the primary/phenotype DataFrame 
#>  sampleMap() - the sample availability DataFrame 
#>  `$`, `[`, `[[` - extract colData columns, subset, or experiment 
#>  *Format() - convert into a long or wide DataFrame 
#>  assays() - convert ExperimentList to a SimpleList of matrices
# miniACC %>% sampleMap %>% data.frame %>% dplyr::select(primary, assay) %>% table # no replicates within same assay

# Build a functional classification task from the MAE: the target is
# vital_status (positive class "1"), with gender and days_to_death as covariates.
# NOTE(review): in the glimpse output further down, days_to_death is NA where
# vital_status is 0, so this covariate may leak the outcome. Confirm it is intended.
task_Functional_MAE <- Fun_MAE_to_taskFunc(
  miniACC,
  param.Y.name = "vital_status",
  param.covariates = c("gender", "days_to_death"),
  param_positive_y_level = "1",
  task_type = "classif"
)
task_Functional_MAE
#> Supervised task: DF_functionals
#> Type: classif
#> Target: vital_status
#> Observations: 385
#> Features:
#>    numerics     factors     ordered functionals 
#>           1           5           0           5 
#> Missings: TRUE
#> Has weights: FALSE
#> Has blocking: FALSE
#> Has coordinates: FALSE
#> Classes: 2
#>   0   1 
#> 248 137 
#> Positive class: 1
# functionals.as = "matrix" keeps each functional feature as one matrix column.
extracted_DF_from_task_MAE_functionals <- getTaskData(
  task_Functional_MAE,
  functionals.as = "matrix"
)
glimpse(extracted_DF_from_task_MAE_functionals[, 1:10])
#> Observations: 385
#> Variables: 10
#> $ Unique_sample_id <fct> RNASeq2GeneNorm_TCGA-OR-A5J1_TCGA-OR-A5J1-01A...
#> $ assay            <fct> RNASeq2GeneNorm, RNASeq2GeneNorm, RNASeq2Gene...
#> $ primary          <fct> TCGA-OR-A5J1, TCGA-OR-A5J2, TCGA-OR-A5J3, TCG...
#> $ colname          <fct> TCGA-OR-A5J1-01A-11R-A29S-07, TCGA-OR-A5J2-01...
#> $ gender           <fct> male, female, female, male, female, female, m...
#> $ days_to_death    <int> 1355, 1677, NA, 365, NA, 490, 579, NA, 922, 5...
#> $ vital_status     <fct> 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, ...
#> $ RNASeq2GeneNorm  <dbl> <matrix[25 x 198]>
#> $ gistict          <dbl> <matrix[25 x 198]>
#> $ RPPAArray        <dbl> <matrix[25 x 33]>

# functionals.as = "dfcols" concatenates the functionals into ordinary columns,
# named <assay>.<feature> in the output shown.
extracted_DF_from_task_MAE_dfcols <- getTaskData(
  task_Functional_MAE,
  functionals.as = "dfcols"
)
glimpse(extracted_DF_from_task_MAE_dfcols[, 1:10])
#> Observations: 385
#> Variables: 10
#> $ Unique_sample_id       <fct> RNASeq2GeneNorm_TCGA-OR-A5J1_TCGA-OR-A5...
#> $ assay                  <fct> RNASeq2GeneNorm, RNASeq2GeneNorm, RNASe...
#> $ primary                <fct> TCGA-OR-A5J1, TCGA-OR-A5J2, TCGA-OR-A5J...
#> $ colname                <fct> TCGA-OR-A5J1-01A-11R-A29S-07, TCGA-OR-A...
#> $ gender                 <fct> male, female, female, male, female, fem...
#> $ days_to_death          <int> 1355, 1677, NA, 365, NA, 490, 579, NA, ...
#> $ vital_status           <fct> 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, ...
#> $ RNASeq2GeneNorm.DIRAS3 <dbl> 1487.0317, 9.6631, 18.9602, 760.6507, 1...
#> $ RNASeq2GeneNorm.MAPK14 <dbl> 778.5783, 2823.6469, 1061.7686, 806.351...
#> $ RNASeq2GeneNorm.YAP1   <dbl> 1009.6061, 2305.0590, 1561.2502, 713.40...

2. Customized

library(MultiAssayExperiment)

# Patient-level (primary) data: one row per patient, row names are patient IDs.
patient.data <- data.frame(
  sex = c("M", "F", "M", "F", "F"),
  age = 38:42,
  row.names = c("Jack", "Jill", "Bob", "Barbara", "Meg")
)

## Assay A: a 2 x 4 matrix wrapped in a SummarizedExperiment
arraydat <- matrix(
  seq(101, 108),
  ncol = 4,
  dimnames = list(
    c("ENST00000294241", "ENST00000355076"),
    c("array1", "array2", "array3", "array4")
  )
)
coldat <- data.frame(
  slope53 = rnorm(4),
  row.names = c("array1", "array2", "array3", "array4")
)
exprdat <- SummarizedExperiment(arraydat, colData = coldat)
# Map assay A columns to patients; Jill contributes two arrays.
exprmap <- data.frame(
  primary = c("Jill", "Jill", "Meg", "Barbara"),
  colname = c("array1", "array2", "array3", "array4"),
  stringsAsFactors = FALSE
)

## Assay B: a plain 2 x 5 matrix
methyldat <- matrix(
  1:10,
  ncol = 5,
  dimnames = list(
    c("ENST00000355076", "ENST00000383706"),
    c("methyl1", "methyl2", "methyl3", "methyl4", "methyl5")
  )
)
# Map assay B columns to patients; Jack contributes three samples.
methylmap <- data.frame(
  primary = c("Jack", "Jack", "Jack", "Meg", "Bob"),
  colname = c("methyl1", "methyl2", "methyl3", "methyl4", "methyl5"),
  stringsAsFactors = FALSE
)

# Assemble the MultiAssayExperiment from the two assays, the patient data and
# the per-assay maps combined into a single sample map.
myMultiAssay <- MultiAssayExperiment(
  experiments = list("A" = exprdat, "B" = methyldat),
  colData = patient.data,
  sampleMap = listToMap(list(A = exprmap, B = methylmap))
)
myMultiAssay
#> A MultiAssayExperiment object of 2 listed
#>  experiments with user-defined names and respective classes. 
#>  Containing an ExperimentList class object of length 2: 
#>  [1] A: SummarizedExperiment with 2 rows and 4 columns 
#>  [2] B: matrix with 2 rows and 5 columns 
#> Features: 
#>  experiments() - obtain the ExperimentList instance 
#>  colData() - the primary/phenotype DataFrame 
#>  sampleMap() - the sample availability DataFrame 
#>  `$`, `[`, `[[` - extract colData columns, subset, or experiment 
#>  *Format() - convert into a long or wide DataFrame 
#>  assays() - convert ExperimentList to a SimpleList of matrices
# Count samples per patient (primary) and assay. This example has replicates
# within the same assay and is non-balanced, with dropouts.
# select() is namespaced as dplyr::select because AnnotationDbi, which is
# attached in this session, also exports a select() generic that can mask it.
myMultiAssay %>%
  sampleMap %>%
  data.frame %>%
  dplyr::select(primary, assay) %>%
  table
#>          assay
#> primary   A B
#>   Barbara 1 0
#>   Bob     0 1
#>   Jack    0 3
#>   Jill    2 0
#>   Meg     1 1

# myMultiAssay %>% sampleMap %>% data.frame %>% filter(assay == 'A')
# myMultiAssay$sex

# Build a functional classification task from the customized MAE:
# the target is sex (positive class "M"), with no extra covariates.
task_Functional_MAE_customized <- Fun_MAE_to_taskFunc(
  myMultiAssay,
  param.Y.name = "sex",
  param.covariates = NULL,
  param_positive_y_level = "M",
  task_type = "classif"
)

Multi-assay ML demonstration

mlr: vertical integration

Unless the learner has a specific implementation for functional data, the task will be automatically converted into a standard (non-functional) task.
The bartMachine model was chosen only because it has built-in NA handling. Any other ‘learner’ from mlr could be demonstrated instead.


library(bartMachine)
# Create the bartMachine classification learner and fit it on the full
# multi-assay functional task.
classif_lrn_bartMachine <- makeLearner("classif.bartMachine")
model_bartMachine <- train(classif_lrn_bartMachine, task_Functional_MAE)
#> bartMachine initializing with 50 trees...
#> bartMachine vars checked...
#> bartMachine java init...
#> bartMachine factors created...
#> bartMachine before preprocess...
#> bartMachine after preprocess... 1868 total features...
#> warning: cannot use MSE of linear model for s_sq_y if p > n. bartMachine will use sample var(y) instead.
#> bartMachine sigsq estimated...
#> bartMachine training data finalized...
#> Now building bartMachine for classification ...Covariate importance prior ON. Missing data feature ON. 
#> evaluating in sample data...done
# Predictions are made on the same task the model was trained on, so the
# confusion matrix is an in-sample result, not a held-out evaluation.
Predict_bartMachine <- predict(model_bartMachine, task_Functional_MAE)
calculateConfusionMatrix(Predict_bartMachine)
#>         predicted
#> true       0   1 -err.-
#>   0      248   0      0
#>   1        0 137      0
#>   -err.-   0   0      0

TBA: built-in CPOs / see detailed vignette.

Session information

sessionInfo()
#> R version 3.5.0 (2018-04-23)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 14393)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] bartMachine_1.2.4.2         missForest_1.4             
#>  [3] itertools_0.1-3             iterators_1.0.10           
#>  [5] foreach_1.4.4               randomForest_4.6-14        
#>  [7] car_3.0-0                   carData_3.0-1              
#>  [9] bartMachineJARs_1.1         rJava_0.9-10               
#> [11] bindrcpp_0.2.2              MultiAssayExperiment_1.6.0 
#> [13] MLInterfaces_1.62.0         cluster_2.0.7-1            
#> [15] annotate_1.58.0             XML_3.98-1.16              
#> [17] AnnotationDbi_1.42.1        class_7.3-14               
#> [19] SummarizedExperiment_1.10.1 DelayedArray_0.6.0         
#> [21] BiocParallel_1.14.1         matrixStats_0.53.1         
#> [23] Biobase_2.40.0              GenomicRanges_1.32.3       
#> [25] GenomeInfoDb_1.16.0         IRanges_2.14.10            
#> [27] S4Vectors_0.18.2            BiocGenerics_0.26.0        
#> [29] mlr_2.13.9000               ParamHelpers_1.12          
#> [31] Bioc2mlr_0.1.0              magrittr_1.5               
#> [33] forcats_0.3.0               stringr_1.3.1              
#> [35] dplyr_0.7.8                 purrr_0.2.5                
#> [37] readr_1.1.1                 tidyr_0.8.2                
#> [39] tibble_2.0.1                ggplot2_3.1.0              
#> [41] tidyverse_1.2.1             BiocStyle_2.8.2            
#> 
#> loaded via a namespace (and not attached):
#>   [1] readxl_1.1.0           backports_1.1.3        fastmatch_1.1-0       
#>   [4] plyr_1.8.4             igraph_1.2.2           lazyeval_0.2.1        
#>   [7] splines_3.5.0          ggvis_0.4.3            crosstalk_1.0.0       
#>  [10] digest_0.6.18          htmltools_0.3.6        fansi_0.4.0           
#>  [13] gdata_2.18.0           checkmate_1.9.1        memoise_1.1.0         
#>  [16] BBmisc_1.11            sfsmisc_1.1-2          openxlsx_4.1.0        
#>  [19] modelr_0.1.2           rda_1.0.2-2            pkgdown_1.3.0         
#>  [22] colorspace_1.4-0       blob_1.1.1             rvest_0.3.2           
#>  [25] haven_1.1.2            xfun_0.4               crayon_1.3.4          
#>  [28] RCurl_1.95-4.10        jsonlite_1.6           roxygen2_6.1.1        
#>  [31] genefilter_1.62.0      bindr_0.1.1            survival_2.41-3       
#>  [34] glue_1.3.0             gtable_0.2.0           zlibbioc_1.26.0       
#>  [37] XVector_0.20.0         kernlab_0.9-27         prabclus_2.2-6        
#>  [40] DEoptimR_1.0-8         abind_1.4-5            scales_1.0.0          
#>  [43] mvtnorm_1.0-8          DBI_1.0.0              Rcpp_1.0.0            
#>  [46] xtable_1.8-3           foreign_0.8-70         bit_1.1-14            
#>  [49] mclust_5.4.2           htmlwidgets_1.3        httr_1.4.0            
#>  [52] threejs_0.3.1          RColorBrewer_1.1-2     fpc_2.1-11.1          
#>  [55] modeltools_0.2-22      pkgconfig_2.0.2        flexmix_2.3-14        
#>  [58] nnet_7.3-12            utf8_1.1.4             tidyselect_0.2.5      
#>  [61] rlang_0.3.1            later_0.7.5            munsell_0.5.0         
#>  [64] mlbench_2.1-1          cellranger_1.1.0       tools_3.5.0           
#>  [67] cli_1.0.1              RSQLite_2.1.1          pls_2.6-0             
#>  [70] broom_0.5.0            evaluate_0.12          yaml_2.2.0            
#>  [73] knitr_1.21             bit64_0.9-7            fs_1.2.3              
#>  [76] zip_1.0.0              robustbase_0.93-3      nlme_3.1-137          
#>  [79] mime_0.6               xml2_1.2.0             compiler_3.5.0        
#>  [82] rstudioapi_0.7         curl_3.2               stringi_1.2.4         
#>  [85] desc_1.2.0             lattice_0.20-35        trimcluster_0.1-2.1   
#>  [88] Matrix_1.2-14          commonmark_1.5         gbm_2.1.3             
#>  [91] pillar_1.3.1           data.table_1.12.0      bitops_1.0-6          
#>  [94] httpuv_1.4.5           R6_2.3.0               hwriter_1.3.2         
#>  [97] bookdown_0.7           promises_1.0.1         rio_0.5.10            
#> [100] codetools_0.2-15       MASS_7.3-49            gtools_3.8.1          
#> [103] assertthat_0.2.0       rprojroot_1.3-2        withr_2.1.2           
#> [106] GenomeInfoDbData_1.1.0 diptest_0.75-7         hms_0.4.2             
#> [109] grid_3.5.0             rpart_4.1-13           rmarkdown_1.11        
#> [112] parallelMap_1.4        shiny_1.2.0            lubridate_1.7.4       
#> [115] base64enc_0.1-3

S4 to task conversion

Dror Berel

2019-01-31