V1_S4_to_task_conversion.Rmd
# Load the Golub_Merge ExpressionSet and keep features 200-259 as a small
# demonstration subset.
data(Golub_Merge, package = "golubEsets")
smallG <- Golub_Merge[200:259, ]
smallG
#> ExpressionSet (storageMode: lockedEnvironment)
#> assayData: 60 features, 72 samples
#> element names: exprs
#> protocolData: none
#> phenoData
#> sampleNames: 39 40 ... 33 (72 total)
#> varLabels: Samples ALL.AML ... Source (11 total)
#> varMetadata: labelDescription
#> featureData: none
#> experimentData: use 'experimentData(object)'
#> pubMedIds: 10521349
#> Annotation: hu6800
library(SummarizedExperiment)
# Convert the ExpressionSet into a SummarizedExperiment first.
smallG_SE <- makeSummarizedExperimentFromExpressionSet(smallG)
# functional:
## will work with either 1 or multiple assays
task_SE_Functional <- Fun_SE_to_taskFunc(
  smallG_SE,
  param.Y.name = "ALL.AML",
  param.covariates = NULL,
  param_positive_y_level = "ALL",
  task_return_format = "functional",
  task_type = "classif"
)
task_SE_Functional
#> Supervised task: DF_functionals
#> Type: classif
#> Target: ALL.AML
#> Observations: 72
#> Features:
#> numerics factors ordered functionals
#> 0 0 0 1
#> Missings: FALSE
#> Has weights: FALSE
#> Has blocking: FALSE
#> Has coordinates: FALSE
#> Classes: 2
#> ALL AML
#> 47 25
#> Positive class: ALL
# non-functional:
## 1. directly, but into DF
# functionals.as = "dfcols" expands the functional feature into one
# data.frame column per measurement (the matrix is not kept).
extracted_DF_from_task_SE <- getTaskData(
  task_SE_Functional,
  functionals.as = "dfcols"
)
str(extracted_DF_from_task_SE[, 1:10])
#> 'data.frame': 72 obs. of 10 variables:
#> $ ALL.AML : Factor w/ 2 levels "ALL","AML": 1 1 1 1 1 1 1 1 1 1 ...
#> $ exprs.D13627_at: num 330 544 978 1035 3895 ...
#> $ exprs.D13628_at: num 229 147 110 237 106 256 144 84 -7 -3 ...
#> $ exprs.D13630_at: num 359 289 609 485 866 663 673 401 480 273 ...
#> $ exprs.D13633_at: num -9 57 207 302 475 0 112 257 244 252 ...
#> $ exprs.D13634_at: num 115 248 91 58 244 245 98 182 186 241 ...
#> $ exprs.D13635_at: num 31 -43 40 31 84 -159 -7 -2 62 111 ...
#> $ exprs.D13636_at: num 195 23 -60 317 449 -262 386 295 177 51 ...
#> $ exprs.D13637_at: num 161 137 -94 -96 432 -535 136 86 99 143 ...
#> $ exprs.D13639_at: num 456 3336 655 2771 3575 ...
## 2. Fun_SE_to_taskFunc(..., task_return_format = 'dfcols')
## will work with either 1 or multiple assays
task_SE_NON_Functional <- Fun_SE_to_taskFunc(
  smallG_SE,
  param.Y.name = "ALL.AML",
  param.covariates = NULL,
  param_positive_y_level = "ALL",
  task_return_format = "dfcols",
  task_type = "classif"
)
## 3. functional_to_NonFunctional_task_function(task_functional)
task_SE_NON_Functional_alt <- functional_to_NonFunctional_task_function(
  task_SE_Functional
)
## 4. designated function ## TBA
# extracted = extractFDAFeatures(task_SE_Functional, feat.methods = list("exprs" = all))
library(MLInterfaces)
#> Warning: package 'MLInterfaces' was built under R version 3.5.1
#> Warning: package 'XML' was built under R version 3.5.2
# Reference fit with MLInterfaces: a 1-nearest-neighbour classifier trained
# on samples 1-40 of the ExpressionSet.
krun <- MLearn(
  formula = ALL.AML ~ .,
  data = smallG,
  .method = knnI(k = 1),
  trainInd = 1:40
)
krun
#> MLInterfaces classification output container
#> The call was:
#> MLearn(formula = ALL.AML ~ ., data = smallG, .method = knnI(k = 1),
#> trainInd = 1:40)
#> Predicted outcome distribution for test set:
#>
#> ALL AML
#> 22 10
#> Summary of scores on test set (use testScores() method for details):
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 1 1 1 1 1 1
# Confusion matrix (given vs. predicted class) for the MLInterfaces fit.
confuMat(krun)
#> predicted
#> given ALL AML
#> ALL 18 3
#> AML 4 7
# Repeat the comparison with mlr on the converted task: observations 1-40
# train the learner, observations 41-72 are held out for prediction.
task_train <- subsetTask(task_SE_Functional, subset = 1:40)
task_test <- subsetTask(task_SE_Functional, subset = 41:72)
classif.lrn <- makeLearner("classif.knn")
model <- train(classif.lrn, task_train)
Predict <- predict(model, task_test)
calculateConfusionMatrix(Predict)
#> predicted
#> true ALL AML -err.-
#> ALL 18 3 3
#> AML 4 7 4
#> -err.- 4 3 7
Two data examples:
1. miniACC, balanced, without ‘dropouts’.
2. Customized, non-balanced, with ‘dropouts’.
library(MultiAssayExperiment)
# Print the miniACC MultiAssayExperiment object used as the first example.
miniACC
#> A MultiAssayExperiment object of 5 listed
#> experiments with user-defined names and respective classes.
#> Containing an ExperimentList class object of length 5:
#> [1] RNASeq2GeneNorm: SummarizedExperiment with 198 rows and 79 columns
#> [2] gistict: SummarizedExperiment with 198 rows and 90 columns
#> [3] RPPAArray: SummarizedExperiment with 33 rows and 46 columns
#> [4] Mutations: matrix with 97 rows and 90 columns
#> [5] miRNASeqGene: SummarizedExperiment with 471 rows and 80 columns
#> Features:
#> experiments() - obtain the ExperimentList instance
#> colData() - the primary/phenotype DataFrame
#> sampleMap() - the sample availability DataFrame
#> `$`, `[`, `[[` - extract colData columns, subset, or experiment
#> *Format() - convert into a long or wide DataFrame
#> assays() - convert ExperimentList to a SimpleList of matrices
# miniACC %>% sampleMap %>% data.frame %>% dplyr::select(primary, assay) %>% table # no replicates within same assay
# Build a classification task from the MultiAssayExperiment, with
# vital_status as target and two clinical covariates.
task_Functional_MAE <- Fun_MAE_to_taskFunc(
  miniACC,
  param.Y.name = "vital_status",
  param.covariates = c("gender", "days_to_death"),
  param_positive_y_level = "1",
  task_type = "classif"
)
task_Functional_MAE
#> Supervised task: DF_functionals
#> Type: classif
#> Target: vital_status
#> Observations: 385
#> Features:
#> numerics factors ordered functionals
#> 1 5 0 5
#> Missings: TRUE
#> Has weights: FALSE
#> Has blocking: FALSE
#> Has coordinates: FALSE
#> Classes: 2
#> 0 1
#> 248 137
#> Positive class: 1
# functionals.as = "matrix" keeps each functional feature as a matrix column.
extracted_DF_from_task_MAE_functionals <- getTaskData(
  task_Functional_MAE,
  functionals.as = "matrix"
)
glimpse(extracted_DF_from_task_MAE_functionals[, 1:10])
#> Observations: 385
#> Variables: 10
#> $ Unique_sample_id <fct> RNASeq2GeneNorm_TCGA-OR-A5J1_TCGA-OR-A5J1-01A...
#> $ assay <fct> RNASeq2GeneNorm, RNASeq2GeneNorm, RNASeq2Gene...
#> $ primary <fct> TCGA-OR-A5J1, TCGA-OR-A5J2, TCGA-OR-A5J3, TCG...
#> $ colname <fct> TCGA-OR-A5J1-01A-11R-A29S-07, TCGA-OR-A5J2-01...
#> $ gender <fct> male, female, female, male, female, female, m...
#> $ days_to_death <int> 1355, 1677, NA, 365, NA, 490, 579, NA, 922, 5...
#> $ vital_status <fct> 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, ...
#> $ RNASeq2GeneNorm <dbl> <matrix[25 x 198]>
#> $ gistict <dbl> <matrix[25 x 198]>
#> $ RPPAArray <dbl> <matrix[25 x 33]>
# functionals.as = "dfcols" concatenates the functionals into ordinary
# data.frame columns, one per measurement.
extracted_DF_from_task_MAE_dfcols <- getTaskData(
  task_Functional_MAE,
  functionals.as = "dfcols"
)
glimpse(extracted_DF_from_task_MAE_dfcols[, 1:10])
#> Observations: 385
#> Variables: 10
#> $ Unique_sample_id <fct> RNASeq2GeneNorm_TCGA-OR-A5J1_TCGA-OR-A5...
#> $ assay <fct> RNASeq2GeneNorm, RNASeq2GeneNorm, RNASe...
#> $ primary <fct> TCGA-OR-A5J1, TCGA-OR-A5J2, TCGA-OR-A5J...
#> $ colname <fct> TCGA-OR-A5J1-01A-11R-A29S-07, TCGA-OR-A...
#> $ gender <fct> male, female, female, male, female, fem...
#> $ days_to_death <int> 1355, 1677, NA, 365, NA, 490, 579, NA, ...
#> $ vital_status <fct> 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, ...
#> $ RNASeq2GeneNorm.DIRAS3 <dbl> 1487.0317, 9.6631, 18.9602, 760.6507, 1...
#> $ RNASeq2GeneNorm.MAPK14 <dbl> 778.5783, 2823.6469, 1061.7686, 806.351...
#> $ RNASeq2GeneNorm.YAP1 <dbl> 1009.6061, 2305.0590, 1561.2502, 713.40...
library(MultiAssayExperiment)
# Patient-level annotation; row names are the patient identifiers that the
# sample maps below refer to as `primary`.
patient.data <- data.frame(
  sex = c("M", "F", "M", "F", "F"),
  age = 38:42,
  row.names = c("Jack", "Jill", "Bob", "Barbara", "Meg")
)
## assay A
arraydat <- matrix(
  seq(101, 108),
  ncol = 4,
  dimnames = list(
    c("ENST00000294241", "ENST00000355076"),
    c("array1", "array2", "array3", "array4")
  )
)
coldat <- data.frame(
  slope53 = rnorm(4),
  row.names = c("array1", "array2", "array3", "array4")
)
exprdat <- SummarizedExperiment(arraydat, colData = coldat)
# Jill is mapped to two arrays, i.e. a replicate within assay A.
exprmap <- data.frame(
  primary = c("Jill", "Jill", "Meg", "Barbara"),
  colname = c("array1", "array2", "array3", "array4"),
  stringsAsFactors = FALSE
)
## assay B
methyldat <- matrix(
  1:10,
  ncol = 5,
  dimnames = list(
    c("ENST00000355076", "ENST00000383706"),
    c("methyl1", "methyl2", "methyl3", "methyl4", "methyl5")
  )
)
# Jack is mapped to three methylation samples.
methylmap <- data.frame(
  primary = c("Jack", "Jack", "Jack", "Meg", "Bob"),
  colname = c("methyl1", "methyl2", "methyl3", "methyl4", "methyl5"),
  stringsAsFactors = FALSE
)
# Combine both assays, the patient annotation and the per-assay sample maps.
myMultiAssay <- MultiAssayExperiment(
  list("A" = exprdat, "B" = methyldat),
  patient.data,
  listToMap(list(A = exprmap, B = methylmap))
)
myMultiAssay
#> A MultiAssayExperiment object of 2 listed
#> experiments with user-defined names and respective classes.
#> Containing an ExperimentList class object of length 2:
#> [1] A: SummarizedExperiment with 2 rows and 4 columns
#> [2] B: matrix with 2 rows and 5 columns
#> Features:
#> experiments() - obtain the ExperimentList instance
#> colData() - the primary/phenotype DataFrame
#> sampleMap() - the sample availability DataFrame
#> `$`, `[`, `[[` - extract colData columns, subset, or experiment
#> *Format() - convert into a long or wide DataFrame
#> assays() - convert ExperimentList to a SimpleList of matrices
# Cross-tabulate patients (primary) against assays: there are replicates
# within the same assay, and the design is non-balanced with dropouts.
# `select` is namespace-qualified because other attached packages
# (e.g. AnnotationDbi) also export a `select` that can mask dplyr's.
myMultiAssay %>%
  sampleMap %>%
  data.frame %>%
  dplyr::select(primary, assay) %>%
  table
#> assay
#> primary A B
#> Barbara 1 0
#> Bob 0 1
#> Jack 0 3
#> Jill 2 0
#> Meg 1 1
# myMultiAssay %>% sampleMap %>% data.frame %>% filter(assay == 'A')
# myMultiAssay$sex
# Classification task on the customized MultiAssayExperiment: sex is the
# target, no extra covariates.
task_Functional_MAE_customized <- Fun_MAE_to_taskFunc(
  myMultiAssay,
  param.Y.name = "sex",
  param.covariates = NULL,
  param_positive_y_level = "M",
  task_type = "classif"
)
Unless the learner has a specific implementation for functional data, the task will be automatically converted into a standard (non-functional) task.
The bartMachine model was chosen only because it has built-in NA handling. Any other ‘learner’ from mlr could be demonstrated instead.
library(bartMachine)
# Fit a bartMachine classifier on the full MultiAssayExperiment-derived task.
classif_lrn_bartMachine <- makeLearner("classif.bartMachine")
model_bartMachine <- train(
  classif_lrn_bartMachine,
  task_Functional_MAE
)
#> bartMachine initializing with 50 trees...
#> bartMachine vars checked...
#> bartMachine java init...
#> bartMachine factors created...
#> bartMachine before preprocess...
#> bartMachine after preprocess... 1868 total features...
#> warning: cannot use MSE of linear model for s_sq_y if p > n. bartMachine will use sample var(y) instead.
#> bartMachine sigsq estimated...
#> bartMachine training data finalized...
#> Now building bartMachine for classification ...Covariate importance prior ON. Missing data feature ON.
#> evaluating in sample data...done
# Prediction is made on the same task the model was trained on, so the
# confusion matrix reflects in-sample fit rather than generalization.
Predict_bartMachine <- predict(model_bartMachine, task_Functional_MAE)
calculateConfusionMatrix(Predict_bartMachine)
#> predicted
#> true 0 1 -err.-
#> 0 248 0 0
#> 1 0 137 0
#> -err.- 0 0 0
TBA: built-in CPOs / see detailed vignette.
Session information
# Record the R version and attached/loaded packages for reproducibility.
sessionInfo()
#> R version 3.5.0 (2018-04-23)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 14393)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=English_United States.1252
#> [2] LC_CTYPE=English_United States.1252
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.1252
#>
#> attached base packages:
#> [1] parallel stats4 stats graphics grDevices utils datasets
#> [8] methods base
#>
#> other attached packages:
#> [1] bartMachine_1.2.4.2 missForest_1.4
#> [3] itertools_0.1-3 iterators_1.0.10
#> [5] foreach_1.4.4 randomForest_4.6-14
#> [7] car_3.0-0 carData_3.0-1
#> [9] bartMachineJARs_1.1 rJava_0.9-10
#> [11] bindrcpp_0.2.2 MultiAssayExperiment_1.6.0
#> [13] MLInterfaces_1.62.0 cluster_2.0.7-1
#> [15] annotate_1.58.0 XML_3.98-1.16
#> [17] AnnotationDbi_1.42.1 class_7.3-14
#> [19] SummarizedExperiment_1.10.1 DelayedArray_0.6.0
#> [21] BiocParallel_1.14.1 matrixStats_0.53.1
#> [23] Biobase_2.40.0 GenomicRanges_1.32.3
#> [25] GenomeInfoDb_1.16.0 IRanges_2.14.10
#> [27] S4Vectors_0.18.2 BiocGenerics_0.26.0
#> [29] mlr_2.13.9000 ParamHelpers_1.12
#> [31] Bioc2mlr_0.1.0 magrittr_1.5
#> [33] forcats_0.3.0 stringr_1.3.1
#> [35] dplyr_0.7.8 purrr_0.2.5
#> [37] readr_1.1.1 tidyr_0.8.2
#> [39] tibble_2.0.1 ggplot2_3.1.0
#> [41] tidyverse_1.2.1 BiocStyle_2.8.2
#>
#> loaded via a namespace (and not attached):
#> [1] readxl_1.1.0 backports_1.1.3 fastmatch_1.1-0
#> [4] plyr_1.8.4 igraph_1.2.2 lazyeval_0.2.1
#> [7] splines_3.5.0 ggvis_0.4.3 crosstalk_1.0.0
#> [10] digest_0.6.18 htmltools_0.3.6 fansi_0.4.0
#> [13] gdata_2.18.0 checkmate_1.9.1 memoise_1.1.0
#> [16] BBmisc_1.11 sfsmisc_1.1-2 openxlsx_4.1.0
#> [19] modelr_0.1.2 rda_1.0.2-2 pkgdown_1.3.0
#> [22] colorspace_1.4-0 blob_1.1.1 rvest_0.3.2
#> [25] haven_1.1.2 xfun_0.4 crayon_1.3.4
#> [28] RCurl_1.95-4.10 jsonlite_1.6 roxygen2_6.1.1
#> [31] genefilter_1.62.0 bindr_0.1.1 survival_2.41-3
#> [34] glue_1.3.0 gtable_0.2.0 zlibbioc_1.26.0
#> [37] XVector_0.20.0 kernlab_0.9-27 prabclus_2.2-6
#> [40] DEoptimR_1.0-8 abind_1.4-5 scales_1.0.0
#> [43] mvtnorm_1.0-8 DBI_1.0.0 Rcpp_1.0.0
#> [46] xtable_1.8-3 foreign_0.8-70 bit_1.1-14
#> [49] mclust_5.4.2 htmlwidgets_1.3 httr_1.4.0
#> [52] threejs_0.3.1 RColorBrewer_1.1-2 fpc_2.1-11.1
#> [55] modeltools_0.2-22 pkgconfig_2.0.2 flexmix_2.3-14
#> [58] nnet_7.3-12 utf8_1.1.4 tidyselect_0.2.5
#> [61] rlang_0.3.1 later_0.7.5 munsell_0.5.0
#> [64] mlbench_2.1-1 cellranger_1.1.0 tools_3.5.0
#> [67] cli_1.0.1 RSQLite_2.1.1 pls_2.6-0
#> [70] broom_0.5.0 evaluate_0.12 yaml_2.2.0
#> [73] knitr_1.21 bit64_0.9-7 fs_1.2.3
#> [76] zip_1.0.0 robustbase_0.93-3 nlme_3.1-137
#> [79] mime_0.6 xml2_1.2.0 compiler_3.5.0
#> [82] rstudioapi_0.7 curl_3.2 stringi_1.2.4
#> [85] desc_1.2.0 lattice_0.20-35 trimcluster_0.1-2.1
#> [88] Matrix_1.2-14 commonmark_1.5 gbm_2.1.3
#> [91] pillar_1.3.1 data.table_1.12.0 bitops_1.0-6
#> [94] httpuv_1.4.5 R6_2.3.0 hwriter_1.3.2
#> [97] bookdown_0.7 promises_1.0.1 rio_0.5.10
#> [100] codetools_0.2-15 MASS_7.3-49 gtools_3.8.1
#> [103] assertthat_0.2.0 rprojroot_1.3-2 withr_2.1.2
#> [106] GenomeInfoDbData_1.1.0 diptest_0.75-7 hms_0.4.2
#> [109] grid_3.5.0 rpart_4.1-13 rmarkdown_1.11
#> [112] parallelMap_1.4 shiny_1.2.0 lubridate_1.7.4
#> [115] base64enc_0.1-3