Pre-processing: Feature-selection via limma filtering

A. SummarizedExperiment (SE)

# Load the Golub_Merge ExpressionSet that ships with the golubEsets package.
data("Golub_Merge", package = "golubEsets")
# Keep only features (rows) 200 through 259 so the example stays small.
smallG <- Golub_Merge[200:259, ]
smallG
#> ExpressionSet (storageMode: lockedEnvironment)
#> assayData: 60 features, 72 samples 
#>   element names: exprs 
#> protocolData: none
#> phenoData
#>   sampleNames: 39 40 ... 33 (72 total)
#>   varLabels: Samples ALL.AML ... Source (11 total)
#>   varMetadata: labelDescription
#> featureData: none
#> experimentData: use 'experimentData(object)'
#>   pubMedIds: 10521349 
#> Annotation: hu6800

library(SummarizedExperiment)
#> Loading required package: GenomicRanges
#> Loading required package: stats4
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#> 
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#> 
#>     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#>     clusterExport, clusterMap, parApply, parCapply, parLapply,
#>     parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following object is masked from 'package:limma':
#> 
#>     plotMA
#> The following objects are masked from 'package:dplyr':
#> 
#>     combine, intersect, setdiff, union
#> The following objects are masked from 'package:stats':
#> 
#>     IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#> 
#>     anyDuplicated, append, as.data.frame, basename, cbind,
#>     colMeans, colnames, colSums, dirname, do.call, duplicated,
#>     eval, evalq, Filter, Find, get, grep, grepl, intersect,
#>     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
#>     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
#>     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
#>     table, tapply, union, unique, unsplit, which, which.max,
#>     which.min
#> Loading required package: S4Vectors
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:ParamHelpers':
#> 
#>     isEmpty
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, rename
#> The following object is masked from 'package:base':
#> 
#>     expand.grid
#> Loading required package: IRanges
#> 
#> Attaching package: 'IRanges'
#> The following object is masked from 'package:purrr':
#> 
#>     reduce
#> The following objects are masked from 'package:dplyr':
#> 
#>     collapse, desc, slice
#> The following object is masked from 'package:grDevices':
#> 
#>     windows
#> Loading required package: GenomeInfoDb
#> Loading required package: Biobase
#> Welcome to Bioconductor
#> 
#>     Vignettes contain introductory material; view with
#>     'browseVignettes()'. To cite Bioconductor, see
#>     'citation("Biobase")', and for packages 'citation("pkgname")'.
#> Loading required package: DelayedArray
#> Loading required package: matrixStats
#> 
#> Attaching package: 'matrixStats'
#> The following objects are masked from 'package:Biobase':
#> 
#>     anyMissing, rowMedians
#> The following object is masked from 'package:dplyr':
#> 
#>     count
#> Loading required package: BiocParallel
#> 
#> Attaching package: 'DelayedArray'
#> The following objects are masked from 'package:matrixStats':
#> 
#>     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
#> The following object is masked from 'package:purrr':
#> 
#>     simplify
#> The following objects are masked from 'package:base':
#> 
#>     aperm, apply
# Convert the ExpressionSet subset into a SummarizedExperiment object.
smallG_SE <- makeSummarizedExperimentFromExpressionSet(smallG)

# from vignette V1
smallG_SE
#> class: RangedSummarizedExperiment 
#> dim: 60 72 
#> metadata(3): experimentData annotation protocolData
#> assays(1): exprs
#> rownames(60): D13627_at D13628_at ... D16350_at D16469_at
#> rowData names(0):
#> colnames(72): 39 40 ... 32 33
#> colData names(11): Samples ALL.AML ... PS Source
# NOTE(review): not referenced in the visible part of this document; presumably
# the number of top differentially-expressed analytes to retain -- confirm.
top_DE_analytes_present <- 5

# functional:
# Build a classification task from the SummarizedExperiment with ALL.AML as the
# target and "ALL" as the positive class, requesting the 'functional' return
# format. Works with either one or multiple assays.
task_SE_Functional <- Fun_SE_to_taskFunc(
  smallG_SE,
  param.Y.name = "ALL.AML",
  param.covariates = NULL,
  param_positive_y_level = "ALL",
  task_return_format = "functional",
  task_type = "classif"
)
task_SE_Functional
#> Supervised task: DF_functionals
#> Type: classif
#> Target: ALL.AML
#> Observations: 72
#> Features:
#>    numerics     factors     ordered functionals 
#>           0           0           0           1 
#> Missings: FALSE
#> Has weights: FALSE
#> Has blocking: FALSE
#> Has coordinates: FALSE
#> Classes: 2
#> ALL AML 
#>  47  25 
#> Positive class: ALL


# non-functional:
## 1. directly, but into DF
# functionals.as = "dfcols" expands the functional feature into one numeric
# data.frame column per feature (it does not keep the matrix form).
extracted_DF_from_task_SE <- getTaskData(
  task_SE_Functional,
  functionals.as = "dfcols"
)
#> Functional features have been converted to numerics
# Inspect the structure of the extracted data.frame.
str(extracted_DF_from_task_SE)
#> 'data.frame':    72 obs. of  61 variables:
#>  $ ALL.AML             : Factor w/ 2 levels "ALL","AML": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ exprs.D13627_at     : num  330 544 978 1035 3895 ...
#>  $ exprs.D13628_at     : num  229 147 110 237 106 256 144 84 -7 -3 ...
#>  $ exprs.D13630_at     : num  359 289 609 485 866 663 673 401 480 273 ...
#>  $ exprs.D13633_at     : num  -9 57 207 302 475 0 112 257 244 252 ...
#>  $ exprs.D13634_at     : num  115 248 91 58 244 245 98 182 186 241 ...
#>  $ exprs.D13635_at     : num  31 -43 40 31 84 -159 -7 -2 62 111 ...
#>  $ exprs.D13636_at     : num  195 23 -60 317 449 -262 386 295 177 51 ...
#>  $ exprs.D13637_at     : num  161 137 -94 -96 432 -535 136 86 99 143 ...
#>  $ exprs.D13639_at     : num  456 3336 655 2771 3575 ...
#>  $ exprs.D13640_at     : num  1105 1204 1751 1008 932 ...
#>  $ exprs.D13641_at     : num  760 458 793 863 698 477 818 322 606 756 ...
#>  $ exprs.D13642_at     : num  272 150 6 44 134 173 36 302 -148 38 ...
#>  $ exprs.D13643_at     : num  -1484 -1300 -298 -176 -558 ...
#>  $ exprs.D13644_at     : num  125 57 12 136 78 178 26 2 -29 -37 ...
#>  $ exprs.D13645_at     : num  -53 131 -183 132 91 -374 176 9 -346 -98 ...
#>  $ exprs.D13748_at     : num  3225 4583 4543 6713 6817 ...
#>  $ exprs.D13789_at     : num  -1303 -781 -453 -687 -353 ...
#>  $ exprs.D13897_rna2_at: num  569 1267 146 619 444 ...
#>  $ exprs.D13900_at     : num  2293 1077 1809 1607 3233 ...
#>  $ exprs.D13969_at     : num  -135 -373 -297 27 -36 -373 -37 -329 -211 -195 ...
#>  $ exprs.D13988_at     : num  1479 1264 1218 983 1261 ...
#>  $ exprs.D14043_at     : num  231 1427 1093 2927 3479 ...
#>  $ exprs.D14134_at     : num  -130 -136 -132 -142 -170 -175 -25 -84 -14 -110 ...
#>  $ exprs.D14446_at     : num  205 127 225 189 239 166 29 141 -34 61 ...
#>  $ exprs.D14497_at     : num  -45 57 5 -2 -11 4 46 -26 -91 -99 ...
#>  $ exprs.D14520_at     : num  65 -121 -223 48 -33 142 -28 153 -94 -35 ...
#>  $ exprs.D14530_at     : num  13361 16673 16061 19362 21519 ...
#>  $ exprs.D14533_at     : num  -410 -300 -82 -84 -66 -227 -97 122 -69 -75 ...
#>  $ exprs.D14657_at     : num  436 412 1541 1666 4514 ...
#>  $ exprs.D14658_at     : num  444 828 583 1199 2177 ...
#>  $ exprs.D14659_at     : num  -12 74 76 0 114 -9 67 28 2 -63 ...
#>  $ exprs.D14660_at     : num  -82 -36 12 2 144 -51 104 40 28 -40 ...
#>  $ exprs.D14661_at     : num  400 272 1100 327 805 317 355 324 555 133 ...
#>  $ exprs.D14662_at     : num  417 863 494 540 1553 ...
#>  $ exprs.D14663_at     : num  598 202 720 401 1039 ...
#>  $ exprs.D14664_at     : num  101 74 1 302 140 -132 84 62 -74 18 ...
#>  $ exprs.D14678_at     : num  50 -553 2 172 -59 -121 252 81 189 -100 ...
#>  $ exprs.D14686_at     : num  354 464 692 445 465 531 212 177 312 111 ...
#>  $ exprs.D14689_at     : num  557 436 447 498 925 ...
#>  $ exprs.D14694_at     : num  1431 1155 1569 2006 1883 ...
#>  $ exprs.D14695_at     : num  296 206 330 487 319 532 114 175 114 442 ...
#>  $ exprs.D14710_at     : num  2920 3187 3358 5148 6462 ...
#>  $ exprs.D14811_at     : num  238 323 315 214 422 616 316 230 -17 499 ...
#>  $ exprs.D14812_at     : num  2404 1305 2553 3610 6604 ...
#>  $ exprs.D14822_at     : num  0 48 27 33 -13 64 32 -12 -69 -37 ...
#>  $ exprs.D14823_at     : num  36 36 -2 12 -2 -64 28 31 133 174 ...
#>  $ exprs.D14827_at     : num  -325 -38 239 63 170 -8 -39 0 -250 -309 ...
#>  $ exprs.D14838_at     : num  -79 -45 31 -67 -29 41 -111 -9 -57 94 ...
#>  $ exprs.D14874_at     : num  164 187 389 138 42 82 -18 120 -106 137 ...
#>  $ exprs.D14878_at     : num  238 346 439 714 1026 ...
#>  $ exprs.D14889_at     : num  547 379 286 432 398 646 111 293 226 284 ...
#>  $ exprs.D15049_at     : num  -21 -282 311 -84 82 -306 48 -26 -111 -155 ...
#>  $ exprs.D15050_at     : num  473 355 2517 485 630 ...
#>  $ exprs.D15057_at     : num  369 178 593 628 2613 ...
#>  $ exprs.D16181_at     : num  41 133 77 -21 92 5 29 88 29 129 ...
#>  $ exprs.D16217_at     : num  407 869 775 369 795 752 349 499 283 527 ...
#>  $ exprs.D16227_at     : num  -393 -7 15 -254 -219 -662 -23 -21 -259 -431 ...
#>  $ exprs.D16294_at     : num  190 213 363 58 618 195 377 132 416 109 ...
#>  $ exprs.D16350_at     : num  89 33 -23 90 33 -74 14 72 50 81 ...
#>  $ exprs.D16469_at     : num  -19 226 770 874 2058 ...


## 2. Fun_SE_to_taskFunc(..., task_return_format = 'dfcols')
# Same arguments as the functional task, except that the 'dfcols' return format
# is requested. Works with either one or multiple assays.
task_SE_NON_Functional <- Fun_SE_to_taskFunc(
  smallG_SE,
  param.Y.name = "ALL.AML",
  param.covariates = NULL,
  param_positive_y_level = "ALL",
  task_return_format = "dfcols",
  task_type = "classif"
)

B. Session information

# Print the R version, platform, locale and attached/loaded package versions
# used to render this document, for reproducibility.
sessionInfo()
#> R version 3.5.0 (2018-04-23)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 14393)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#> [8] methods   base     
#> 
#> other attached packages:
#>  [1] MultiAssayExperiment_1.6.0  SummarizedExperiment_1.10.1
#>  [3] DelayedArray_0.6.0          BiocParallel_1.14.1        
#>  [5] matrixStats_0.53.1          Biobase_2.40.0             
#>  [7] GenomicRanges_1.32.3        GenomeInfoDb_1.16.0        
#>  [9] IRanges_2.14.10             S4Vectors_0.18.2           
#> [11] BiocGenerics_0.26.0         mlrCPO_0.3.4               
#> [13] mlr_2.13.9000               ParamHelpers_1.12          
#> [15] Bioc2mlr_0.1.0              limma_3.36.1               
#> [17] purrr_0.2.5                 magrittr_1.5               
#> [19] dplyr_0.7.8                
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.0             lattice_0.20-35        tidyr_0.8.2           
#>  [4] assertthat_0.2.0       rprojroot_1.3-2        digest_0.6.18         
#>  [7] R6_2.3.0               plyr_1.8.4             backports_1.1.3       
#> [10] evaluate_0.12          ggplot2_3.1.0          pillar_1.3.1          
#> [13] zlibbioc_1.26.0        rlang_0.3.1            lazyeval_0.2.1        
#> [16] rstudioapi_0.7         data.table_1.12.0      rpart_4.1-13          
#> [19] Matrix_1.2-14          checkmate_1.9.1        rmarkdown_1.11        
#> [22] pkgdown_1.3.0          desc_1.2.0             splines_3.5.0         
#> [25] stringr_1.3.1          RCurl_1.95-4.10        munsell_0.5.0         
#> [28] compiler_3.5.0         xfun_0.4               pkgconfig_2.0.2       
#> [31] BBmisc_1.11            htmltools_0.3.6        tidyselect_0.2.5      
#> [34] tibble_2.0.1           GenomeInfoDbData_1.1.0 roxygen2_6.1.1        
#> [37] XML_3.98-1.16          crayon_1.3.4           MASS_7.3-49           
#> [40] bitops_1.0-6           commonmark_1.5         grid_3.5.0            
#> [43] gtable_0.2.0           scales_1.0.0           stringi_1.2.4         
#> [46] XVector_0.20.0         fs_1.2.3               parallelMap_1.4       
#> [49] bindrcpp_0.2.2         xml2_1.2.0             fastmatch_1.1-0       
#> [52] tools_3.5.0            glue_1.3.0             survival_2.41-3       
#> [55] yaml_2.2.0             colorspace_1.4-0       memoise_1.1.0         
#> [58] knitr_1.21             bindr_0.1.1