Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

Large scale machine learning projects with R Suite

329 views

Published on

Presentation from a workshop delivered by the company's CEO, Wit Jakuczun, during the ML@Enterprise conference that took place on the 14th of December 2017 in Warsaw. See https://github.com/WLOGSolutions/MLForum2017 for the R code generated during this workshop.

Machine Learning is not only about algorithms. Machine learning is about value, and this can be achieved only after proper deployment of Machine Learning solutions. I will present best practices for managing R-based ML projects. I will use our open-source tool R Suite (http://rsuite.io/). During the workshop I will talk about:
– project structure
– development cycle
– deployment
– test

Published in: Data & Analytics
  • Be the first to comment

Large scale machine learning projects with R Suite

  1. 1. Rscript R/master.R --port=7137
  2. 2. ● ○ ○
  3. 3. > rsuite install Detecting repositories ... Will use repositories: CRAN.CRAN = https://mran.microsoft.com/snapshot/2017-10-15 CRAN.CRANextra = http://www.stats.ox.ac.uk/pub/RWin Other = http://wlog-rsuite.s3.amazonaws.com Installing RSuite(v0.17x) package ... installing the source package 'RSuite' All done.
  4. 4. > rsuite proj start -n spmf
  5. 5. Commands: update Checks if newest version of RSuite CLI is installed. If not installer for newest version is downloaded and installation is initiated. install Install RSuite with all the dependencies. proj Use it to manage project, its dependencies, and build project packages. repo Use to manage repositories. e.g. upload packages. pkgzip Use to create PKGZIP packages to fillup remove repository. version Show RSuite CLI version. help Show this message and exit. Call 'rsuite [command] help' to get information on acceptable [args].
  6. 6. logs/.gitignore
  7. 7. PARAMETERS ● ● ● ○ ○ ○ ○ ●
  8. 8. ● ● ● LogLevel: INFO N_days: 365 solver_max_iterations: 10 solver_opt_horizon: 8
  9. 9. ● ● ○ main ○ if __name__ == "__main__":
  10. 10. predmodel
  11. 11. ● == ● >= ● <= ●
  12. 12. master.R
  13. 13. spmf/libs
  14. 14. packages_import.R
  15. 15. master.R
  16. 16. import_training.R (I) ● import/<session_id>/ ● work/<session_id>/ library(predmodel) import_path <- file.path(script_path, "../import") work_path <- file.path(script_path, "../work") # required session_id <- args$get(name = "session_id", default = "201711122000", required = FALSE) loginfo("--> Session id:%s", session_id) session_work <- file.path(work_path, session_id) if(!dir.exists(session_work)) { dir.create(session_work) } import_training_data(file.path(import_path, session_id), session_work)
  17. 17. import_training.R (II)
  18. 18. devtools
  19. 19. import_training_data #' @export import_training_data <- function(import_path, work_path) { pkg_loginfo("Importing from %s into %s", import_path, work_path) n <- 10000 dt <- data.table(feature1 = rnorm(n), feature2 = rnorm(n)) m <- round(n*0.3) dt[, resp := c(rep(1, m), rep(0, n - m))] fwrite(x = dt, file = file.path(work_path, "training.csv"), sep = ";") }
  20. 20. estimate_model.R (I) ● ● library(predmodel) work_path <- file.path(script_path, "../work") # required session_id <- args$get(name = "session_id", required = FALSE, default = "201710111655") loginfo("--> Session id:%s", session_id) session_work <- file.path(work_path, session_id) h2o.init(max_mem_size = "4g", nthreads = 2) logdebug("---> H2O started") train_file <- file.path(session_work, "training.csv") stopifnot(file.exists(train_file)) train_file %>% transform_training() %>% estimate_model(session_id) %>% save_model(session_work)
  21. 21. transform_training #' @export transform_training <- function(train_file) { dt <- h2o.importFile(path = train_file, destination_frame = "train_dt", parse = TRUE, header = TRUE, sep = ";") dt$resp <- as.factor(dt$resp) dt <- h2o.assign(data=dt, key = "train_dt") return(dt) }
  22. 22. estimate_model #'@export estimate_model <- function(dt, session_id) { model <- h2o.gbm(x = colnames(dt), y = "resp", training_frame = dt, model_id = sprintf("gbm_%s", session_id), ntrees = 10, learn_rate = 0.1) }
  23. 23. save_model #' @export save_model <- function(model, session_work) { h2o.saveModel(model, path = session_work, force =TRUE) }
  24. 24. import_test.R (I) ● import/<session_id>/ ● work/<session_id>/ library(predmodel) import_path <- file.path(script_path, "../import") work_path <- file.path(script_path, "../work") # required session_id <- args$get(name = "session_id", default = "201711122000", required = FALSE) loginfo("--> Session id:%s", session_id) session_work <- file.path(work_path, session_id) if(!dir.exists(session_work)) { dir.create(session_work) } import_test_data(file.path(import_path, session_id), session_work)
  25. 25. import_test_data #' @export import_test_data <- function(import_path, work_path) { pkg_loginfo("Importing from %s into %s", import_path, work_path) n <- 1000 dt <- data.table(feature1 = rnorm(n), feature2 = rnorm(n)) fwrite(x = dt, file = file.path(work_path, "test.csv"), sep = ";") }
  26. 26. score_model.R (I) ● work/<score_session_id> ● work/<train_session_id> ● export/<score_session_id>
  27. 27. score_model.R (II) library(h2o) library(magrittr) library(predmodel) work_path <- file.path(script_path, "../work") export_path <- file.path(script_path, "../export") # required train_session_id <- args$get(name = "train_session_id", required = FALSE, default = "201710111655") score_session_id <- args$get(name = "score_session_id", required = FALSE, default = "201710111655") loginfo("--> train session id:%s", train_session_id) loginfo("--> score session id:%s", score_session_id) score_session_export <- export_path train_session_work <- file.path(work_path, train_session_id) score_session_work <- file.path(work_path, score_session_id) h2o.init(max_mem_size = "4g", nthreads = 2) logdebug("---> H2O started") test_file <- file.path(score_session_work, "test.csv") model_file <- file.path(train_session_work, sprintf("gbm_%s", train_session_id)) stopifnot(file.exists(test_file)) stopifnot(file.exists(model_file)) test_dt <- test_file %>% transform_test() score_model(test_dt = test_dt, model_path = model_file) %>% export_score(export_path = export_path, score_session_id = score_session_id)
  28. 28. transform_test #' @export transform_test <- function(test_file) { h2o.importFile(path = test_file, destination_frame = "test_dt", parse = TRUE, header = TRUE, sep = ";") }
  29. 29. score_model #' @export score_model <- function(test_dt, model_path) { model <- h2o.loadModel(model_path) pred_dt <- h2o.predict(model, test_dt) pred_dt }
  30. 30. export_score #' @export export_score <- function(score_dt, score_session_id, export_path) { score_dt <- as.data.table(score_dt) score_dt[, score_session_id := score_session_id] fwrite(x = score_dt, file = file.path(export_path, "score.csv"), sep = ";", append = TRUE) }
  31. 31. Production spmf_0.1_001.zip Production/spmf import export work
  32. 32. Production/spmf/R a. Rscript import_training.R b. Rscript estimate_model.R c. Rscript import_test.R d. Rscript score_model.R Production/spmf/export
  33. 33. print
  34. 34. loginfo("Phase 1 passed") logdebug("Iter %d done", i) logtrace("Iter %d done", i) logwarning("Are you sure?") logerror("I failed :(") Packages pkg_loginfo("Phase 1 passed") pkg_logdebug("Iter %d done", i) pkg_logtrace("Iter %d done", i) pkg_logwarning("Are you sure?") pkg_logerror("I failed :(")
  35. 35. 2017-11-13 13:47:03 INFO::--> Session id:201711122000 2017-11-13 13:47:03 INFO:predmodel:Importing from C:/Workplace/Sandbox/Production/spmf/R/../import/201711122000 into C:/Workplace/Sandbox/Production/spmf/R/../work/201711122000 2017-11-13 13:47:14 INFO::--> Session id:201711122000 2017-11-13 13:47:51 INFO::--> Session id:201711131000 2017-11-13 13:47:51 INFO:predmodel:Importing from C:/Workplace/Sandbox/Production/spmf/R/../import/201711131000 into C:/Workplace/Sandbox/Production/spmf/R/../work/201711131000 2017-11-13 13:47:57 INFO::--> train session id:201711122000 2017-11-13 13:47:57 INFO::--> score session id:201711131000
  36. 36. LogLevel: INFO LogLevel: DEBUG LogLevel: TRACE
  37. 37. import_training.R
  38. 38. tests/test_spmf.R library(predmodel) library(testthat) context("Testing context") test_that(desc = "Test", code = { expect_true(5 > 3) expect_true(pi < 3) })

×