## ----echo = FALSE, message = FALSE, warning = FALSE--------------------------- library(PatientLevelPrediction) ## ----echo = TRUE, eval=FALSE-------------------------------------------------- # attr(param, "settings") <- list( # seed = 12, # modelName = "Special classifier" # ) ## ----tidy=FALSE,eval=FALSE---------------------------------------------------- # setMadeUp <- function(a = c(1, 4, 10), b = 2, seed = NULL) { # # add input checks here... # # param <- split( # expand.grid( # a = a, # b = b # ), # 1:(length(a) * length(b)) # ) # # attr(param, "settings") <- list( # modelName = "Made Up", # requiresDenseMatrix = TRUE, # seed = seed # ) # # # now create list of all combinations: # result <- list( # fitFunction = "fitMadeUp", # this will be called to train the made up model # param = param # ) # class(result) <- "modelSettings" # # return(result) # } ## ----tidy=FALSE,eval=FALSE---------------------------------------------------- # fitMadeUp <- function(trainData, modelSettings, search, analysisId) { # param <- modelSettings$param # # # **************** code to train the model here # # trainedModel <- this code should apply each hyper-parameter combination # # (param[[i]]) using the specified search (e.g., cross validation) # # then pick out the best hyper-parameter setting # # and finally fit a model on the whole train data using the # # optimal hyper-parameter settings # # **************** # # # **************** code to apply the model to trainData # # prediction <- code to apply trainedModel to trainData # # **************** # # # **************** code to get variable importance (if possible) # # varImp <- code to get importance of each variable in trainedModel # # **************** # # # # construct the standard output for a model: # result <- list( # model = trainedModel, # prediction = prediction, # the train and maybe the cross validation predictions for the trainData # preprocessing = list( # featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, # tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, # requireDenseMatrix = attr(param, "settings")$requiresDenseMatrix, # ), # modelDesign = list( # outcomeId = attr(trainData, "metaData")$outcomeId, # targetId = attr(trainData, "metaData")$targetId, # plpDataSettings = attr(trainData, "metaData")$plpDataSettings, # covariateSettings = attr(trainData, "metaData")$covariateSettings, # populationSettings = attr(trainData, "metaData")$populationSettings, # featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, # prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, # modelSettings = list( # model = attr(param, "settings")$modelName, # the model name # param = param, # finalModelParameters = param[[bestInd]], # best hyper-parameters # extraSettings = attr(param, "settings") # ), # splitSettings = attr(trainData, "metaData")$splitSettings, # sampleSettings = attr(trainData, "metaData")$sampleSettings # ), # trainDetails = list( # analysisId = analysisId, # developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, # attrition = attr(trainData, "metaData")$attrition, # trainingTime = timeToTrain, # how long it took to train the model # trainingDate = Sys.Date(), # hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame # ), # covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by = "covariateId") # add variable importance to covariateRef if possible # ) # class(result) <- "plpModel" # attr(result, "predictionFunction") <- "madeupPrediction" # attr(result, "modelType") <- "binary" # return(result) # } ## ----tidy=FALSE,eval=FALSE---------------------------------------------------- # madeupPrediction <- function(plpModel, data, cohort) { # # ************* code to do prediction for each rowId in cohort # # predictionValues <- code to do prediction here returning the predicted risk # # (value) for each rowId in cohort # #************** # # prediction <- merge(cohort, predictionValues, by = "rowId") # attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType")) # return(prediction) # } ## ----tidy=FALSE,eval=FALSE---------------------------------------------------- # setMadeUp <- function(a = c(1, 4, 6), b = 2, seed = NULL) { # # add input checks here... # # if (is.null(seed)) { # seed <- sample(100000, 1) # } # # param <- split( # expand.grid( # a = a, # b = b # ), # 1:(length(a) * length(b)) # ) # # attr(param, "settings") <- list( # modelName = "Made Up", # requiresDenseMatrix = TRUE, # seed = seed # ) # # # now create list of all combinations: # result <- list( # fitFunction = "fitMadeUp", # this will be called to train the made up model # param = param # ) # class(result) <- "modelSettings" # # return(result) # } ## ----tidy=FALSE,eval=FALSE---------------------------------------------------- # fitMadeUp <- function(trainData, modelSettings, search, analysisId) { # # set the seed for reproducibility # param <- modelSettings$param # set.seed(attr(param, "settings")$seed) # # # add folds to labels: # trainData$labels <- merge(trainData$labels, trainData$folds, by = "rowId") # # convert data into sparse R Matrix: # mappedData <- toSparseM(trainData, map = NULL) # matrixData <- mappedData$dataMatrix # labels <- mappedData$labels # covariateRef <- mappedData$covariateRef # # # ============= STEP 1 ====================================== # # pick the best hyper-params and then do final training on all data... # writeLines("Cross validation") # paramSel <- lapply( # param, # function(x) { # do.call( # madeUpModel, # list( # param = x, # final = FALSE, # data = matrixData, # labels = labels # ) # ) # } # ) # hyperSummary <- do.call(rbind, lapply(paramSel, function(x) x$hyperSum)) # hyperSummary <- as.data.frame(hyperSummary) # hyperSummary$auc <- unlist(lapply(paramSel, function(x) x$auc)) # paramSel <- unlist(lapply(paramSel, function(x) x$auc)) # bestInd <- which.max(paramSel) # # # get cross val prediction for best hyper-parameters # prediction <- param.sel[[bestInd]]$prediction # prediction$evaluationType <- "CV" # # writeLines("final train") # finalResult <- do.call( # madeUpModel, # list( # param = param[[bestInd]], # final = TRUE, # data = matrixData, # labels = labels # ) # ) # # trainedModel <- finalResult$model # # # prediction risk on training data: # finalResult$prediction$evaluationType <- "Train" # # # get CV and train prediction # prediction <- rbind(prediction, finalResult$prediction) # # varImp <- covariateRef %>% dplyr::collect() # # no feature importance available # vqrImp$covariateValue <- 0 # # timeToTrain <- Sys.time() - start # # # construct the standard output for a model: # result <- list( # model = trainedModel, # prediction = prediction, # preprocessing = list( # featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, # tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, # requireDenseMatrix = attr(param, "settings")$requiresDenseMatrix, # ), # modelDesign = list( # outcomeId = attr(trainData, "metaData")$outcomeId, # targetId = attr(trainData, "metaData")$targetId, # plpDataSettings = attr(trainData, "metaData")$plpDataSettings, # covariateSettings = attr(trainData, "metaData")$covariateSettings, # populationSettings = attr(trainData, "metaData")$populationSettings, # featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, # prerocessSettings = attr(trainData$covariateData, "metaData")$prerocessSettings, # modelSettings = list( # model = attr(param, "settings")$modelName, # the model name # param = param, # finalModelParameters = param[[bestInd]], # best hyper-parameters # extraSettings = attr(param, "settings") # ), # splitSettings = attr(trainData, "metaData")$splitSettings, # sampleSettings = attr(trainData, "metaData")$sampleSettings # ), # trainDetails = list( # analysisId = analysisId, # developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, # attrition = attr(trainData, "metaData")$attrition, # trainingTime = timeToTrain, # how long it took to train the model # trainingDate = Sys.Date(), # hyperParamSearch = hyperSummary # the hyper-parameters and performance data.frame # ), # covariateImportance = merge(trainData$covariateData$covariateRef, varImp, by = "covariateId") # add variable importance to covariateRef if possible # ) # class(result) <- "plpModel" # attr(result, "predictionFunction") <- "madeupPrediction" # attr(result, "modelType") <- "binary" # return(result) # } ## ----tidy=FALSE,eval=FALSE---------------------------------------------------- # madeUpModel <- function(param, data, final = FALSE, labels) { # if (final == FALSE) { # # add value column to store all predictions # labels$value <- rep(0, nrow(labels)) # attr(labels, "metaData") <- list(modelType = "binary") # # foldPerm <- c() # this holds CV aucs # for (index in 1:max(labels$index)) { # model <- madeup::model( # x = data[labels$index != index, ], # remove left out fold # y = labels$outcomeCount[labels$index != index], # a = param$a, # b = param$b # ) # # # predict on left out fold # pred <- stats::predict(model, data[labels$index == index, ]) # labels$value[labels$index == index] <- pred # # # calculate auc on help out fold # aucVal <- computeAuc(labels[labels$index == index, ]) # foldPerm <- c(foldPerm, aucVal) # } # auc <- computeAuc(labels) # overal AUC # } else { # model <- madeup::model( # x = data, # y = labels$outcomeCount, # a = param$a, # b = param$b # ) # # pred <- stats::predict(model, data) # labels$value <- pred # attr(labels, "metaData") <- list(modelType = "binary") # auc <- computeAuc(labels) # foldPerm <- auc # } # # result <- list( # model = model, # auc = auc, # prediction = labels, # hyperSum = c(a = a, b = b, fold_auc = foldPerm) # ) # # return(result) # } ## ----tidy=FALSE,eval=FALSE---------------------------------------------------- # madeupPrediction <- function(plpModel, data, cohort) { # if (class(data) == "plpData") { # # convert # matrixObjects <- toSparseM( # plpData = data, # cohort = cohort, # map = plpModel$covariateImportance %>% # dplyr::select("columnId", "covariateId") # ) # # newData <- matrixObjects$dataMatrix # cohort <- matrixObjects$labels # } else { # newData <- data # } # # if (class(plpModel) == "plpModel") { # model <- plpModel$model # } else { # model <- plpModel # } # # cohort$value <- stats::predict(model, newData) # # # fix the rowIds to be the old ones # # now use the originalRowId and remove the matrix rowId # cohort <- cohort %>% # dplyr::select(-"rowId") %>% # dplyr::rename(rowId = "originalRowId") # # attr(cohort, "metaData") <- list(modelType = attr(plpModel, "modelType")) # return(cohort) # } ## ----tidy=TRUE,eval=TRUE------------------------------------------------------ citation("PatientLevelPrediction")