Merge pull request #492 from katy-sadowski/katy__exclude_checks
Add instruction for excluding checks
katy-sadowski committed Oct 11, 2023
2 parents abd194d + da3539a commit b1eb194
Showing 2 changed files with 69 additions and 44 deletions.
105 changes: 61 additions & 44 deletions extras/codeToRun.R
@@ -19,42 +19,59 @@ library(DatabaseConnector)

# fill out the connection details -----------------------------------------------------------------------
connectionDetails <- DatabaseConnector::createConnectionDetails(
dbms = "",
user = "",
password = "",
server = "",
port = "",
dbms = "",
user = "",
password = "",
server = "",
port = "",
extraSettings = "",
pathToDriver = ""
)
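
# For illustration only: one hypothetical way these fields might be filled in for a
# PostgreSQL CDM (all values below are placeholders, chosen purely as an example):
#
# connectionDetails <- DatabaseConnector::createConnectionDetails(
#   dbms = "postgresql",
#   user = "cdm_reader",
#   password = Sys.getenv("CDM_PASSWORD"),
#   server = "localhost/ohdsi",
#   port = 5432,
#   pathToDriver = "~/jdbc_drivers"
# )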

cdmDatabaseSchema <- "yourCdmSchema" # the fully qualified database schema name of the CDM
resultsDatabaseSchema <- "yourResultsSchema" # the fully qualified database schema name of the results schema (that you can write to)
cdmSourceName <- "Your CDM Source" # a human readable name for your CDM source
cdmVersion <- "5.4" # the CDM version you are targetting. Currently supporst 5.2.2, 5.3.1, and 5.4
cdmVersion <- "5.4" # the CDM version you are targetting. Currently supports 5.2, 5.3, and 5.4

# determine how many threads (concurrent SQL sessions) to use ----------------------------------------
numThreads <- 1 # on Redshift, 3 seems to work well

# specify if you want to execute the queries or inspect them ------------------------------------------
sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries. See codeToRun_sqlOnly.R for other sqlOnly parameters
sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries
sqlOnlyIncrementalInsert <- FALSE # set to TRUE if you want the generated SQL queries to calculate DQD results and insert them into a database table (@resultsDatabaseSchema.@writeTableName)
sqlOnlyUnionCount <- 1 # in sqlOnlyIncrementalInsert mode, the number of check sqls to union in a single query; higher numbers can improve performance in some DBMS (e.g. a value of 25 may be 25x faster)

# NOTES specific to sqlOnly <- TRUE option ------------------------------------------------------------
# 1. You do not need a live database connection. Instead, connectionDetails only needs these parameters:
# connectionDetails <- DatabaseConnector::createConnectionDetails(
# dbms = "", # specify your dbms
# pathToDriver = "/"
# )
# 2. Since these are fully functional queries, this can help with debugging.
# 3. In the results output by the sqlOnlyIncrementalInsert queries, placeholders are populated for execution_time, query_text, and warnings/errors; and the NOT_APPLICABLE rules are not applied.
# 4. In order to use the generated SQL to insert metadata and check results into the output table, you must set sqlOnlyIncrementalInsert = TRUE. Otherwise sqlOnly is backwards compatible with <= v2.2.0, generating queries which run the checks but don't store the results.
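
# For example, a minimal sqlOnly configuration (the dbms and driver path below are placeholders)
# might look like the following; executeDqChecks() then generates the SQL scripts rather than executing them:
#
# connectionDetails <- DatabaseConnector::createConnectionDetails(
#   dbms = "redshift", # specify your dbms
#   pathToDriver = "/"
# )
# sqlOnly <- TRUE
# sqlOnlyIncrementalInsert <- TRUE # generate queries that insert results into @resultsDatabaseSchema.@writeTableName
# sqlOnlyUnionCount <- 25          # union up to 25 check queries per statement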


# where should the results and logs go? ----------------------------------------------------------------
outputFolder <- "output"
outputFile <- "results.json"


# logging type -------------------------------------------------------------------------------------
verboseMode <- TRUE # set to FALSE if you don't want the logs to be printed to the console

# write results to table? -----------------------------------------------------------------------
writeToTable <- FALSE # set to TRUE if you want to write to a SQL table in the results schema
# write results to table? ------------------------------------------------------------------------------
writeToTable <- TRUE # set to FALSE if you want to skip writing to a SQL table in the results schema

# specify the name of the results table (used when writeToTable = TRUE and when sqlOnlyIncrementalInsert = TRUE)
writeTableName <- "dqdashboard_results"

# write results to a csv file? -----------------------------------------------------------------------
writeToCsv <- FALSE # set to FALSE if you want to skip writing to csv file
csvFile <- "" # only needed if writeToCsv is set to TRUE

# if writing to table and using Redshift, bulk loading can be initialized -------------------------------

# Sys.setenv("AWS_ACCESS_KEY_ID" = "",
# "AWS_SECRET_ACCESS_KEY" = "",
# "AWS_DEFAULT_REGION" = "",
@@ -67,46 +84,46 @@ csvFile <- "" # only needed if writeToCsv is set to TRUE
checkLevels <- c("TABLE", "FIELD", "CONCEPT")
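
# For example, to run only the table- and field-level checks and skip the CONCEPT-level
# checks (an illustrative subset):
#
# checkLevels <- c("TABLE", "FIELD")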

# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3.1_Check_Desciptions.csv
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv

# want to EXCLUDE a pre-specified list of checks? run the following code:
#
# checksToExclude <- c() # Names of check types to exclude from your DQD run
# allChecks <- DataQualityDashboard::listDqChecks()
# checkNames <- allChecks$checkDescriptions %>%
# subset(!(checkName %in% checksToExclude)) %>%
# select(checkName)
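
# A concrete sketch of the exclusion pattern above -- the two check names are only examples,
# and the %>% pipe assumes dplyr is attached:
#
# library(dplyr)
# checksToExclude <- c("measurePersonCompleteness", "plausibleTemporalAfter")
# allChecks <- DataQualityDashboard::listDqChecks()
# checkNames <- allChecks$checkDescriptions %>%
#   subset(!(checkName %in% checksToExclude)) %>%
#   select(checkName)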

# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c()
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables

# run the job --------------------------------------------------------------------------------------
DataQualityDashboard::executeDqChecks(
connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema,
cdmSourceName = cdmSourceName,
cdmVersion = cdmVersion
numThreads = numThreads,
sqlOnly = sqlOnly,
outputFolder = outputFolder,
outputFile = outputFile,
verboseMode = verboseMode,
writeToTable = writeToTable,
writeToCsv = writeToCsv,
csvFile = csvFile,
checkLevels = checkLevels,
tablesToExclude = tablesToExclude,
checkNames = checkNames
)
DataQualityDashboard::executeDqChecks(connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema,
cdmSourceName = cdmSourceName,
cdmVersion = cdmVersion,
numThreads = numThreads,
sqlOnly = sqlOnly,
sqlOnlyUnionCount = sqlOnlyUnionCount,
sqlOnlyIncrementalInsert = sqlOnlyIncrementalInsert,
outputFolder = outputFolder,
outputFile = outputFile,
verboseMode = verboseMode,
writeToTable = writeToTable,
writeToCsv = writeToCsv,
csvFile = csvFile,
checkLevels = checkLevels,
tablesToExclude = tablesToExclude,
checkNames = checkNames)

# inspect logs ----------------------------------------------------------------------------
ParallelLogger::launchLogViewer(
logFileName = file.path(outputFolder,
sprintf("log_DqDashboard_%s.txt", cdmSourceName))
)

# View the Data Quality Dashboard using the integrated shiny application ------------------------------------
DataQualityDashboard::viewDqDashboard(
jsonPath = file.path(getwd(), outputFolder, outputFile)
)
ParallelLogger::launchLogViewer(logFileName = file.path(outputFolder, cdmSourceName,
sprintf("log_DqDashboard_%s.txt", cdmSourceName)))

# (OPTIONAL) if you want to write the JSON file to the results table separately -----------------------------
jsonFilePath <- "" # put the path to the outputted JSON file
DataQualityDashboard::writeJsonResultsToTable(
connectionDetails = connectionDetails,
resultsDatabaseSchema = resultsDatabaseSchema,
jsonFilePath = jsonFilePath
)
jsonFilePath <- ""
DataQualityDashboard::writeJsonResultsToTable(connectionDetails = connectionDetails,
resultsDatabaseSchema = resultsDatabaseSchema,
jsonFilePath = jsonFilePath)
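
# As a usage sketch: if the run above kept outputFolder "output" and outputFile "results.json",
# the path would presumably be built like this (adjust to wherever your JSON file was actually written):
#
# jsonFilePath <- file.path(outputFolder, outputFile)
# DataQualityDashboard::writeJsonResultsToTable(connectionDetails = connectionDetails,
#                                               resultsDatabaseSchema = resultsDatabaseSchema,
#                                               jsonFilePath = jsonFilePath)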

8 changes: 8 additions & 0 deletions vignettes/DataQualityDashboard.rmd
@@ -111,6 +111,14 @@ checkLevels <- c("TABLE", "FIELD", "CONCEPT")
# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv

# want to EXCLUDE a pre-specified list of checks? run the following code:
#
# checksToExclude <- c() # Names of check types to exclude from your DQD run
# allChecks <- DataQualityDashboard::listDqChecks()
# checkNames <- allChecks$checkDescriptions %>%
# subset(!(checkName %in% checksToExclude)) %>%
# select(checkName)

# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables
