From d7ae662b953e0df770c604bb77dd495bd4d54553 Mon Sep 17 00:00:00 2001 From: Marla Jahari Date: Fri, 9 Jun 2023 12:42:18 +0530 Subject: [PATCH 1/7] add test function --- src/CRRao.jl | 3 ++- src/kmeans.jl | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 src/kmeans.jl diff --git a/src/CRRao.jl b/src/CRRao.jl index 59c062d..a29f5ce 100644 --- a/src/CRRao.jl +++ b/src/CRRao.jl @@ -397,9 +397,10 @@ export Prior_Ridge, Prior_Laplace, Prior_Cauchy, Prior_TDist, Prior_HorseShoe, P export CRRaoLink, Logit, Probit, Cloglog, Cauchit, fit export coef, coeftable, r2, adjr2, loglikelihood, aic, bic, sigma, predict, residuals, cooksdistance, BPTest, pvalue export FrequentistRegression, BayesianRegression +export myf include("random_number_generator.jl") include("general_stats.jl") include("fitmodel.jl") - +include("kmeans.jl") end diff --git a/src/kmeans.jl b/src/kmeans.jl new file mode 100644 index 0000000..92f1174 --- /dev/null +++ b/src/kmeans.jl @@ -0,0 +1,4 @@ +function myf() + return "This is my second function" +end + From eebbc41c70270c57151f03fbda477e8bfde15d78 Mon Sep 17 00:00:00 2001 From: Marla Jahari Date: Sat, 10 Jun 2023 20:47:05 +0530 Subject: [PATCH 2/7] add gpr functions --- .vscode/launch.json | 17 +++++++++ Project.toml | 1 + src/CRRao.jl | 36 +++++++++++++++++-- src/bayesian/gaussian_processes_regression.jl | 30 ++++++++++++++++ src/kmeans.jl | 4 --- 5 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 src/bayesian/gaussian_processes_regression.jl delete mode 100644 src/kmeans.jl diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..1398655 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,17 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + + { + "type": "judy", + "request": "launch", + "name": "Ask for file name", + "program": "${workspaceFolder}/${command:AskForProgramName}", + "stopOnEntry": false, + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/Project.toml b/Project.toml index 651b150..c6238e6 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["xKDR Forum, Sourish Das"] version = "0.1.0" [deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/src/CRRao.jl b/src/CRRao.jl index a29f5ce..6e97204 100644 --- a/src/CRRao.jl +++ b/src/CRRao.jl @@ -1,11 +1,15 @@ module CRRao + using DataFrames, GLM, Turing, StatsModels, StatsBase using StatsBase, Distributions, LinearAlgebra using Optim, NLSolversBase, Random, HypothesisTests +using GaussianProcesses, Distances,StatsModels + import StatsBase: coef, coeftable, r2, adjr2, loglikelihood, aic, bic, predict, residuals, cooksdistance, fit import HypothesisTests: pvalue + """ ```julia LinearRegression @@ -94,12 +98,37 @@ where """ struct PoissonRegression end +""" +```julia +GaussianProcessesRegression +``` +Type representing the Gaussian Processes Regression model class. + +```math + y = f(X) + \\varepsilon, +``` +where +```math +\\varepsilon \\sim N(0,\\sigma^2), +``` + + ``y`` is the response vector of size ``n``, representing the observed target values, + + ``X`` is the matrix of input variables of size ``n \\times p``, where ``n`` is the sample size and ``p`` is the number of input variables, + + ``f(X)`` is the latent function that represents the underlying unknown relationship between ``X`` and ``y``, + + ``\\varepsilon`` is the noise term that follows a Gaussian distribution with zero mean and variance ``\\sigma^2``. + + ``\\sigma`` is the standard deviation of the noise ``\\varepsilon``. + +The latent function `f(X)` is assumed to follow a Gaussian process, completely specified by its mean function and covariance function. The mean function provides the prior expectation of the latent function, and the covariance function captures the dependence structure between different input points. + +""" +struct GaussianProcessesRegression end + """ ```julia Boot_Residual ``` Type representing Residual Bootstrap. """ + struct Boot_Residual end """ @@ -307,6 +336,8 @@ y_i \\sim D(\\mu_i,\\sigma), i=1,2,\\cdots,n """ struct Prior_HorseShoe end + + """ ```julia CRRaoLink @@ -392,15 +423,14 @@ end Cauchit() = Cauchit(Cauchit_Link) -export LinearRegression, LogisticRegression, PoissonRegression, NegBinomRegression, Boot_Residual +export LinearRegression, LogisticRegression, PoissonRegression, NegBinomRegression, Boot_Residual, GaussianProcessesRegression export Prior_Ridge, Prior_Laplace, Prior_Cauchy, Prior_TDist, Prior_HorseShoe, Prior_Gauss export CRRaoLink, Logit, Probit, Cloglog, Cauchit, fit export coef, coeftable, r2, adjr2, loglikelihood, aic, bic, sigma, predict, residuals, cooksdistance, BPTest, pvalue export FrequentistRegression, BayesianRegression -export myf include("random_number_generator.jl") include("general_stats.jl") include("fitmodel.jl") -include("kmeans.jl") +include("bayesian/gaussian_processes_regression.jl") end diff --git a/src/bayesian/gaussian_processes_regression.jl b/src/bayesian/gaussian_processes_regression.jl new file mode 100644 index 0000000..93f0985 --- /dev/null +++ b/src/bayesian/gaussian_processes_regression.jl @@ -0,0 +1,30 @@ + +function fit(formula, data::DataFrame, modelClass::GaussianProcessesRegression,IndexVar, mean, kern::Kernel, + DistanceClass::Euclidean) + + formula = apply_schema(formula, schema(formula, data), RegressionModel) + select!(data, IndexVar) + y, X = modelcols(formula, data) + logObsNoise = -1.0 + gp = GP(X', y, mean, kern, logObsNoise) + optimize!(gp) + return BayesianRegression(:GaussianProcessesRegression, gp, formula) + +end + + + +function fit(formula, data::DataFrame, modelClass::GaussianProcessesRegression,IndexVar, + DistanceClass::Euclidean) + +formula = apply_schema(formula, schema(formula, data), RegressionModel) +select!(data, IndexVar) +y, X = modelcols(formula, data) +logObsNoise = -1.0 +mean= MeanZero() +kern=SE(0.0,0.0) +gp = GP(X', y, mean, kern, logObsNoise) +optimize!(gp) +return BayesianRegression(:GaussianProcessesRegression, gp, formula) + +end diff --git a/src/kmeans.jl b/src/kmeans.jl deleted file mode 100644 index 92f1174..0000000 --- a/src/kmeans.jl +++ /dev/null @@ -1,4 +0,0 @@ -function myf() - return "This is my second function" -end - From 4f000847210cc26836ec884212fd923b8040597d Mon Sep 17 00:00:00 2001 From: Marla Jahari Date: Sat, 10 Jun 2023 20:59:43 +0530 Subject: [PATCH 3/7] add gpr functions --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index c6238e6..6ad6583 100644 --- a/Project.toml +++ b/Project.toml @@ -6,9 +6,11 @@ version = "0.1.0" [deps] Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" +GaussianProcesses = "891a1506-143c-57d2-908e-e1f8e92e6de9" HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NLSolversBase = "d41bc354-129a-5804-8e4c-c37616107c6c" From 70e12ca2b94d17be14ed0e9339b3de09df5088b6 Mon Sep 17 00:00:00 2001 From: Shahnaz Abdul Hameed <77606931+MarlaJahari@users.noreply.github.com> Date: Sat, 10 Jun 2023 21:45:46 +0530 Subject: [PATCH 4/7] Delete launch.json --- .vscode/launch.json | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 1398655..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - - { - "type": "judy", - "request": "launch", - "name": "Ask for file name", - "program": "${workspaceFolder}/${command:AskForProgramName}", - "stopOnEntry": false, - "console": "integratedTerminal" - } - ] -} \ No newline at end of file From 32983718f5e57d8195760a05da6bc429d30ce7b6 Mon Sep 17 00:00:00 2001 From: Marla Jahari Date: Tue, 13 Jun 2023 00:08:10 +0530 Subject: [PATCH 5/7] Added test cases. --- src/bayesian/gaussian_processes_regression.jl | 126 ++++++++++++++++-- 1 file changed, 114 insertions(+), 12 deletions(-) diff --git a/src/bayesian/gaussian_processes_regression.jl b/src/bayesian/gaussian_processes_regression.jl index 93f0985..7f91502 100644 --- a/src/bayesian/gaussian_processes_regression.jl +++ b/src/bayesian/gaussian_processes_regression.jl @@ -1,6 +1,56 @@ -function fit(formula, data::DataFrame, modelClass::GaussianProcessesRegression,IndexVar, mean, kern::Kernel, - DistanceClass::Euclidean) +""" +```julia + +fit(formula, data::DataFrame, modelClass::GaussianProcessesRegression, IndexVar, mean, kern::Kernel, + DistanceClass::Euclidean) +``` + +Fit a Gaussian Process Regression model on the input data with a Gaussian Process prior and user-specific mean and kernel functions. + +#Example + +```julia-repl + +julia> using CRRao, RDatasets, StatsModels, StatsPlots, GaussianProcesses, Distances +julia> df = dataset("datasets", "mtcars") +32×12 DataFrame + Row │ Model MPG Cyl Disp HP DRat WT QSec VS AM Gear Carb + │ String31 Float64 Int64 Float64 Int64 Float64 Float64 Float64 Int64 Int64 Int64 Int64 +─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────── + 1 │ Mazda RX4 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4 + 2 │ Mazda RX4 Wag 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4 + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ + 31 │ Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8 + 32 │ Volvo 142E 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2 + 28 rows omitted + +julia> container=fit(@formula(MPG ~0+ HP),df,GaussianProcessesRegression(),[:MPG, :HP],MeanZero(), SE(0.0,0.0),Euclidean()) + +Formula: MPG ~ 0 + HP +Link: CRRao.Identity(CRRao.Identity_Link) +Chain: GP Exact object: + Dim = 1 + Number of observations = 32 + Mean function: + Type: MeanZero, Params: Float64[] + Kernel: + Type: SEIso{Float64}, Params: [5.464908573213355, 3.3936838718120708] + Input observations = +[110.0 110.0 … 335.0 109.0] + Output observations = [21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2 … 15.2, 13.3, 19.2, 27.3, 26.0, 30.4, 15.8, 19.7, 15.0, 21.4] + Variance of observation noise = 9.667961411202336 + Marginal Log-Likelihood = -89.745 +julia> plot(container.chain) +``` +""" +function fit(formula, + data::DataFrame, + modelClass::GaussianProcessesRegression, + IndexVar, + mean, + kern::Kernel, + DistanceClass::Euclidean) formula = apply_schema(formula, schema(formula, data), RegressionModel) select!(data, IndexVar) @@ -12,19 +62,71 @@ function fit(formula, data::DataFrame, modelClass::GaussianProcessesRegression,I end +""" + +```julia + +fit(formula, data::DataFrame, modelClass::GaussianProcessesRegression, IndexVar, mean + DistanceClass::Euclidean) +``` + +Fit a Gaussian Process Regression model on the input data with a Gaussian Process prior. The Zero Mean and Squared Exponential kernel function is implemented by default. + +#Example + +```julia-repl +julia> using CRRao, RDatasets, StatsModels, StatsPlots, GaussianProcesses, Distances +julia> df = dataset("datasets", "mtcars") +32×12 DataFrame + Row │ Model MPG Cyl Disp HP DRat WT QSec VS AM Gear Carb + │ String31 Float64 Int64 Float64 Int64 Float64 Float64 Float64 Int64 Int64 Int64 Int64 +─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────── + 1 │ Mazda RX4 21.0 6 160.0 110 3.9 2.62 16.46 0 1 4 4 + 2 │ Mazda RX4 Wag 21.0 6 160.0 110 3.9 2.875 17.02 0 1 4 4 + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ + 31 │ Maserati Bora 15.0 8 301.0 335 3.54 3.57 14.6 0 1 5 8 + 32 │ Volvo 142E 21.4 4 121.0 109 4.11 2.78 18.6 1 1 4 2 + 28 rows omitted -function fit(formula, data::DataFrame, modelClass::GaussianProcessesRegression,IndexVar, +julia> container=fit(@formula(MPG ~0+ HP),df,GaussianProcessesRegression(),[:MPG, :HP],Euclidean()) + +Formula: MPG ~ 0 + HP +Link: CRRao.Identity(CRRao.Identity_Link) +Chain: GP Exact object: + Dim = 1 + Number of observations = 32 + Mean function: + Type: MeanZero, Params: Float64[] + Kernel: + Type: SEIso{Float64}, Params: [5.464908573213355, 3.3936838718120708] + Input observations = +[110.0 110.0 … 335.0 109.0] + Output observations = [21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2 … 15.2, 13.3, 19.2, 27.3, 26.0, 30.4, 15.8, 19.7, 15.0, 21.4] + Variance of observation noise = 9.667961411202336 + Marginal Log-Likelihood = -89.745 + + + +julia> plot(container.chain) +``` +""" + + +function fit(formula, + data::DataFrame, + modelClass::GaussianProcessesRegression, + IndexVar, DistanceClass::Euclidean) -formula = apply_schema(formula, schema(formula, data), RegressionModel) -select!(data, IndexVar) -y, X = modelcols(formula, data) -logObsNoise = -1.0 -mean= MeanZero() -kern=SE(0.0,0.0) -gp = GP(X', y, mean, kern, logObsNoise) -optimize!(gp) -return BayesianRegression(:GaussianProcessesRegression, gp, formula) + formula = apply_schema(formula, schema(formula, data), RegressionModel) + select!(data, IndexVar) + y, X = modelcols(formula, data) + logObsNoise = -1.0 + mean= MeanZero() + kern=SE(0.0,0.0) + gp = GP(X', y, mean, kern, logObsNoise) + optimize!(gp) + return BayesianRegression(:GaussianProcessesRegression, gp, formula) end From 0035598ab70e4ed9cff9f0dd0d54c7666a42a67d Mon Sep 17 00:00:00 2001 From: Shahnaz Abdul Hameed <77606931+MarlaJahari@users.noreply.github.com> Date: Tue, 13 Jun 2023 20:36:29 +0530 Subject: [PATCH 6/7] Update src/CRRao.jl Space after comma Co-authored-by: Ayush Patnaik --- src/CRRao.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CRRao.jl b/src/CRRao.jl index 6e97204..def6448 100644 --- a/src/CRRao.jl +++ b/src/CRRao.jl @@ -4,7 +4,7 @@ module CRRao using DataFrames, GLM, Turing, StatsModels, StatsBase using StatsBase, Distributions, LinearAlgebra using Optim, NLSolversBase, Random, HypothesisTests -using GaussianProcesses, Distances,StatsModels +using GaussianProcesses, Distances, StatsModels import StatsBase: coef, coeftable, r2, adjr2, loglikelihood, aic, bic, predict, residuals, cooksdistance, fit import HypothesisTests: pvalue From 8e7dbda152de3f671f4d29695a3373b83848bb2c Mon Sep 17 00:00:00 2001 From: Marla Jahari Date: Thu, 20 Jul 2023 16:28:58 +0530 Subject: [PATCH 7/7] Added test cases for GPR --- test/Project.toml | 2 ++ test/numerical/bayesian/GaussianProcessesRegression.jl | 8 ++++++++ test/runtests.jl | 8 +++++++- 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 test/numerical/bayesian/GaussianProcessesRegression.jl diff --git a/test/Project.toml b/test/Project.toml index 6b30ee2..d435824 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,7 @@ [deps] +Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" +GaussianProcesses = "891a1506-143c-57d2-908e-e1f8e92e6de9" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" diff --git a/test/numerical/bayesian/GaussianProcessesRegression.jl b/test/numerical/bayesian/GaussianProcessesRegression.jl new file mode 100644 index 0000000..1434b71 --- /dev/null +++ b/test/numerical/bayesian/GaussianProcessesRegression.jl @@ -0,0 +1,8 @@ +mtcars = dataset("datasets", "mtcars") + +CRRao.set_rng(StableRNG(123)) +model = fit(@formula(MPG ~ 0 + HP), mtcars, GaussianProcessesRegression(), [:MPG, :HP], MeanZero(), SE(0.0, 0.0), Euclidean()) + +@test get_params(model.chain)[2:3] ≈ [5.464908573213355, 3.3936838718120708] +@test noise_variance(model.chain) ≈ 9.667961411202336 +@test model.chain.target ≈ -89.74473360543863 \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 2d9d5aa..afd6eba 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ -using CRRao, Test, StableRNGs, Logging, RDatasets, StatsModels, GLM, Statistics +using CRRao, Test, StableRNGs, Logging, RDatasets, StatsModels, GLM, Statistics, Distances +using GaussianProcesses: MeanZero, SE, get_params, noise_variance Logging.disable_logging(Logging.Warn) @@ -34,6 +35,10 @@ CRRao.set_rng(StableRNG(123)) end @testset "Bayesian" begin + @testset "Gaussian Processes Regression" begin + include("numerical/bayesian/GaussianProcessesRegression.jl") + end + @testset "Linear Regression" begin include("numerical/bayesian/LinearRegression.jl") end @@ -52,3 +57,4 @@ CRRao.set_rng(StableRNG(123)) end end end +