diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
index 27f4091b..b161782d 100644
--- a/.github/workflows/Benchmark.yml
+++ b/.github/workflows/Benchmark.yml
@@ -13,6 +13,7 @@ concurrency:
 permissions:
   contents: write
   pull-requests: write
+  issues: write
 
 jobs:
   benchmark:
@@ -47,10 +48,10 @@ jobs:
           name: Benchmark Results
           tool: 'julia'
           output-file-path: bench/benchmark_results.json
-          summary-always: true
+          summary-always: ${{ !github.event.pull_request.head.repo.fork }} # Disable summary for PRs from forks
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          comment-always: true
           alert-threshold: "200%"
           fail-on-alert: true
           benchmark-data-dir-path: benchmarks
+          comment-always: ${{ !github.event.pull_request.head.repo.fork }} # Disable comments for PRs from forks
           auto-push: ${{ !github.event.pull_request.head.repo.fork }} # Disable push for PRs from forks
diff --git a/Project.toml b/Project.toml
index 62f7a550..3219992c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -42,7 +42,7 @@ Accessors = "0.1"
 Bijectors = "0.13"
 ChainRulesCore = "1.16"
 DiffResults = "1"
-Distributions = "0.25.87"
+Distributions = "0.25.111"
 DocStringExtensions = "0.8, 0.9"
 Enzyme = "0.12.32"
 FillArrays = "1.3"
diff --git a/docs/make.jl b/docs/make.jl
index b71d9a4f..c70bf05f 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -16,8 +16,8 @@ makedocs(;
         "ELBO Maximization" => [
             "Overview" => "elbo/overview.md",
             "Reparameterization Gradient Estimator" => "elbo/repgradelbo.md",
-            "Location-Scale Variational Family" => "locscale.md",
         ],
+        "Variational Families" => "families.md",
         "Optimization" => "optimization.md",
     ],
 )
diff --git a/docs/src/families.md b/docs/src/families.md
new file mode 100644
index 00000000..e270acad
--- /dev/null
+++ b/docs/src/families.md
@@ -0,0 +1,267 @@
+# [Reparameterizable Variational Families](@id families)
+
+The [RepGradELBO](@ref repgradelbo) objective assumes that the members of the variational family have a differentiable sampling path.
+We provide multiple pre-packaged variational families that can be readily used.
+
+## [The `LocationScale` Family](@id locscale)
+
+The [location-scale](https://en.wikipedia.org/wiki/Location%E2%80%93scale_family) variational family is a family of probability distributions, where their sampling process can be represented as
+
+```math
+z \sim  q_{\lambda} \qquad\Leftrightarrow\qquad
+z \stackrel{d}{=} C u + m;\quad u \sim \varphi
+```
+
+where ``C`` is the *scale*, ``m`` is the location, and ``\varphi`` is the *base distribution*.
+``m`` and ``C`` form the variational parameters ``\lambda = (m, C)`` of ``q_{\lambda}``.
+The location-scale family encompasses many practical variational families, which can be instantiated by setting the *base distribution* of ``u`` and the structure of ``C``.
+
+The probability density is given by
+
+```math
+  q_{\lambda}(z) = {|C|}^{-1} \varphi(C^{-1}(z - m)),
+```
+
+the covariance is given as
+
+```math
+  \mathrm{Var}\left(q_{\lambda}\right) = C \mathrm{Var}(\varphi) C^{\top}
+```
+
+and the entropy is given as
+
+```math
+  \mathbb{H}(q_{\lambda}) = \mathbb{H}(\varphi) + \log |C|,
+```
+
+where ``\mathbb{H}(\varphi)`` is the entropy of the base distribution.
+Notice that ``\mathbb{H}(\varphi)`` does not depend on ``\log |C|``.
+The derivative of the entropy with respect to ``\lambda`` is thus independent of the base distribution.
+
+### API
+
+!!! note
+    
+    For stable convergence, the initial `scale` needs to be sufficiently large and well-conditioned.
+    Initializing `scale` to have small eigenvalues will often result in initial divergences and numerical instabilities.
+
+```@docs
+MvLocationScale
+```
+
+The following are specialized constructors for convenience:
+
+```@docs
+FullRankGaussian
+MeanFieldGaussian
+```
+
+### Gaussian Variational Families
+
+```julia
+using AdvancedVI, LinearAlgebra, Distributions;
+μ = zeros(2);
+
+L = LowerTriangular(diagm(ones(2)));
+q = FullRankGaussian(μ, L)
+
+L = Diagonal(ones(2));
+q = MeanFieldGaussian(μ, L)
+```
+
+### Student-$$t$$ Variational Families
+
+```julia
+using AdvancedVI, LinearAlgebra, Distributions;
+μ = zeros(2);
+ν = 3;
+
+# Full-Rank 
+L = LowerTriangular(diagm(ones(2)));
+q = MvLocationScale(μ, L, TDist(ν))
+
+# Mean-Field
+L = Diagonal(ones(2));
+q = MvLocationScale(μ, L, TDist(ν))
+```
+
+### Laplace Variational families
+
+```julia
+using AdvancedVI, LinearAlgebra, Distributions;
+μ = zeros(2);
+
+# Full-Rank 
+L = LowerTriangular(diagm(ones(2)));
+q = MvLocationScale(μ, L, Laplace())
+
+# Mean-Field
+L = Diagonal(ones(2));
+q = MvLocationScale(μ, L, Laplace())
+```
+
+## The `LocationScaleLowRank` Family
+
+In practice, `LocationScale` families with full-rank scale matrices are known to converge slowly as they require a small SGD stepsize.
+Low-rank variational families can be an effective alternative[^ONS2018].
+`LocationScaleLowRank` generally represents any ``d``-dimensional distribution whose sampling path can be represented as
+
+```math
+z \sim  q_{\lambda} \qquad\Leftrightarrow\qquad
+z \stackrel{d}{=} D u_1 + U u_2  + m;\quad u_1, u_2 \sim \varphi
+```
+
+where ``D \in \mathbb{R}^{d \times d}`` is a diagonal matrix, ``U \in \mathbb{R}^{d \times r}`` is a dense low-rank matrix for the rank ``r > 0``, ``m \in \mathbb{R}^d`` is the location, and ``\varphi`` is the *base distribution*.
+``m``, ``D``, and ``U`` form the variational parameters ``\lambda = (m, D, U)``.
+
+The covariance of this distribution is given as
+
+```math
+  \mathrm{Var}\left(q_{\lambda}\right) = D \mathrm{Var}(\varphi) D + U \mathrm{Var}(\varphi) U^{\top}
+```
+
+and the entropy is given by the matrix determinant lemma as
+
+```math
+  \mathbb{H}(q_{\lambda})
+  = \mathbb{H}(\varphi) + \frac{1}{2} \log |\Sigma|
+  = \mathbb{H}(\varphi) + \log |D| + \frac{1}{2} \log |I + U^{\top} D^{-2} U|,
+```
+
+where ``\Sigma = D^2 + U U^{\top}`` and ``\mathbb{H}(\varphi)`` is the entropy of the base distribution.
+
+```@setup lowrank
+using ADTypes
+using AdvancedVI
+using Distributions
+using LinearAlgebra
+using LogDensityProblems
+using Optimisers
+using Plots
+using ReverseDiff
+
+struct Target{D}
+    dist::D
+end
+
+function LogDensityProblems.logdensity(model::Target, θ)
+    logpdf(model.dist, θ)
+end
+
+function LogDensityProblems.dimension(model::Target)
+    return length(model.dist)
+end
+
+function LogDensityProblems.capabilities(::Type{<:Target})
+    return LogDensityProblems.LogDensityOrder{0}()
+end
+
+n_dims     = 30
+U_true     = randn(n_dims, 3)
+D_true     = Diagonal(log.(1 .+ exp.(randn(n_dims))))
+Σ_true     = D_true + U_true*U_true'
+Σsqrt_true = sqrt(Σ_true)
+μ_true     = randn(n_dims)
+model      = Target(MvNormal(μ_true, Σ_true));
+
+d  = LogDensityProblems.dimension(model);
+μ  = zeros(d);
+
+L     = Diagonal(ones(d));
+q0_mf = MeanFieldGaussian(μ, L)
+
+L     = LowerTriangular(diagm(ones(d)));
+q0_fr = FullRankGaussian(μ, L)
+
+D     = ones(n_dims)
+U     = zeros(n_dims, 3)
+q0_lr = LowRankGaussian(μ, D, U)
+
+obj = RepGradELBO(1);
+
+max_iter = 10^4
+
+function callback(; params, averaged_params, restructure, stat, kwargs...)
+    q = restructure(averaged_params)
+    μ, Σ = mean(q), cov(q)
+    (dist2 = sum(abs2, μ - μ_true) + tr(Σ + Σ_true - 2*sqrt(Σsqrt_true*Σ*Σsqrt_true)),)
+end
+
+_, _, stats_fr, _ = AdvancedVI.optimize(
+    model,
+    obj,
+    q0_fr,
+    max_iter;
+    show_progress = false,
+    adtype        = AutoReverseDiff(),
+    optimizer     = Adam(0.01),
+    averager      = PolynomialAveraging(),
+    callback      = callback,
+); 
+
+_, _, stats_mf, _ = AdvancedVI.optimize(
+    model,
+    obj,
+    q0_mf,
+    max_iter;
+    show_progress = false,
+    adtype        = AutoReverseDiff(),
+    optimizer     = Adam(0.01),
+    averager      = PolynomialAveraging(),
+    callback      = callback,
+); 
+
+_, _, stats_lr, _ = AdvancedVI.optimize(
+    model,
+    obj,
+    q0_lr,
+    max_iter;
+    show_progress = false,
+    adtype        = AutoReverseDiff(),
+    optimizer     = Adam(0.01),
+    averager      = PolynomialAveraging(),
+    callback      = callback,
+); 
+
+t       = [stat.iteration for stat in stats_fr]
+dist_fr = [sqrt(stat.dist2) for stat in stats_fr]
+dist_mf = [sqrt(stat.dist2) for stat in stats_mf]
+dist_lr = [sqrt(stat.dist2) for stat in stats_lr]
+plot( t, dist_mf , label="Mean-Field Gaussian", xlabel="Iteration", ylabel="Wasserstein-2 Distance")
+plot!(t, dist_fr,  label="Full-Rank Gaussian",  xlabel="Iteration", ylabel="Wasserstein-2 Distance")
+plot!(t, dist_lr,  label="Low-Rank Gaussian",   xlabel="Iteration", ylabel="Wasserstein-2 Distance")
+savefig("lowrank_family_wasserstein.svg")
+nothing
+```
+
+Consider a 30-dimensional Gaussian with a diagonal plus low-rank covariance structure, where the true rank is 3.
+Then, we can compare the convergence speed of `LowRankGaussian` versus `FullRankGaussian`:
+
+![](lowrank_family_wasserstein.svg)
+
+As we can see, `LowRankGaussian` converges faster than `FullRankGaussian`.
+While `FullRankGaussian` can converge to the true solution since it is a more expressive variational family, `LowRankGaussian` gets there faster.
+
+!!! info
+    
+    `MvLocationScaleLowRank` tends to work better with the `Optimisers.Adam` optimizer due to non-smoothness.
+    Other optimisers may experience divergences.
+
+### API
+
+```@docs
+MvLocationScaleLowRank
+```
+
+The `logpdf` of  `MvLocationScaleLowRank` has an optional argument `non_differentiable::Bool` (default: `false`).
+If set as `true`, a more efficient ``O\left(r d^2\right)`` implementation is used to evaluate the density.
+This, however, is not differentiable under most AD frameworks due to the use of Cholesky `lowrankupdate`.
+The default value is `false`, which uses a ``O\left(d^3\right)`` implementation that is differentiable and therefore compatible with the `StickingTheLandingEntropy` estimator.
+
+The following is a specialized constructor for convenience:
+
+```@docs
+LowRankGaussian
+```
+
+[^ONS2018]: Ong, V. M. H., Nott, D. J., & Smith, M. S. (2018). Gaussian variational approximation with a factor covariance structure. Journal of Computational and Graphical Statistics, 27(3), 465-478.
diff --git a/docs/src/locscale.md b/docs/src/locscale.md
deleted file mode 100644
index 643c3a98..00000000
--- a/docs/src/locscale.md
+++ /dev/null
@@ -1,80 +0,0 @@
-
-# [Location-Scale Variational Family](@id locscale)
-
-## Introduction
-The [location-scale](https://en.wikipedia.org/wiki/Location%E2%80%93scale_family) variational family is a family of probability distributions, where their sampling process can be represented as
-```math
-z \sim  q_{\lambda} \qquad\Leftrightarrow\qquad
-z \stackrel{d}{=} C u + m;\quad u \sim \varphi
-```
-where ``C`` is the *scale*, ``m`` is the location, and ``\varphi`` is the *base distribution*.
-``m`` and ``C`` form the variational parameters ``\lambda = (m, C)`` of ``q_{\lambda}``. 
-The location-scale family encompases many practical variational families, which can be instantiated by setting the *base distribution* of ``u`` and the structure of ``C``.
-
-The probability density is given by
-```math
-  q_{\lambda}(z) = {|C|}^{-1} \varphi(C^{-1}(z - m))
-```
-and the entropy is given as
-```math
-  \mathbb{H}(q_{\lambda}) = \mathbb{H}(\varphi) + \log |C|,
-```
-where ``\mathbb{H}(\varphi)`` is the entropy of the base distribution.
-Notice the ``\mathbb{H}(\varphi)`` does not depend on ``\log |C|``.
-The derivative of the entropy with respect to ``\lambda`` is thus independent of the base distribution.
-
-## Constructors
-
-!!! note
-    For stable convergence, the initial `scale` needs to be sufficiently large and well-conditioned. 
-    Initializing `scale` to have small eigenvalues will often result in initial divergences and numerical instabilities.
-
-```@docs
-MvLocationScale
-```
-
-```@docs
-FullRankGaussian
-MeanFieldGaussian
-```
-
-## Gaussian Variational Families
-```julia
-using AdvancedVI, LinearAlgebra, Distributions;
-μ = zeros(2);
-
-L = diagm(ones(2)) |> LowerTriangular;
-q = FullRankGaussian(μ, L)
-
-L = ones(2) |> Diagonal;
-q = MeanFieldGaussian(μ, L)
-```
-
-## Sudent-$$t$$ Variational Families
-```julia
-using AdvancedVI, LinearAlgebra, Distributions;
-μ = zeros(2);
-ν = 3;
-
-# Full-Rank 
-L = diagm(ones(2)) |> LowerTriangular;
-q = MvLocationScale(μ, L, TDist(ν))
-
-# Mean-Field
-L = ones(2) |> Diagonal;
-q = MvLocationScale(μ, L, TDist(ν))
-```
-
-## Laplace Variational families
-```julia
-using AdvancedVI, LinearAlgebra, Distributions;
-μ = zeros(2);
-
-# Full-Rank 
-L = diagm(ones(2)) |> LowerTriangular;
-q = MvLocationScale(μ, L, Laplace())
-
-# Mean-Field
-L = ones(2) |> Diagonal;
-q = MvLocationScale(μ, L, Laplace())
-```
diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl
index 8ac1b645..5402e075 100644
--- a/src/AdvancedVI.jl
+++ b/src/AdvancedVI.jl
@@ -180,6 +180,10 @@ export MvLocationScale, MeanFieldGaussian, FullRankGaussian
 
 include("families/location_scale.jl")
 
+export MvLocationScaleLowRank, LowRankGaussian
+
+include("families/location_scale_low_rank.jl")
+
 # Optimization Rules
 
 include("optimization/rules.jl")
diff --git a/src/families/location_scale.jl b/src/families/location_scale.jl
index 1aab2e71..22af4b4a 100644
--- a/src/families/location_scale.jl
+++ b/src/families/location_scale.jl
@@ -1,6 +1,14 @@
 
+struct MvLocationScale{S,D<:ContinuousDistribution,L,E<:Real} <:
+       ContinuousMultivariateDistribution
+    location::L
+    scale::S
+    dist::D
+    scale_eps::E
+end
+
 """
-    MvLocationScale(location, scale, dist) <: ContinuousMultivariateDistribution
+    MvLocationScale(location, scale, dist; scale_eps)
 
 The location scale variational family broadly represents various variational
 families using `location` and `scale` variational parameters.
@@ -12,21 +20,20 @@ represented as follows:
   u = rand(dist, d)
   z = scale*u + location
 ```
-"""
-struct MvLocationScale{S,D<:ContinuousDistribution,L,E<:Real} <:
-       ContinuousMultivariateDistribution
-    location::L
-    scale::S
-    dist::D
-    scale_eps::E
-end
 
+`scale_eps` sets a constraint on the smallest value of `scale` to be enforced during optimization.
+This is necessary to guarantee stable convergence.
+
+# Keyword Arguments
+- `scale_eps`: Lower bound constraint for the diagonal of the scale. (default: `1e-4`).
+"""
 function MvLocationScale(
     location::AbstractVector{T},
     scale::AbstractMatrix{T},
-    dist::ContinuousDistribution;
-    scale_eps::T=sqrt(eps(T)),
+    dist::ContinuousUnivariateDistribution;
+    scale_eps::T=T(1e-4),
 ) where {T<:Real}
+    @assert minimum(diag(scale)) ≥ scale_eps "Initial scale is too small (smallest diagonal value is $(minimum(diag(scale)))). This might result in unstable optimization behavior."
     return MvLocationScale(location, scale, dist, scale_eps)
 end
 
@@ -37,8 +44,8 @@ Functors.@functor MvLocationScale (location, scale)
 # `scale <: Diagonal`, which is not the default behavior. Otherwise, forward-mode AD
 # is very inefficient.
 # begin
-struct RestructureMeanField{S<:Diagonal,D,L}
-    model::MvLocationScale{S,D,L}
+struct RestructureMeanField{S<:Diagonal,D,L,E}
+    model::MvLocationScale{S,D,L,E}
 end
 
 function (re::RestructureMeanField)(flat::AbstractVector)
@@ -48,7 +55,7 @@ function (re::RestructureMeanField)(flat::AbstractVector)
     return MvLocationScale(location, scale, re.model.dist, re.model.scale_eps)
 end
 
-function Optimisers.destructure(q::MvLocationScale{<:Diagonal,D,L}) where {D,L}
+function Optimisers.destructure(q::MvLocationScale{<:Diagonal,D,L,E}) where {D,L,E}
     @unpack location, scale, dist = q
     flat = vcat(location, diag(scale))
     return flat, RestructureMeanField(q)
@@ -59,7 +66,7 @@ Base.length(q::MvLocationScale) = length(q.location)
 
 Base.size(q::MvLocationScale) = size(q.location)
 
-Base.eltype(::Type{<:MvLocationScale{S,D,L}}) where {S,D,L} = eltype(D)
+Base.eltype(::Type{<:MvLocationScale{S,D,L,E}}) where {S,D,L,E} = eltype(D)
 
 function StatsBase.entropy(q::MvLocationScale)
     @unpack location, scale, dist = q
@@ -106,54 +113,57 @@ function Distributions._rand!(
     return x .+= location
 end
 
-Distributions.mean(q::MvLocationScale) = q.location
+function Distributions.mean(q::MvLocationScale)
+    @unpack location, scale = q
+    return location + scale * Fill(mean(q.dist), length(location))
+end
 
 function Distributions.var(q::MvLocationScale)
     C = q.scale
-    return Diagonal(C * C')
+    σ2 = var(q.dist)
+    return σ2 * diag(C * C')
 end
 
 function Distributions.cov(q::MvLocationScale)
     C = q.scale
-    return Hermitian(C * C')
+    σ2 = var(q.dist)
+    return σ2 * Hermitian(C * C')
 end
 
 """
-    FullRankGaussian(location, scale; check_args = true)
+    FullRankGaussian(μ, L; scale_eps)
 
 Construct a Gaussian variational approximation with a dense covariance matrix.
 
 # Arguments
-- `location::AbstractVector{T}`: Mean of the Gaussian.
-- `scale::LinearAlgebra.AbstractTriangular{T}`: Cholesky factor of the covariance of the Gaussian.
+- `μ::AbstractVector{T}`: Mean of the Gaussian.
+- `L::LinearAlgebra.AbstractTriangular{T}`: Cholesky factor of the covariance of the Gaussian.
 
 # Keyword Arguments
-- `check_args`: Check the conditioning of the initial scale (default: `true`).
+- `scale_eps`: Smallest value allowed for the diagonal of the scale. (default: `1e-4`).
 """
 function FullRankGaussian(
-    μ::AbstractVector{T}, L::LinearAlgebra.AbstractTriangular{T}; scale_eps::T=sqrt(eps(T))
+    μ::AbstractVector{T}, L::LinearAlgebra.AbstractTriangular{T}; scale_eps::T=T(1e-4)
 ) where {T<:Real}
-    @assert minimum(diag(L)) ≥ sqrt(scale_eps) "Initial scale is too small (smallest diagonal value is $(minimum(diag(L)))). This might result in unstable optimization behavior."
     q_base = Normal{T}(zero(T), one(T))
-    return MvLocationScale(μ, L, q_base, scale_eps)
+    return MvLocationScale(μ, L, q_base; scale_eps) # keyword form checks the initial scale
 end
 
 """
-    MeanFieldGaussian(location, scale; check_args = true)
+    MeanFieldGaussian(μ, L; scale_eps)
 
 Construct a Gaussian variational approximation with a diagonal covariance matrix.
 
 # Arguments
-- `location::AbstractVector{T}`: Mean of the Gaussian.
-- `scale::Diagonal{T}`: Diagonal Cholesky factor of the covariance of the Gaussian.
+- `μ::AbstractVector{T}`: Mean of the Gaussian.
+- `L::Diagonal{T}`: Diagonal Cholesky factor of the covariance of the Gaussian.
 
 # Keyword Arguments
-- `check_args`: Check the conditioning of the initial scale (default: `true`).
+- `scale_eps`: Smallest value allowed for the diagonal of the scale. (default: `1e-4`).
 """
 function MeanFieldGaussian(
-    μ::AbstractVector{T}, L::Diagonal{T}; scale_eps::T=sqrt(eps(T))
+    μ::AbstractVector{T}, L::Diagonal{T}; scale_eps::T=T(1e-4)
 ) where {T<:Real}
-    @assert minimum(diag(L)) ≥ sqrt(eps(eltype(L))) "Initial scale is too small (smallest diagonal value is $(minimum(diag(L)))). This might result in unstable optimization behavior."
     q_base = Normal{T}(zero(T), one(T))
-    return MvLocationScale(μ, L, q_base, scale_eps)
+    return MvLocationScale(μ, L, q_base; scale_eps) # keyword form checks the initial scale
 end
diff --git a/src/families/location_scale_low_rank.jl b/src/families/location_scale_low_rank.jl
new file mode 100644
index 00000000..e2044142
--- /dev/null
+++ b/src/families/location_scale_low_rank.jl
@@ -0,0 +1,176 @@
+
+struct MvLocationScaleLowRank{
+    L,SD<:AbstractVector,SF<:AbstractMatrix,D<:ContinuousDistribution,E<:Real
+} <: ContinuousMultivariateDistribution
+    location::L # location (shift) of the distribution
+    scale_diag::SD # diagonal part of the scale
+    scale_factors::SF # low-rank factors of the scale; the rank is the number of columns
+    dist::D # base distribution the noise is drawn from
+    scale_eps::E # lower bound on `scale_diag` enforced during optimization
+end
+
+"""
+    MvLocationScaleLowRank(location, scale_diag, scale_factors, dist; scale_eps)
+
+Variational family with a covariance in the form of a diagonal matrix plus a squared low-rank matrix.
+The rank is given by `size(scale_factors, 2)`.
+
+It generally represents any distribution for which the sampling path can be
+represented as follows:
+```julia
+  d = length(location)
+  r = size(scale_factors, 2)
+  u_diag = rand(dist, d)
+  u_factors = rand(dist, r)
+  z = scale_diag.*u_diag + scale_factors*u_factors + location
+```
+
+`scale_eps` sets a constraint on the smallest value of `scale_diag` to be enforced during optimization.
+This is necessary to guarantee stable convergence.
+
+# Keyword Arguments
+- `scale_eps`: Lower bound constraint for the values of `scale_diag`. (default: `1e-4`).
+"""
+function MvLocationScaleLowRank(
+    location::AbstractVector{T},
+    scale_diag::AbstractVector{T},
+    scale_factors::AbstractMatrix{T},
+    dist::ContinuousUnivariateDistribution;
+    scale_eps::T=T(1e-4),
+) where {T<:Real}
+    @assert minimum(scale_diag) ≥ scale_eps "Initial scale is too small (smallest diagonal scale value is $(minimum(scale_diag))). This might result in unstable optimization behavior."
+    @assert size(scale_factors, 1) == length(scale_diag) "The number of rows of scale_factors must match the length of scale_diag."
+    return MvLocationScaleLowRank(location, scale_diag, scale_factors, dist, scale_eps)
+end
+
+Functors.@functor MvLocationScaleLowRank (location, scale_diag, scale_factors)
+
+Base.length(q::MvLocationScaleLowRank) = length(q.location)
+
+Base.size(q::MvLocationScaleLowRank) = size(q.location)
+
+Base.eltype(::Type{<:MvLocationScaleLowRank{L,SD,SF,D,E}}) where {L,SD,SF,D,E} = eltype(L)
+
+function StatsBase.entropy(q::MvLocationScaleLowRank)
+    @unpack location, scale_diag, scale_factors, dist = q
+    n_dims = length(location)
+    scale_diag2 = scale_diag .* scale_diag # elementwise squares, i.e. diag(D^2)
+    UtDinvU = Hermitian(scale_factors' * (scale_factors ./ scale_diag2)) # U' D^-2 U
+    logdetΣ = 2 * sum(log.(scale_diag)) + logdet(I + UtDinvU) # logdet(D^2 + U*U')
+    return n_dims * convert(eltype(location), entropy(dist)) + logdetΣ / 2
+end
+
+function Distributions.logpdf(
+    q::MvLocationScaleLowRank,
+    z::AbstractVector{<:Real};
+    non_differentiable::Bool=false,
+    non_differntiable::Bool=non_differentiable, # misspelled alias kept for compatibility
+)
+    @unpack location, scale_diag, scale_factors, dist = q
+    μ_base = mean(dist)
+    n_dims = length(location)
+    scale2chol = if non_differntiable # the alias defaults to `non_differentiable`
+        # Fast O(r d^2) path (not supported by most current AD frameworks). The covariance
+        # is Diagonal(scale_diag .^ 2) + U*U', so the initial Cholesky factor is scale_diag.
+        chol = Cholesky(LowerTriangular(diagm(scale_diag)))
+        for k in axes(scale_factors, 2)
+            lowrankupdate!(chol, scale_factors[:, k]) # slice copies; the update mutates it
+        end
+        chol
+    else
+        # Slow but differentiable O(d^3) path
+        cholesky(Diagonal(scale_diag .* scale_diag) + scale_factors * scale_factors')
+    end
+    z_std = z - mean(q) + scale2chol.L * Fill(μ_base, n_dims)
+    return sum(Base.Fix1(logpdf, dist), scale2chol.L \ z_std) - logdet(scale2chol.L)
+end
+
+function Distributions.rand(q::MvLocationScaleLowRank)
+    # Single draw without an explicit RNG: diagonal part + low-rank part + location.
+    @unpack location, scale_diag, scale_factors, dist = q
+    noise_diag = rand(dist, length(location))
+    noise_fact = rand(dist, size(scale_factors, 2))
+    diag_part, fact_part = scale_diag .* noise_diag, scale_factors * noise_fact
+    return diag_part + fact_part + location
+end
+
+function Distributions.rand(
+    rng::AbstractRNG, q::MvLocationScaleLowRank{S,D,L}, num_samples::Int
+) where {S,D,L}
+    # Batched draw: each column of the returned matrix is one sample from `q`.
+    @unpack location, scale_diag, scale_factors, dist = q
+    noise_diag = rand(rng, dist, length(location), num_samples)
+    noise_fact = rand(rng, dist, size(scale_factors, 2), num_samples)
+    scaled = scale_diag .* noise_diag + scale_factors * noise_fact
+    return scaled .+ location
+end
+
+function Distributions._rand!(
+    rng::AbstractRNG, q::MvLocationScaleLowRank, x::AbstractVecOrMat{<:Real}
+)
+    @unpack location, scale_diag, scale_factors, dist = q
+    # Fill `x` in place with base-distribution noise, then scale row `i` by `scale_diag[i]`.
+    rand!(rng, dist, x)
+    x[:] = scale_diag .* x
+    # Add the low-rank component, driven by a separate noise draw (one column per sample).
+    u_fact = rand(rng, dist, size(scale_factors, 2), size(x, 2))
+    x[:, :] += scale_factors * u_fact
+    # Shift by the location in place and return `x`.
+    return x .+= location
+end
+
+function Distributions.mean(q::MvLocationScaleLowRank)
+    @unpack location, scale_diag, scale_factors, dist = q
+    base_mean = mean(dist) # mean shared by every coordinate of the base noise
+    diag_shift = scale_diag .* Fill(base_mean, length(scale_diag))
+    fact_shift = scale_factors * Fill(base_mean, size(scale_factors, 2))
+    return location + diag_shift + fact_shift
+end
+
+function Distributions.var(q::MvLocationScaleLowRank)
+    @unpack scale_diag, scale_factors, dist = q
+    diag_sq = scale_diag .* scale_diag
+    fact_sq = sum(scale_factors .* scale_factors; dims=2)[:, 1] # row-wise squared norms
+    return var(dist) * (diag_sq + fact_sq)
+end
+
+function Distributions.cov(q::MvLocationScaleLowRank)
+    @unpack scale_diag, scale_factors, dist = q
+    scale2 = Diagonal(scale_diag .* scale_diag) + scale_factors * scale_factors'
+    return var(dist) * scale2 # base variance times (D^2 + U*U')
+end
+
+function update_variational_params!(
+    ::Type{<:MvLocationScaleLowRank}, opt_st, params, restructure, grad
+)
+    opt_st, params = Optimisers.update!(opt_st, params, grad)
+    q = restructure(params)
+    ϵ = q.scale_eps
+
+    # Clip the diagonal from below at `scale_eps` to guarantee positive definite covariance
+    @. q.scale_diag = max(q.scale_diag, ϵ)
+    # Flatten the projected distribution back into a parameter vector.
+    params, _ = Optimisers.destructure(q)
+
+    return opt_st, params
+end
+
+"""
+    LowRankGaussian(μ, D, U; scale_eps)
+
+Construct a Gaussian variational approximation with a diagonal plus low-rank covariance matrix.
+
+# Arguments
+- `μ::AbstractVector{T}`: Mean of the Gaussian.
+- `D::AbstractVector{T}`: Diagonal of the scale.
+- `U::AbstractMatrix{T}`: Low-rank factors of the scale, where `size(U,2)` is the rank.
+
+# Keyword Arguments
+- `scale_eps`: Smallest value allowed for the diagonal of the scale. (default: `1e-4`).
+"""
+function LowRankGaussian(
+    μ::AbstractVector{T}, D::AbstractVector{T}, U::AbstractMatrix{T}; scale_eps::T=T(1e-4)
+) where {T<:Real}
+    q_base = Normal{T}(zero(T), one(T))
+    return MvLocationScaleLowRank(μ, D, U, q_base; scale_eps)
+end
diff --git a/test/Project.toml b/test/Project.toml
index 251869e7..018198d1 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -27,7 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 ADTypes = "0.2.1, 1"
 Bijectors = "0.13"
 DiffResults = "1.0"
-Distributions = "0.25.100"
+Distributions = "0.25.111"
 DistributionsAD = "0.6.45"
 Enzyme = "0.12.32"
 FillArrays = "1.6.1"
diff --git a/test/interface/location_scale.jl b/test/families/location_scale.jl
similarity index 60%
rename from test/interface/location_scale.jl
rename to test/families/location_scale.jl
index dcc3369d..bd45458d 100644
--- a/test/interface/location_scale.jl
+++ b/test/families/location_scale.jl
@@ -1,27 +1,34 @@
 
 @testset "interface LocationScale" begin
-    @testset "$(string(covtype)) $(basedist) $(realtype)" for basedist in [:gaussian],
+    @testset "$(string(covtype)) $(basedist) $(realtype)" for basedist in
+                                                              [:gaussian, :gaussian_nonstd],
         covtype in [:meanfield, :fullrank],
         realtype in [Float32, Float64]
 
         n_dims = 10
         n_montecarlo = 1000_000
 
-        μ = randn(realtype, n_dims)
-        L = if covtype == :fullrank
+        location = randn(realtype, n_dims)
+        scale = if covtype == :fullrank
             LowerTriangular(tril(I + ones(realtype, n_dims, n_dims) / 2))
         else
             Diagonal(ones(realtype, n_dims))
         end
-        Σ = L * L'
 
         q = if covtype == :fullrank && basedist == :gaussian
-            FullRankGaussian(μ, L)
+            FullRankGaussian(location, scale)
         elseif covtype == :meanfield && basedist == :gaussian
-            MeanFieldGaussian(μ, L)
+            MeanFieldGaussian(location, scale)
+        elseif covtype == :fullrank && basedist == :gaussian_nonstd
+            MvLocationScale(location, scale, Normal(realtype(3), realtype(3)))
+        elseif covtype == :meanfield && basedist == :gaussian_nonstd
+            MvLocationScale(location, scale, Normal(realtype(3), realtype(3)))
         end
+
         q_true = if basedist == :gaussian
-            MvNormal(μ, Σ)
+            MvNormal(location, scale * scale')
+        elseif basedist == :gaussian_nonstd
+            MvNormal(location + scale * fill(3, n_dims), 9 * scale * scale')
         end
 
         @testset "eltype" begin
@@ -46,15 +53,15 @@
         @testset "statistics" begin
             @testset "mean" begin
                 @test eltype(mean(q)) == realtype
-                @test mean(q) == μ
+                @test mean(q) ≈ mean(q_true)
             end
             @testset "var" begin
                 @test eltype(var(q)) == realtype
-                @test var(q) ≈ Diagonal(Σ)
+                @test var(q) ≈ var(q_true)
             end
             @testset "cov" begin
                 @test eltype(cov(q)) == realtype
-                @test cov(q) ≈ Σ
+                @test cov(q) ≈ cov(q_true)
             end
         end
 
@@ -62,11 +69,13 @@
             @testset "rand" begin
                 z_samples = mapreduce(x -> rand(q), hcat, 1:n_montecarlo)
                 @test eltype(z_samples) == realtype
-                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ μ rtol = realtype(1e-2)
-                @test dropdims(var(z_samples; dims=2); dims=2) ≈ diag(Σ) rtol = realtype(
+                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ mean(q_true) rtol = realtype(
+                    1e-2
+                )
+                @test dropdims(var(z_samples; dims=2); dims=2) ≈ var(q_true) rtol = realtype(
                     1e-2
                 )
-                @test cov(z_samples; dims=2) ≈ Σ rtol = realtype(1e-2)
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = realtype(1e-2)
 
                 z_sample_ref = rand(StableRNG(1), q)
                 @test z_sample_ref == rand(StableRNG(1), q)
@@ -75,11 +84,13 @@
             @testset "rand batch" begin
                 z_samples = rand(q, n_montecarlo)
                 @test eltype(z_samples) == realtype
-                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ μ rtol = realtype(1e-2)
-                @test dropdims(var(z_samples; dims=2); dims=2) ≈ diag(Σ) rtol = realtype(
+                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ mean(q_true) rtol = realtype(
                     1e-2
                 )
-                @test cov(z_samples; dims=2) ≈ Σ rtol = realtype(1e-2)
+                @test dropdims(var(z_samples; dims=2); dims=2) ≈ var(q_true) rtol = realtype(
+                    1e-2
+                )
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = realtype(1e-2)
 
                 samples_ref = rand(StableRNG(1), q, n_montecarlo)
                 @test samples_ref == rand(StableRNG(1), q, n_montecarlo)
@@ -94,11 +105,13 @@
                 z_samples = mapreduce(first, hcat, res)
                 z_samples_ret = mapreduce(last, hcat, res)
                 @test z_samples == z_samples_ret
-                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ μ rtol = realtype(1e-2)
-                @test dropdims(var(z_samples; dims=2); dims=2) ≈ diag(Σ) rtol = realtype(
+                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ mean(q_true) rtol = realtype(
                     1e-2
                 )
-                @test cov(z_samples; dims=2) ≈ Σ rtol = realtype(1e-2)
+                @test dropdims(var(z_samples; dims=2); dims=2) ≈ var(q_true) rtol = realtype(
+                    1e-2
+                )
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = realtype(1e-2)
 
                 z_sample_ref = Array{realtype}(undef, n_dims)
                 rand!(StableRNG(1), q, z_sample_ref)
@@ -112,11 +125,13 @@
                 z_samples = Array{realtype}(undef, n_dims, n_montecarlo)
                 z_samples_ret = rand!(q, z_samples)
                 @test z_samples == z_samples_ret
-                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ μ rtol = realtype(1e-2)
-                @test dropdims(var(z_samples; dims=2); dims=2) ≈ diag(Σ) rtol = realtype(
+                @test dropdims(mean(z_samples; dims=2); dims=2) ≈ mean(q_true) rtol = realtype(
+                    1e-2
+                )
+                @test dropdims(var(z_samples; dims=2); dims=2) ≈ var(q_true) rtol = realtype(
                     1e-2
                 )
-                @test cov(z_samples; dims=2) ≈ Σ rtol = realtype(1e-2)
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = realtype(1e-2)
 
                 z_samples_ref = Array{realtype}(undef, n_dims, n_montecarlo)
                 rand!(StableRNG(1), q, z_samples_ref)
@@ -128,6 +143,38 @@
         end
     end
 
+    @testset "scale positive definite projection" begin
+        # The gradient equals the parameters and the Descent step size is one, so a
+        # plain gradient step would send every parameter to zero; the test expects the
+        # updated family to keep each marginal variance at or above `ϵ^2`.
+        # NOTE(review): `q_trans` is never used, so `bijector` only changes the name.
+        @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in
+                                                                  [:meanfield, :fullrank],
+            realtype in [Float32, Float64],
+            bijector in [nothing, :identity]
+
+            d = 5
+            μ = zeros(realtype, d)
+            ϵ = sqrt(realtype(0.5))
+            q = if covtype == :meanfield
+                MeanFieldGaussian(μ, Diagonal(ones(realtype, d)); scale_eps=ϵ)
+            elseif covtype == :fullrank
+                FullRankGaussian(μ, LowerTriangular(Matrix{realtype}(I, d, d)); scale_eps=ϵ)
+            end
+            q_trans =
+                isnothing(bijector) ? q : Bijectors.TransformedDistribution(q, identity)
+
+            θ, rebuild = Optimisers.destructure(q)
+            grad, _ = Optimisers.destructure(deepcopy(q))
+            opt_st = Optimisers.setup(Descent(one(realtype)), θ)
+            _, θ′ = AdvancedVI.update_variational_params!(
+                typeof(q), opt_st, θ, rebuild, grad
+            )
+            q′ = rebuild(θ′)
+            @test all(var(q′) .≥ ϵ^2)
+        end
+    end
+
     @testset "Diagonal destructure" begin
         n_dims = 10
         μ = zeros(n_dims)
@@ -139,35 +186,3 @@
         @test q == re(λ)
     end
 end
-
-@testset "scale positive definite projection" begin
-    @testset "$(string(covtype)) $(realtype) $(bijector)" for covtype in
-                                                              [:meanfield, :fullrank],
-        realtype in [Float32, Float64],
-        bijector in [nothing, :identity]
-
-        d = 5
-        μ = zeros(realtype, d)
-        ϵ = sqrt(realtype(0.5))
-        q = if covtype == :fullrank
-            L = LowerTriangular(Matrix{realtype}(I, d, d))
-            FullRankGaussian(μ, L; scale_eps=ϵ)
-        elseif covtype == :meanfield
-            L = Diagonal(ones(realtype, d))
-            MeanFieldGaussian(μ, L; scale_eps=ϵ)
-        end
-        q_trans = if isnothing(bijector)
-            q
-        else
-            Bijectors.TransformedDistribution(q, identity)
-        end
-        g = deepcopy(q)
-
-        λ, re = Optimisers.destructure(q)
-        grad, _ = Optimisers.destructure(g)
-        opt_st = Optimisers.setup(Descent(one(realtype)), λ)
-        _, λ′ = AdvancedVI.update_variational_params!(typeof(q), opt_st, λ, re, grad)
-        q′ = re(λ′)
-        @test all(diag(var(q′)) .≥ ϵ^2)
-    end
-end
diff --git a/test/families/location_scale_low_rank.jl b/test/families/location_scale_low_rank.jl
new file mode 100644
index 00000000..2accb971
--- /dev/null
+++ b/test/families/location_scale_low_rank.jl
@@ -0,0 +1,178 @@
+
+@testset "interface LocationScaleLowRank" begin
+    @testset "$(basedist) rank=$(n_rank) $(realtype)" for basedist in
+                                                          [:gaussian, :gaussian_nonstd],
+        n_rank in [1, 2],
+        realtype in [Float32, Float64]
+
+        n_dims = 10
+        n_montecarlo = 1_000_000  # sample size for the Monte Carlo moment checks
+
+        location = randn(realtype, n_dims)
+        scale_diag = ones(realtype, n_dims)
+        scale_factors = randn(realtype, n_dims, n_rank)
+
+        q = if basedist == :gaussian
+            LowRankGaussian(location, scale_diag, scale_factors)
+        elseif basedist == :gaussian_nonstd
+            MvLocationScaleLowRank(
+                location, scale_diag, scale_factors, Normal(realtype(3), realtype(3))
+            )
+        end
+
+        # Reference MvNormal carrying the mean and covariance `q` is expected to have.
+        q_true = if basedist == :gaussian
+            Σ = Diagonal(scale_diag .^ 2) + scale_factors * scale_factors'
+            MvNormal(location, Σ)
+        elseif basedist == :gaussian_nonstd  # Normal(3, 3) base: mean 3, std 3
+            μ = location + scale_diag .* fill(3, n_dims) + scale_factors * fill(3, n_rank)
+            Σ = 3^2 * (Diagonal(scale_diag .^ 2) + scale_factors * scale_factors')
+            MvNormal(μ, Σ)
+        end
+
+        @testset "eltype" begin  # `q` is built from `realtype` arrays
+            @test eltype(q) == realtype
+        end
+
+        @testset "logpdf" begin
+            # Compare both `logpdf` call forms with the reference density at one draw.
+            z = rand(q)
+            tol = realtype(1e-2)
+            logp_ref = logpdf(q_true, z)
+            @test logpdf(q, z) ≈ logp_ref rtol = tol
+            @test eltype(logpdf(q, z)) == realtype
+            @test logpdf(q, z; non_differntiable=true) ≈ logp_ref rtol = tol
+            @test eltype(logpdf(q, z; non_differntiable=true)) == realtype
+        end
+
+        @testset "entropy" begin  # checked against the reference `q_true`
+            @test eltype(entropy(q)) == realtype
+            @test entropy(q) ≈ entropy(q_true)
+        end
+
+        @testset "length" begin  # `q` was built with an `n_dims`-long location vector
+            @test length(q) == n_dims
+        end
+
+        @testset "statistics" begin  # moments reported by `q` vs. those of `q_true`
+            @testset "mean" begin
+                @test eltype(mean(q)) == realtype
+                @test mean(q) ≈ mean(q_true)
+            end
+            @testset "var" begin
+                @test eltype(var(q)) == realtype
+                @test var(q) ≈ var(q_true)
+            end
+            @testset "cov" begin
+                @test eltype(cov(q)) == realtype
+                @test cov(q) ≈ cov(q_true)
+            end
+        end
+
+        @testset "sampling" begin
+            @testset "rand" begin
+                # Stack independent single draws column-wise and compare the empirical
+                # moments with those of the reference distribution.
+                tol = realtype(1e-2)
+                z_samples = mapreduce(_ -> rand(q), hcat, 1:n_montecarlo)
+                @test eltype(z_samples) == realtype
+                @test vec(mean(z_samples; dims=2)) ≈ mean(q_true) rtol = tol
+                @test vec(var(z_samples; dims=2)) ≈ var(q_true) rtol = tol
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = tol
+
+                # The same seed must reproduce the same draw.
+                z_first, z_second = rand(StableRNG(1), q), rand(StableRNG(1), q)
+                @test z_first ≈ z_second
+            end
+
+            @testset "rand batch" begin
+                # A single batched call; the moments are taken across columns (`dims=2`).
+                tol = realtype(1e-2)
+                z_samples = rand(q, n_montecarlo)
+                @test eltype(z_samples) == realtype
+                @test vec(mean(z_samples; dims=2)) ≈ mean(q_true) rtol = tol
+                @test vec(var(z_samples; dims=2)) ≈ var(q_true) rtol = tol
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = tol
+
+                # Re-seeding must reproduce the whole batch.
+                batch_first = rand(StableRNG(1), q, n_montecarlo)
+                batch_second = rand(StableRNG(1), q, n_montecarlo)
+                @test batch_first ≈ batch_second
+            end
+
+            @testset "rand! AbstractVector" begin
+                # Fill a fresh buffer per draw and keep both the buffer and the value
+                # returned by `rand!`, so the two can be compared afterwards.
+                tol = realtype(1e-2)
+                draws = map(1:n_montecarlo) do _
+                    buf = Vector{realtype}(undef, n_dims)
+                    ret = rand!(q, buf)
+                    (buf, ret)
+                end
+                z_samples = mapreduce(first, hcat, draws)
+                z_samples_ret = mapreduce(last, hcat, draws)
+                @test z_samples ≈ z_samples_ret
+                @test vec(mean(z_samples; dims=2)) ≈ mean(q_true) rtol = tol
+                @test vec(var(z_samples; dims=2)) ≈ var(q_true) rtol = tol
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = tol
+
+                # Two buffers filled from identically seeded RNGs must agree.
+                z_sample_ref = Vector{realtype}(undef, n_dims)
+                rand!(StableRNG(1), q, z_sample_ref)
+
+                z_sample = Vector{realtype}(undef, n_dims)
+                rand!(StableRNG(1), q, z_sample)
+                @test z_sample_ref ≈ z_sample
+            end
+
+            @testset "rand! AbstractMatrix" begin
+                # Fill a preallocated matrix in place; `rand!` should hand back values
+                # matching the buffer it was given.
+                tol = realtype(1e-2)
+                z_samples = Matrix{realtype}(undef, n_dims, n_montecarlo)
+                z_samples_ret = rand!(q, z_samples)
+                @test z_samples ≈ z_samples_ret
+                @test vec(mean(z_samples; dims=2)) ≈ mean(q_true) rtol = tol
+                @test vec(var(z_samples; dims=2)) ≈ var(q_true) rtol = tol
+                @test cov(z_samples; dims=2) ≈ cov(q_true) rtol = tol
+
+                # Two buffers filled from identically seeded RNGs must agree.
+                z_samples_ref = Matrix{realtype}(undef, n_dims, n_montecarlo)
+                rand!(StableRNG(1), q, z_samples_ref)
+
+                z_samples = Matrix{realtype}(undef, n_dims, n_montecarlo)
+                rand!(StableRNG(1), q, z_samples)
+                @test z_samples_ref ≈ z_samples
+            end
+        end
+    end
+
+    @testset "diagonal positive definite projection" begin
+        # Unit-step Descent with the gradient equal to the parameters; the updated
+        # family is expected to keep each marginal variance at or above `ϵ^2`.
+        @testset "$(realtype) $(bijector)" for realtype in [Float32, Float64],
+            bijector in [nothing, :identity]
+
+            n_rank = 2
+            d = 5
+            μ = zeros(realtype, d)
+            ϵ = sqrt(realtype(0.5))
+            D = ones(realtype, d)
+            U = randn(realtype, d, n_rank)
+            q = MvLocationScaleLowRank(
+                μ, D, U, Normal{realtype}(zero(realtype), one(realtype)); scale_eps=ϵ
+            )
+            # Wrap with the `identity` function itself; `bijector` is only a Symbol label.
+            # NOTE(review): `q_trans` is unused below — confirm whether it should be.
+            q_trans =
+                isnothing(bijector) ? q : Bijectors.TransformedDistribution(q, identity)
+            g = deepcopy(q)
+
+            λ, re = Optimisers.destructure(q)
+            grad, _ = Optimisers.destructure(g)
+            opt_st = Optimisers.setup(Descent(one(realtype)), λ)
+            _, λ′ = AdvancedVI.update_variational_params!(typeof(q), opt_st, λ, re, grad)
+            q′ = re(λ′)
+            @test all(var(q′) .≥ ϵ^2)
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 5d0d2c8d..43958e8e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -52,7 +52,11 @@ if GROUP == "All" || GROUP == "Interface"
     include("interface/repgradelbo.jl")
     include("interface/rules.jl")
     include("interface/averaging.jl")
-    include("interface/location_scale.jl")
+end
+
+if GROUP == "All" || GROUP == "Families"
+    include("families/location_scale.jl")
+    include("families/location_scale_low_rank.jl")
 end
 
 const PROGRESS = haskey(ENV, "PROGRESS")