Skip to content

Commit

Permalink
improve performance with ties (#74)
Browse files Browse the repository at this point in the history
* check for all equal in small chunks

* patch bump

* actually split on ties

* tests

* pre sort

* clean up a little

* cleanup

* add comment about performance linking to PR

* consistency and correctness in debug logging
  • Loading branch information
palday committed Aug 23, 2023
1 parent 5126a74 commit 33a4cd3
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 32 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "Loess"
uuid = "4345ca2d-374a-55d4-8d30-97f9976e7612"
version = "0.6.1"
version = "0.6.2"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Expand Down
77 changes: 46 additions & 31 deletions src/kd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ function KDTree(
) where T <: AbstractFloat

n, m = size(xs)
perm = collect(1:n)

bounds = Array{T}(undef, 2, m)
for j in 1:m
Expand All @@ -63,14 +62,13 @@ function KDTree(
push!(verts, T[vert...])
end

perm = collect(1:n)
root = build_kdtree(xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)

KDTree(convert(Matrix{T}, xs), collect(1:n), root, verts, bounds)
KDTree(convert(Matrix{T}, xs), perm, root, verts, bounds)
end




"""
diameter(bounds)
Expand All @@ -88,6 +86,32 @@ function diameter(bounds::Matrix)
euclidean(vec(bounds[1,:]), vec(bounds[2,:]))
end

"""
_select_j(xs::AbstractMatrix{T})
Select the column for sorting the rows xs based on the column with the largest spread.
"""
function _select_j(xs::AbstractMatrix{T}) where {T <: AbstractFloat}
size(xs, 2) == 1 && return 1

# split on the dimension with the largest spread
# maxspread, j = findmax(maximum(xs[perm, k]) - minimum(xs[perm, k]) for k in 1:m)
j = 1
maxspread = 0
@inbounds for k in axes(xs, 2)
xmin = Inf
xmax = -Inf
@inbounds for i in axes(xs, 1)
xmin = min(xmin, xs[i, k])
xmax = max(xmax, xs[i, k])
end
if xmax - xmin > maxspread
maxspread = xmax - xmin
j = k
end
end
return j
end

"""
build_kdtree(xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)
Expand Down Expand Up @@ -121,30 +145,22 @@ function build_kdtree(xs::AbstractMatrix{T},
Base.require_one_based_indexing(xs)
Base.require_one_based_indexing(perm)

j = _select_j(xs)
n, m = size(xs)
# performance testing showed that sorting everything at once was dramatically faster
# than repeated partial sorting with partialsort! when there are ties:
# https://github.com/JuliaStats/Loess.jl/pull/74
if !issorted(view(xs, perm, j))
@debug "received unsorted data, sorting"
sortperm!(perm, view(xs, :, j))
end
xjs = view(xs, perm, j)

if length(perm) <= leaf_size_cutoff || diameter(bounds) <= leaf_diameter_cutoff
@debug "Creating leaf node" length(perm) leaf_size_cutoff diameter(bounds) leaf_diameter_cutoff
return nothing
end

# split on the dimension with the largest spread
# maxspread, j = findmax(maximum(xs[perm, k]) - minimum(xs[perm, k]) for k in 1:m)
j = 1
maxspread = 0
for k in 1:m
xmin = Inf
xmax = -Inf
for i in perm
xmin = min(xmin, xs[i, k])
xmax = max(xmax, xs[i, k])
end
if xmax - xmin > maxspread
maxspread = xmax - xmin
j = k
end
end

# Find the "median" and partition
#
# The aim of the algorithm is to split the data recursively in two roughly equally sized
Expand All @@ -165,37 +181,36 @@ function build_kdtree(xs::AbstractMatrix{T},
#
# The details here are reversed engineered from the C/Fortran implementation wrapped
# by R and also distribtued on NETLIB.
mid = (length(perm) + 1) ÷ 2
@debug "Candidate median index and median value" mid xs[perm[mid], j]
mid = (length(xjs) + 1) ÷ 2
@debug "Candidate median index and median value" mid xjs[mid]

offset = 0
local mid1, mid2
while true
mid1 = mid + offset
mid2 = mid1 + 1
if mid1 < 1
@debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(perm) xs[perm[mid], j]
@debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(xjs) xjs[mid]
offset = mid1 = 0
mid2 = length(perm) + 1
mid2 = length(xjs) + 1
break
end
if mid2 > length(perm)
@debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(perm) offset
if mid2 > length(xjs)
@debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(xjs) offset
# This makes the offset 0, 1, -1, 2, -2, ...
offset = -offset + (offset <= 0)
continue
end
p12 = partialsort!(perm, mid1:mid2, by = i -> xs[i, j])
if xs[p12[1], j] == xs[p12[2], j]
@debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
if xjs[mid1] == xjs[mid2]
# @debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
# This makes the offset 0, 1, -1, 2, -2, ...
offset = -offset + (offset <= 0)
else
break
end
end
mid += offset
med = xs[perm[mid], j]
med = xjs[mid]
@debug "Accepted median index and median value" mid med

leftbounds = copy(bounds)
Expand Down
14 changes: 14 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ end
@test_broken predict(model, x)[end] pred[end] atol=1e-5
end

@testset "lots of ties" begin
# adapted from https://github.com/JuliaStats/Loess.jl/pull/74#discussion_r1294303522
x = repeat([π/4*i for i in -20:20], inner=101)
y = sin.(x)

model = loess(x,y; span=0.2)
for i in -3:3
@test predict(model, i * π) 0 atol=1e-12
# not great tolerance but loess also struggles to capture the sine peaks
@test abs(predict(model, i * π + π / 2 )) 0.9 atol=0.1
end

end

@test_throws DimensionMismatch loess([1.0 2.0; 3.0 4.0], [1.0])

@testset "Issue 28" begin
Expand Down

2 comments on commit 33a4cd3

@palday
Copy link
Member Author

@palday palday commented on 33a4cd3 Aug 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/90188

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.6.2 -m "<description of version>" 33a4cd302aaf1a05f5c35d085da3bca3387b92b2
git push origin v0.6.2

Please sign in to comment.