Skip to content

Commit

Permalink
improvements for GroupBy (#17)
Browse files Browse the repository at this point in the history
* improvements for GroupBy

* add test_broken for broken GroupBy use case
  • Loading branch information
ExpandingMan committed Jul 18, 2023
1 parent b3f6cd9 commit b0a0f1d
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 17 deletions.
28 changes: 14 additions & 14 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,20 @@ Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SplittablesBase = "171d559e-b47b-412a-8079-5efa626c420e"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[weakdeps]
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
OnlineStatsBase = "925886fa-5bf2-5e8e-b522-a9147a512338"
Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e"

[extensions]
TransducersBlockArraysExt = "BlockArrays"
TransducersDataFramesExt = "DataFrames"
TransducersLazyArraysExt = "LazyArrays"
TransducersOnlineStatsBaseExt = "OnlineStatsBase"
TransducersReferenceablesExt = "Referenceables"

[compat]
Adapt = "1, 2, 3"
ArgCheck = "1, 2.0"
Expand All @@ -37,13 +51,6 @@ SplittablesBase = "0.1.2"
Tables = "0.2, 1.0"
julia = "1.6"

[extensions]
TransducersBlockArraysExt = "BlockArrays"
TransducersDataFramesExt = "DataFrames"
TransducersLazyArraysExt = "LazyArrays"
TransducersOnlineStatsBaseExt = "OnlineStatsBase"
TransducersReferenceablesExt = "Referenceables"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
Expand Down Expand Up @@ -74,10 +81,3 @@ TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"

[targets]
test = ["Aqua", "BlockArrays", "Compat", "DataFrames", "DataTools", "Dates", "Distributed", "Documenter", "Folds", "InteractiveUtils", "LazyArrays", "LiterateTest", "LoadAllPackages", "Maybe", "OnlineStats", "OnlineStatsBase", "PerformanceTestTools", "Pkg", "Random", "Referenceables", "SparseArrays", "StaticArrays", "Statistics", "StructArrays", "Test", "TypedTables"]

[weakdeps]
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
OnlineStatsBase = "925886fa-5bf2-5e8e-b522-a9147a512338"
Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e"
34 changes: 31 additions & 3 deletions src/groupby.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
GroupBy(key, rf, [init])
GroupBy(key, xf::Transducer, [step = right, [init]])
GroupBy(key, rf, [init])
Group the input stream by a function `key` and then fan-out each group
of key-value pairs to the reducing function `rf`.
Expand Down Expand Up @@ -31,6 +31,19 @@ That is to say,
the current and all preceding results is then fed into the
`downstream`.
The signature `GroupBy(key, ::Transducer, [step, [init]])` is preferred since
results can only be combined with [`Transducers.combine`](@ref) (e.g. for use in `foldxt`)
when the inner-most reducing function is accessible. For example, instead of
```
GroupBy(key, (y, (k, v)) -> y + v.a, init)
```
one should write
```
GroupBy(key, Map(kv -> kv[2].a)'(+), init)
```
The difference is that in the latter case Transducers can figure out how to sum separately from
accessing the value.
See also `groupreduce` in
[SplitApplyCombine.jl](https://github.com/JuliaData/SplitApplyCombine.jl).
Expand Down Expand Up @@ -139,8 +152,23 @@ function Base.getindex(dict::GroupByViewDict{<:Any,<:Any,S}, key) where {S}
end

function Base.get(dict::GroupByViewDict{<:Any,<:Any,S}, key, default) where {S}
value = unwrap_all(unreduced(dict.state[key]))
return value isa S ? default : value
val = get(dict.state, key, _NoValue())
if val _NoValue()
default
else
val = unwrap_all(unreduced(something(val)))
val isa S ? default : val
end
end

#this is copied from SplittablesBase.halve(::DictLike) which should probably be more generic
function SplittablesBase.halve(dict::GroupByViewDict{K,V,S}) where {K,V,S}
i1 = SplittablesBase.Implementations.firstslot(dict.state)
i3 = SplittablesBase.Implementations.lastslot(dict.state)
i2 = (i3 - i1 + 1) ÷ 2 + i1
left = SplittablesBase.Implementations.DictView(dict, i1, i2 - 1)
right = SplittablesBase.Implementations.DictView(dict, i2, i3)
(left, right)
end

function start(rf::R_{GroupBy}, result)
Expand Down
1 change: 1 addition & 0 deletions test/test_aqua.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ if isdefined(Core, :kwcall)
Transducers;
ambiguities = (; exclude = to_exclude),
unbound_args = false, # TODO: make it work
project_toml_formatting = false, # this fails every single time a valid change is made to Project which is infuriating
)

@testset "Compare test/Project.toml and test/environments/main/Project.toml" begin
Expand Down
12 changes: 12 additions & 0 deletions test/test_groupby.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,21 @@ end
@test gd1 == Dict("1" => [1, 1], "2" => [2, 2], "3" => [3])
@test gd2 == gd3
@test gd4 == Dict(2 => [2, 4], 3 => [3], 6 => [6], 1 => [1])
@test get(gd1, "1", Int[]) == [1, 1]
@test get(gd1, "9", Int[]) == Int[]
@test Dict(gd1) == Dict("1" => [1, 1], "2" => [2, 2], "3" => [3])
@test Dict(gd2) == Dict(gd3)
@test Dict(gd4) == Dict(2 => [2, 4], 3 => [3], 6 => [6], 1 => [1])

# ensure we can use GroupByViewDict it in foldxt
r1 = gd1 |> MapSplat((k, v) -> k=>sum(v)) |> tcollect
@test Set(r1) == Set(["1"=>2, "2"=>4, "3"=>3]) # order not guaranteed

# TODO: one would expect this to work, but it doesn't, because foldxt calls combine,
# which needs to know about the inner reducing function (here push!!) which it can't
# infer from this lambda
gb = GroupBy(string, (y, (k, v)) -> push!!(y, v))
@test_broken foldxt(right, gb, [1,2,1,2,3])
end

end # module

2 comments on commit b0a0f1d

@MasonProtter
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/87772

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.4.78 -m "<description of version>" b0a0f1d2d5841eeff3b2bc84e3a1e389c072132d
git push origin v0.4.78

Please sign in to comment.