Skip to content

Commit 3712291

Browse files
jrevelsericphanson
andauthored
enable field-order-agnostic overloads of fromarrow for struct types (#493)
Motivated by beacon-biosignals/Legolas.jl#94 (comment) Still requires: - [x] docs - [x] a test - [x] a bit more due diligence benchmarking-wise. `@benchmark`ing the access in the test case from beacon-biosignals/Legolas.jl#94 didn't reveal any perf difference, which seems like a good sign --------- Co-authored-by: Eric Hanson <5846501+ericphanson@users.noreply.github.com>
1 parent 787768f commit 3712291

File tree

6 files changed

+61
-20
lines changed

6 files changed

+61
-20
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
name = "Arrow"
1818
uuid = "69666777-d1a9-59fb-9406-91d4454c9d45"
1919
authors = ["quinnj <quinn.jacobd@gmail.com>"]
20-
version = "2.6.3"
20+
version = "2.7.0"
2121

2222
[deps]
2323
ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"

src/ArrowTypes/Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
name = "ArrowTypes"
1919
uuid = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
2020
authors = ["quinnj <quinn.jacobd@gmail.com>"]
21-
version = "2.2.2"
21+
version = "2.3.0"
2222

2323
[deps]
2424
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"

src/ArrowTypes/src/ArrowTypes.jl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,11 +170,15 @@ overload is necessary.
170170
A few `ArrowKind`s have/allow slightly more custom overloads for their `fromarrow` methods:
171171
* `ListKind{true}`: for `String` types, they may overload `fromarrow(::Type{T}, ptr::Ptr{UInt8}, len::Int) = ...` to avoid
172172
materializing a `String`
173-
* `StructKind`: must overload `fromarrow(::Type{T}, x...)` where individual fields are passed as separate
173+
* `StructKind`:
174+
* May overload `fromarrow(::Type{T}, x...)` where individual fields are passed as separate
174175
positional arguments; so if my custom type `Interval` has two fields `first` and `last`, then I'd overload like
175176
`ArrowTypes.fromarrow(::Type{Interval}, first, last) = ...`. Note the default implementation is
176177
`ArrowTypes.fromarrow(::Type{T}, x...) = T(x...)`, so if your type already accepts all arguments in a constructor
177178
no additional `fromarrow` method should be necessary (default struct constructors have this behavior).
179+
* Alternatively, may overload `fromarrowstruct(::Type{T}, ::Val{fnames}, x...)`, where `fnames` is a tuple of the
180+
field names corresponding to the values in `x`. This approach is useful when you need to implement deserialization
181+
in a manner that is agnostic to the field order used by the serializer. When implemented, `fromarrowstruct` takes precedence over `fromarrow` in `StructKind` deserialization.
178182
"""
179183
function fromarrow end
180184
fromarrow(::Type{T}, x::T) where {T} = x
@@ -302,6 +306,8 @@ struct StructKind <: ArrowKind end
302306

303307
ArrowKind(::Type{<:NamedTuple}) = StructKind()
304308

309+
@inline fromarrowstruct(T::Type, ::Val, x...) = fromarrow(T, x...)
310+
305311
fromarrow(
306312
::Type{NamedTuple{names,types}},
307313
x::NamedTuple{names,types},

src/arraytypes/struct.jl

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
2020
An `ArrowVector` where each element is a "struct" of some kind with ordered, named fields, like a `NamedTuple{names, types}` or regular julia `struct`.
2121
"""
22-
struct Struct{T,S} <: ArrowVector{T}
22+
struct Struct{T,S,fnames} <: ArrowVector{T}
2323
validity::ValidityBitmap
2424
data::S # Tuple of ArrowVector
2525
::Int
@@ -33,23 +33,29 @@ isnamedtuple(T) = false
3333
istuple(::Type{<:Tuple}) = true
3434
istuple(T) = false
3535

36-
@propagate_inbounds function Base.getindex(s::Struct{T,S}, i::Integer) where {T,S}
36+
if isdefined(ArrowTypes, :fromarrowstruct)
37+
# https://github.com/apache/arrow-julia/pull/493
38+
@inline function _fromarrowstruct(T::Type, v::Val, x...)
39+
return ArrowTypes.fromarrowstruct(T, v, x...)
40+
end
41+
else
42+
@inline function _fromarrowstruct(T::Type, ::Val, x...)
43+
return ArrowTypes.fromarrow(T, x...)
44+
end
45+
end
46+
47+
@propagate_inbounds function Base.getindex(
48+
s::Struct{T,S,fnames},
49+
i::Integer,
50+
) where {T,S,fnames}
3751
@boundscheck checkbounds(s, i)
3852
NT = Base.nonmissingtype(T)
53+
NT !== T && (s.validity[i] || return missing)
54+
vals = ntuple(j -> s.data[j][i], fieldcount(S))
3955
if isnamedtuple(NT) || istuple(NT)
40-
if NT !== T
41-
return s.validity[i] ? NT(ntuple(j -> s.data[j][i], fieldcount(S))) : missing
42-
else
43-
return NT(ntuple(j -> s.data[j][i], fieldcount(S)))
44-
end
56+
return NT(vals)
4557
else
46-
if NT !== T
47-
return s.validity[i] ?
48-
ArrowTypes.fromarrow(NT, (s.data[j][i] for j = 1:fieldcount(S))...) :
49-
missing
50-
else
51-
return ArrowTypes.fromarrow(NT, (s.data[j][i] for j = 1:fieldcount(S))...)
52-
end
58+
return _fromarrowstruct(NT, Val{fnames}(), vals...)
5359
end
5460
end
5561

@@ -100,7 +106,8 @@ function arrowvector(::StructKind, x, i, nl, fi, de, ded, meta; kw...)
100106
arrowvector(ToStruct(x, j), i, nl + 1, j, de, ded, nothing; kw...) for
101107
j = 1:fieldcount(T)
102108
)
103-
return Struct{withmissing(eltype(x), namedtupletype(T, data)),typeof(data)}(
109+
NT = namedtupletype(T, data)
110+
return Struct{withmissing(eltype(x), NT),typeof(data),fieldnames(NT)}(
104111
validity,
105112
data,
106113
len,

src/table.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -840,7 +840,8 @@ function build(f::Meta.Field, L::Meta.Struct, batch, rb, de, nodeidx, bufferidx,
840840
data = Tuple(vecs)
841841
meta = buildmetadata(f.custom_metadata)
842842
T = juliaeltype(f, meta, convert)
843-
return Struct{T,typeof(data)}(validity, data, len, meta), nodeidx, bufferidx
843+
fnames = ntuple(i -> Symbol(f.children[i].name), length(f.children))
844+
return Struct{T,typeof(data),fnames}(validity, data, len, meta), nodeidx, bufferidx
844845
end
845846

846847
function build(f::Meta.Field, L::Meta.Union, batch, rb, de, nodeidx, bufferidx, convert)

test/runtests.jl

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,33 @@ end
10141014
# @test isequal(table.v, table2.v)
10151015

10161016
# end
1017-
1017+
if isdefined(ArrowTypes, :StructElement)
1018+
@testset "# 493" begin
1019+
# This test stresses the existence of the mechanism
1020+
# implemented in https://github.com/apache/arrow-julia/pull/493,
1021+
# but doesn't stress the actual use case that motivates
1022+
# that mechanism, simply because it'd be more annoying to
1023+
# write that test; see the PR for details.
1024+
struct Foo493
1025+
x::Int
1026+
y::Int
1027+
end
1028+
ArrowTypes.arrowname(::Type{Foo493}) = Symbol("JuliaLang.Foo493")
1029+
ArrowTypes.JuliaType(::Val{Symbol("JuliaLang.Foo493")}, T) = Foo493
1030+
function ArrowTypes.fromarrowstruct(
1031+
::Type{Foo493},
1032+
::Val{fnames},
1033+
x...,
1034+
) where {fnames}
1035+
nt = NamedTuple{fnames}(x)
1036+
return Foo493(nt.x + 1, nt.y + 1)
1037+
end
1038+
t = (; f=[Foo493(1, 2), Foo493(3, 4)])
1039+
buf = Arrow.tobuffer(t)
1040+
tbl = Arrow.Table(buf)
1041+
@test tbl.f[1] === Foo493(2, 3)
1042+
@test tbl.f[2] === Foo493(4, 5)
1043+
end
1044+
end
10181045
end # @testset "misc"
10191046
end

0 commit comments

Comments
 (0)