Introduction to DataFrames

07_factors

DataFrames v1.2, Julia 1.6.1

In [2]:
using DataFrames
In [4]:
# CategoricalArrays.jl is independent from DataFrames.jl but 
# it is often used in combination
using CategoricalArrays
In [6]:
using Pipe, BenchmarkTools

Working with CategoricalArrays

Constructor

In [58]:
x = categorical(["A","B","B","C"]) # unordered
Out[58]:
4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"
In [59]:
# ordered, by default order is sorting order
y = categorical(["A","B","B","C"],ordered=true)
Out[59]:
4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"
In [60]:
z = categorical(["A","B","B","C",missing]) # unordered with missing
Out[60]:
5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "A"
 "B"
 "B"
 "C"
 missing
In [61]:
# ordered, into equal counts, possible to rename labels and give
# custom breaks
# here 5 groups are requested: each interval is closed on the left and open
# on the right, except the last one, which is closed on both ends
c = cut(1:10,5)
Out[61]:
10-element CategoricalArray{String,1,UInt32}:
 "Q1: [1.0, 2.8)"
 "Q1: [1.0, 2.8)"
 "Q2: [2.8, 4.6)"
 "Q2: [2.8, 4.6)"
 "Q3: [4.6, 6.4)"
 "Q3: [4.6, 6.4)"
 "Q4: [6.4, 8.2)"
 "Q4: [6.4, 8.2)"
 "Q5: [8.2, 10.0]"
 "Q5: [8.2, 10.0]"

(we will cover grouping later, but let us here use it to analyze the results, we use Chain.jl for chaining)

In [62]:
LinRange(1,10,6)
Out[62]:
6-element LinRange{Float64}:
 1.0,2.8,4.6,6.4,8.2,10.0
In [63]:
[range(1,10,length=6)...]
Out[63]:
6-element Vector{Float64}:
  1.0
  2.8
  4.6
  6.4
  8.2
 10.0
In [64]:
using Chain
In [65]:
@chain DataFrame(x=cut(randn(100000),10)) begin
  groupby(:x)
  combine(nrow) # just to make sure cut works right
end
Out[65]:

10 rows × 2 columns

xnrow
Cat…Int64
1Q1: [-4.60933, -1.27672)10000
2Q2: [-1.27672, -0.837586)10000
3Q3: [-0.837586, -0.522596)10000
4Q4: [-0.522596, -0.252172)10000
5Q5: [-0.252172, -0.00358539)10000
6Q6: [-0.00358539, 0.24998)10000
7Q7: [0.24998, 0.522463)10000
8Q8: [0.522463, 0.837398)10000
9Q9: [0.837398, 1.28127)10000
10Q10: [1.28127, 4.83148]10000
In [66]:
v = categorical([1,2,2,3,3]) # contains integers not strings
Out[66]:
5-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 2
 3
 3
In [67]:
# sometimes you need to convert back to a standard vector
Vector{Union{String,Missing}}(z)
Out[67]:
5-element Vector{Union{Missing, String}}:
 "A"
 "B"
 "B"
 "C"
 missing

Managing levels

In [68]:
arr = [x,y,z,c,v]
Out[68]:
5-element Vector{CategoricalVector{T, UInt32, V, C, U} where {T, V, C, U}}:
 CategoricalValue{String, UInt32}["A", "B", "B", "C"]
 CategoricalValue{String, UInt32}["A", "B", "B", "C"]
 Union{Missing, CategoricalValue{String, UInt32}}["A", "B", "B", "C", missing]
 CategoricalValue{String, UInt32}["Q1: [1.0, 2.8)", "Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]", "Q5: [8.2, 10.0]"]
 CategoricalValue{Int64, UInt32}[1, 2, 2, 3, 3]
In [69]:
isordered.(arr)
Out[69]:
5-element BitVector:
 0
 1
 0
 1
 0
In [70]:
ordered!(x,true), isordered(x) # make x ordered
Out[70]:
(CategoricalValue{String, UInt32}["A", "B", "B", "C"], true)
In [71]:
ordered!(x,false), isordered(x) # and unordered again
Out[71]:
(CategoricalValue{String, UInt32}["A", "B", "B", "C"], false)

`levels` does not include `missing`

In [72]:
levels.(arr) # list levels
Out[72]:
5-element Vector{Vector{T} where T}:
 ["A", "B", "C"]
 ["A", "B", "C"]
 ["A", "B", "C"]
 ["Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]"]
 [1, 2, 3]

`unique` does include `missing`

In [73]:
unique.(arr) # missing will be included
Out[73]:
5-element Vector{Vector{T} where T}:
 ["A", "B", "C"]
 ["A", "B", "C"]
 Union{Missing, String}["A", "B", "C", missing]
 ["Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]"]
 [1, 2, 3]
In [74]:
isordered(y)
Out[74]:
true
In [75]:
y[1] < y[2] # can compare as y is ordered
Out[75]:
true
In [76]:
isordered(v)
Out[76]:
false
In [77]:
v[1] < v[2] # not comparable, v is unordered although it contains integers
ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this

Stacktrace:
 [1] <(x::CategoricalValue{Int64, UInt32}, y::CategoricalValue{Int64, UInt32})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/value.jl:150
 [2] top-level scope
   @ In[77]:1
 [3] eval
   @ ./boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094
In [78]:
y[2] < "A"
Out[78]:
false
In [105]:
levels(y)
Out[105]:
3-element Vector{String}:
 "A"
 "B"
 "C"
In [107]:
# you can reorder levels, mostly useful for ordered CategoricalArrays
levels!(y,["C","B","A"])
Out[107]:
4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"
In [108]:
isordered(y)
Out[108]:
true
In [109]:
y[1] < y[2]
Out[109]:
false
In [113]:
# you have to specify all levels that are present
levels!(z,["A","B"])
ArgumentError: cannot remove level "C" as it is used at position 4 and allowmissing=false.

Stacktrace:
 [1] levels!(A::CategoricalVector{Union{Missing, String}, UInt32, String, CategoricalValue{String, UInt32}, Missing}, newlevels::Vector{String}; allowmissing::Bool, allow_missing::Nothing)
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/array.jl:797
 [2] levels!(A::CategoricalVector{Union{Missing, String}, UInt32, String, CategoricalValue{String, UInt32}, Missing}, newlevels::Vector{String})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/array.jl:778
 [3] top-level scope
   @ In[113]:2
 [4] eval
   @ ./boot.jl:360 [inlined]
 [5] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094
In [114]:
# unless the underlying array allows missing values and you force the removal
# of levels with allowmissing=true (removed entries become missing)
levels!(z,["A","B"], allowmissing=true)
Out[114]:
5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "A"
 "B"
 "B"
 missing
 missing
In [115]:
z[1] = "B"
z # now z has only "B" entries
Out[115]:
5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "B"
 "B"
 "B"
 missing
 missing
In [116]:
levels(z) # but it remembers the levels it has (the reason is mostly performance)
Out[116]:
2-element Vector{String}:
 "A"
 "B"
In [117]:
droplevels!(z) # this way we can clean it up
Out[117]:
5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "B"
 "B"
 "B"
 missing
 missing
In [118]:
levels(z)
Out[118]:
1-element Vector{String}:
 "B"

Data manipulation

In [121]:
x
Out[121]:
4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"
In [122]:
levels(x)
Out[122]:
3-element Vector{String}:
 "A"
 "B"
 "C"
In [123]:
v
Out[123]:
5-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 2
 3
 3
In [124]:
levels(v)
Out[124]:
3-element Vector{Int64}:
 1
 2
 3
In [126]:
# even though the underlying data is Int, we cannot operate on it
v[1] + v[2]
MethodError: no method matching +(::CategoricalValue{Int64, UInt32}, ::CategoricalValue{Int64, UInt32})
Closest candidates are:
  +(::Any, ::Any, ::Any, ::Any...) at operators.jl:560

Stacktrace:
 [1] top-level scope
   @ In[126]:2
 [2] eval
   @ ./boot.jl:360 [inlined]
 [3] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094
In [129]:
# you have either to retrieve the data by conversion (may be expensive)
Vector{Int}(v)
Out[129]:
5-element Vector{Int64}:
 1
 2
 2
 3
 3
In [133]:
unwrap(v[1]) + unwrap(v[2]) # or get a single value
Out[133]:
3
In [137]:
v + unwrap.(v) # adding a CategoricalArray to a plain Vector{Int} fails as well
MethodError: no method matching +(::CategoricalValue{Int64, UInt32}, ::Int64)
Closest candidates are:
  +(::Any, ::Any, ::Any, ::Any...) at operators.jl:560
  +(::T, ::T) where T<:Union{Int128, Int16, Int32, Int64, Int8, UInt128, UInt16, UInt32, UInt64, UInt8} at int.jl:87
  +(::T, ::Integer) where T<:AbstractChar at char.jl:223
  ...

Stacktrace:
  [1] _broadcast_getindex_evalf
    @ ./broadcast.jl:648 [inlined]
  [2] _broadcast_getindex
    @ ./broadcast.jl:621 [inlined]
  [3] getindex
    @ ./broadcast.jl:575 [inlined]
  [4] copy
    @ ./broadcast.jl:922 [inlined]
  [5] materialize
    @ ./broadcast.jl:883 [inlined]
  [6] broadcast_preserving_zero_d
    @ ./broadcast.jl:872 [inlined]
  [7] +(A::CategoricalVector{Int64, UInt32, Int64, CategoricalValue{Int64, UInt32}, Union{}}, B::Vector{Int64})
    @ Base ./arraymath.jl:39
  [8] top-level scope
    @ In[137]:1
  [9] eval
    @ ./boot.jl:360 [inlined]
 [10] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1094
In [140]:
unwrap(v) === v
Out[140]:
true
In [173]:
unwrap.(v) # this will work for arrays without missing
Out[173]:
5-element Vector{Int64}:
 1
 2
 2
 3
 3
In [149]:
@btime unwrap.(z)  # also works on missing values
  2.184 μs (14 allocations: 736 bytes)
Out[149]:
5-element Vector{Union{Missing, String}}:
 "B"
 "B"
 "B"
 missing
 missing
In [150]:
@btime Vector{Union{Missing,String}}(z) # or do the conversion
  189.995 ns (1 allocation: 128 bytes)
Out[150]:
5-element Vector{Union{Missing, String}}:
 "B"
 "B"
 "B"
 missing
 missing
In [158]:
Vector(x)
Out[158]:
4-element Vector{CategoricalValue{String, UInt32}}:
 "A"
 "B"
 "B"
 "C"
In [160]:
Vector{String}(x)
Out[160]:
4-element Vector{String}:
 "A"
 "B"
 "B"
 "C"
In [167]:
Vector{String}(x) == Vector(x)
Out[167]:
true
In [168]:
Vector{String}(x) .* "aa"
Out[168]:
4-element Vector{String}:
 "Aaa"
 "Baa"
 "Baa"
 "Caa"
In [172]:
Vector(x) .* "aa"
MethodError: no method matching *(::CategoricalValue{String, UInt32}, ::String)
Closest candidates are:
  *(::Any, ::Any, ::Any, ::Any...) at operators.jl:560
  *(::Union{AbstractChar, AbstractString}, ::Union{AbstractChar, AbstractString}...) at strings/basic.jl:260
  *(::Missing, ::AbstractString) at missing.jl:175
  ...

Stacktrace:
 [1] _broadcast_getindex_evalf
   @ ./broadcast.jl:648 [inlined]
 [2] _broadcast_getindex
   @ ./broadcast.jl:621 [inlined]
 [3] getindex
   @ ./broadcast.jl:575 [inlined]
 [4] copy
   @ ./broadcast.jl:922 [inlined]
 [5] materialize(bc::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(*), Tuple{Vector{CategoricalValue{String, UInt32}}, Base.RefValue{String}}})
   @ Base.Broadcast ./broadcast.jl:883
 [6] top-level scope
   @ In[172]:1
 [7] eval
   @ ./boot.jl:360 [inlined]
 [8] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094
In [174]:
# recode some values in an array; has also in place recode! equivalent
recode([1,2,3,4,5,missing], 1=>10)
Out[174]:
6-element Vector{Union{Missing, Int64}}:
 10
  2
  3
  4
  5
   missing
In [177]:
# here we provided a default value for not mapped recodings
# missing is not affected by the default value
recode([1,2,3,4,5,missing],"a",1=>10,2=>20)
Out[177]:
6-element Vector{Union{Missing, Int64, String}}:
 10
 20
   "a"
   "a"
   "a"
   missing
In [179]:
# to recode missing values you have to specify the replacement explicitly, as below
recode([1,2,3,4,5,missing],1=>10,missing=>-1)
Out[179]:
6-element Vector{Int64}:
 10
  2
  3
  4
  5
 -1
In [180]:
t = categorical([1:5; missing])
Out[180]:
6-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 1
 2
 3
 4
 5
 missing
In [182]:
levels(t)
Out[182]:
5-element Vector{Int64}:
 1
 2
 3
 4
 5
In [202]:
t[1]=20; t[2]=30; t[3]=4; push!(t,22)
Out[202]:
7-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 20
 30
 4
 4
 5
 missing
 22
In [203]:
levels(t)
Out[203]:
9-element Vector{Int64}:
  1
  2
  3
  4
  5
 10
 20
 30
 22
In [204]:
unwrap.(t)
Out[204]:
7-element Vector{Union{Missing, Int64}}:
 20
 30
  4
  4
  5
   missing
 22
In [205]:
droplevels!(t)
Out[205]:
7-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 20
 30
 4
 4
 5
 missing
 22
In [207]:
levels(t)
Out[207]:
5-element Vector{Int64}:
  4
  5
 20
 30
 22
In [208]:
t = categorical([1:5;missing])
Out[208]:
6-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 1
 2
 3
 4
 5
 missing
In [209]:
recode!(t,[1,3]=>2)
Out[209]:
6-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 2
 2
 2
 4
 5
 missing
In [210]:
levels(t)
Out[210]:
3-element Vector{Int64}:
 2
 4
 5
In [224]:
t = categorical([3,1,2], ordered=true)
Out[224]:
3-element CategoricalArray{Int64,1,UInt32}:
 3
 1
 2
In [225]:
levels(t)
Out[225]:
3-element Vector{Int64}:
 1
 2
 3
In [226]:
# and if you introduce new levels they are added at the end in the order of appearance
levels(recode(t,2=>0,1=>-1))
Out[226]:
3-element Vector{Int64}:
  3
  0
 -1
In [229]:
# when using default it becomes the last level
t = categorical([1,2,3,4,5],ordered=true)
levels(recode(t,300,[1,2]=>100,3=>200))
Out[229]:
3-element Vector{Int64}:
 100
 200
 300

Comparisons

In [231]:
x = categorical([1,2,3])
xs = [x, categorical(x), categorical(x,ordered=true), categorical(x,ordered=true)]
Out[231]:
4-element Vector{CategoricalVector{Int64, UInt32, Int64, CategoricalValue{Int64, UInt32}, Union{}}}:
 [1, 2, 3]
 [1, 2, 3]
 [1, 2, 3]
 [1, 2, 3]
In [232]:
levels(xs[2])
Out[232]:
3-element Vector{Int64}:
 1
 2
 3
In [233]:
levels!(xs[2],[3,2,1])
Out[233]:
3-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 3
In [234]:
levels!(xs[4],[2,3,1])
Out[234]:
3-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 3
In [236]:
[a == b for a in xs, b in xs] # all are equal - comparison only by contents
Out[236]:
4×4 Matrix{Bool}:
 1  1  1  1
 1  1  1  1
 1  1  1  1
 1  1  1  1
In [237]:
xs[1] == xs[2]
Out[237]:
true
In [238]:
# the "full signature" of a CategoricalArray: its contents, its levels
# and whether it is ordered
function signature(x::CategoricalArray)
    return (x, levels(x), isordered(x))
end

# all are different, notice that xs[1] and xs[2] are unordered but have a 
# different order of levels
[signature(a) == signature(b) for a in xs, b in xs]
Out[238]:
4×4 Matrix{Bool}:
 1  0  0  0
 0  1  0  0
 0  0  1  0
 0  0  0  1
In [240]:
# you cannot compare elements of unordered CategoricalArray
x[1] < x[2]
ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this

Stacktrace:
 [1] <(x::CategoricalValue{Int64, UInt32}, y::CategoricalValue{Int64, UInt32})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/value.jl:150
 [2] top-level scope
   @ In[240]:2
 [3] eval
   @ ./boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094
In [241]:
t[1] < t[2] # but you can do it for an ordered one
Out[241]:
true
In [242]:
# isless works within the same CategoricalArray even if it is not ordered
isless(x[1],x[2])
Out[242]:
true
In [244]:
y = deepcopy(x)
Out[244]:
3-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 3
In [246]:
x === y, x==y
Out[246]:
(false, true)
In [248]:
# but not across categorical arrays: x and y have different pools,
# so isless on their elements throws an ArgumentError
isless(x[1], y[2])
ArgumentError: CategoricalValue objects with different pools cannot be tested for order

Stacktrace:
 [1] <(x::CategoricalValue{Int64, UInt32}, y::CategoricalValue{Int64, UInt32})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/value.jl:148
 [2] top-level scope
   @ In[248]:2
 [3] eval
   @ ./boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094
In [249]:
# you can use unwrap to compare the underlying values of elements
# that come from different CategoricalArrays
isless(unwrap(x[1]), unwrap(y[2]))
Out[249]:
true
In [250]:
# equality tests work OK across CategoricalArrays
x[1]  == y[2]
Out[250]:
false

Categorical columns in a DataFrame

In [251]:
df = DataFrame(x=1:3,y='a':'c',z=["a","b","c"])
Out[251]:

3 rows × 3 columns

xyz
Int64CharString
11aa
22bb
33cc

Convert all string columns to categorical (`transform` returns a new data frame; `transform!` below does it in place)

In [258]:
# broadcast `.=>` so that `categorical` is applied to each String column
# separately (a plain `=>` would pass all matched columns to a single call)
transform(df, names(df, String) .=> categorical, renamecols=false)
Out[258]:

3 rows × 3 columns

xyz
Int64CharCat…
11aa
22bb
33cc
In [257]:
names(df, Union{Int,Char})
Out[257]:
2-element Vector{String}:
 "x"
 "y"

Convert multiple columns to categorical

In [260]:
transform(df, names(df,Union{String,Char}) .=> categorical, renamecols=false)
Out[260]:

3 rows × 3 columns

xyz
Int64Cat…Cat…
11aa
22bb
33cc
In [261]:
transform(df, [:x,:z] .=> categorical, renamecols=false)
Out[261]:

3 rows × 3 columns

xyz
Cat…CharCat…
11aa
22bb
33cc
In [262]:
# in-place variant; broadcast `.=>` so that `categorical` is applied to each
# String column separately
transform!(df, names(df, String) .=> categorical, renamecols=false)
Out[262]:

3 rows × 3 columns

xyz
Int64CharCat…
11aa
22bb
33cc
In [263]:
describe(df)
Out[263]:

3 rows × 7 columns

variablemeanminmedianmaxnmissingeltype
SymbolUnion…AnyUnion…AnyInt64DataType
1x2.012.030Int64
2yac0Char
3zac0CategoricalValue{String, UInt32}
In [ ]: