Introduction to DataFrames¶

DataFrames v1.2, Julia 1.6.1

using DataFrames

# CategoricalArrays.jl is independent from DataFrames.jl but 
# it is often used in combination
using CategoricalArrays

using Pipe, BenchmarkTools

Working with CategoricalArrays¶

Constructor¶

x = categorical(["A","B","B","C"]) # unordered

4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"

# ordered, by default order is sorting order
y = categorical(["A","B","B","C"],ordered=true)

4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"

z = categorical(["A","B","B","C",missing]) # unordered with missing

5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "A"
 "B"
 "B"
 "C"
 missing

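# ordered, into equal counts, possible to rename labels and give
# custom breaks
# here ngroups = 5: the data is split into 5 intervals; each interval is closed
# on the left and open on the right, except the last one, which is closed on both ends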
c = cut(1:10,5)

10-element CategoricalArray{String,1,UInt32}:
 "Q1: [1.0, 2.8)"
 "Q1: [1.0, 2.8)"
 "Q2: [2.8, 4.6)"
 "Q2: [2.8, 4.6)"
 "Q3: [4.6, 6.4)"
 "Q3: [4.6, 6.4)"
 "Q4: [6.4, 8.2)"
 "Q4: [6.4, 8.2)"
 "Q5: [8.2, 10.0]"
 "Q5: [8.2, 10.0]"

(We will cover grouping later, but let us use it here to analyze the results; we use Chain.jl for chaining.)

LinRange(1,10,6)

6-element LinRange{Float64}:
 1.0,2.8,4.6,6.4,8.2,10.0

# materialize the range as a Vector; `collect` avoids splatting the range
# into an array literal and gives the same result
collect(range(1, 10, length=6))

6-element Vector{Float64}:
  1.0
  2.8
  4.6
  6.4
  8.2
 10.0

using Chain

# Sanity check for `cut`: bin 100000 standard-normal draws into 10 groups,
# put the bins into column `x` of a DataFrame, and count the rows per group.
@chain randn(100000) begin
    cut(10)
    DataFrame(x=_)
    groupby(:x)
    combine(nrow) # just to make sure cut works right
end

v = categorical([1,2,2,3,3]) # contains integers not strings

5-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 2
 3
 3

# sometimes you need to convert back to a standard vector
Vector{Union{String,Missing}}(z)

5-element Vector{Union{Missing, String}}:
 "A"
 "B"
 "B"
 "C"
 missing

Managing levels¶

arr = [x,y,z,c,v]

5-element Vector{CategoricalVector{T, UInt32, V, C, U} where {T, V, C, U}}:
 CategoricalValue{String, UInt32}["A", "B", "B", "C"]
 CategoricalValue{String, UInt32}["A", "B", "B", "C"]
 Union{Missing, CategoricalValue{String, UInt32}}["A", "B", "B", "C", missing]
 CategoricalValue{String, UInt32}["Q1: [1.0, 2.8)", "Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]", "Q5: [8.2, 10.0]"]
 CategoricalValue{Int64, UInt32}[1, 2, 2, 3, 3]

isordered.(arr)

5-element BitVector:
 0
 1
 0
 1
 0

ordered!(x,true), isordered(x) # make x ordered

(CategoricalValue{String, UInt32}["A", "B", "B", "C"], true)

ordered!(x,false), isordered(x) # and unordered again

(CategoricalValue{String, UInt32}["A", "B", "B", "C"], false)

missing is not included

levels.(arr) # list levels

5-element Vector{Vector{T} where T}:
 ["A", "B", "C"]
 ["A", "B", "C"]
 ["A", "B", "C"]
 ["Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]"]
 [1, 2, 3]

missing is included

unique.(arr) # missing will be included

5-element Vector{Vector{T} where T}:
 ["A", "B", "C"]
 ["A", "B", "C"]
 Union{Missing, String}["A", "B", "C", missing]
 ["Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]"]
 [1, 2, 3]

isordered(y)

true

y[1] < y[2] # can compare as y is ordered

true

isordered(v)

false

v[1] < v[2] # not comparable, v is unordered although it contains integers

ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this

Stacktrace:
 [1] <(x::CategoricalValue{Int64, UInt32}, y::CategoricalValue{Int64, UInt32})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/value.jl:150
 [2] top-level scope
   @ In[77]:1
 [3] eval
   @ ./boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

y[2] < "A"

false

levels(y)

3-element Vector{String}:
 "A"
 "B"
 "C"

# you can reorder levels, mostly useful for ordered CategoricalArrays
levels!(y,["C","B","A"])

4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"

isordered(y)

true

y[1] < y[2]

false

# you have to specify all levels that are present
levels!(z,["A","B"])

ArgumentError: cannot remove level "C" as it is used at position 4 and allowmissing=false.

Stacktrace:
 [1] levels!(A::CategoricalVector{Union{Missing, String}, UInt32, String, CategoricalValue{String, UInt32}, Missing}, newlevels::Vector{String}; allowmissing::Bool, allow_missing::Nothing)
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/array.jl:797
 [2] levels!(A::CategoricalVector{Union{Missing, String}, UInt32, String, CategoricalValue{String, UInt32}, Missing}, newlevels::Vector{String})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/array.jl:778
 [3] top-level scope
   @ In[113]:2
 [4] eval
   @ ./boot.jl:360 [inlined]
 [5] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

# unless the underlying array allows for missings and you force removal of levels
levels!(z,["A","B"], allowmissing=true)

5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "A"
 "B"
 "B"
 missing
 missing

z[1] = "B"
z # now z has only "B" entries

5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "B"
 "B"
 "B"
 missing
 missing

levels(z) # but it remembers the levels it has(the reason is mostly performance)

2-element Vector{String}:
 "A"
 "B"

droplevels!(z) # this way we can clean it up

5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "B"
 "B"
 "B"
 missing
 missing

levels(z)

1-element Vector{String}:
 "B"

Data manipulation¶

x

4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"

levels(x)

3-element Vector{String}:
 "A"
 "B"
 "C"

v

5-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 2
 3
 3

levels(v)

3-element Vector{Int64}:
 1
 2
 3

# even though the underlying data is Int, we cannot operate on it
v[1] + v[2]

MethodError: no method matching +(::CategoricalValue{Int64, UInt32}, ::CategoricalValue{Int64, UInt32})
Closest candidates are:
  +(::Any, ::Any, ::Any, ::Any...) at operators.jl:560

Stacktrace:
 [1] top-level scope
   @ In[126]:2
 [2] eval
   @ ./boot.jl:360 [inlined]
 [3] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

# you have either to retrieve the data by conversion (may be expensive)
Vector{Int}(v)

5-element Vector{Int64}:
 1
 2
 2
 3
 3

unwrap(v[1]) + unwrap(v[2]) # or get a single value

3

unw

MethodError: no method matching +(::CategoricalValue{Int64, UInt32}, ::Int64)
Closest candidates are:
  +(::Any, ::Any, ::Any, ::Any...) at operators.jl:560
  +(::T, ::T) where T<:Union{Int128, Int16, Int32, Int64, Int8, UInt128, UInt16, UInt32, UInt64, UInt8} at int.jl:87
  +(::T, ::Integer) where T<:AbstractChar at char.jl:223
  ...

Stacktrace:
  [1] _broadcast_getindex_evalf
    @ ./broadcast.jl:648 [inlined]
  [2] _broadcast_getindex
    @ ./broadcast.jl:621 [inlined]
  [3] getindex
    @ ./broadcast.jl:575 [inlined]
  [4] copy
    @ ./broadcast.jl:922 [inlined]
  [5] materialize
    @ ./broadcast.jl:883 [inlined]
  [6] broadcast_preserving_zero_d
    @ ./broadcast.jl:872 [inlined]
  [7] +(A::CategoricalVector{Int64, UInt32, Int64, CategoricalValue{Int64, UInt32}, Union{}}, B::Vector{Int64})
    @ Base ./arraymath.jl:39
  [8] top-level scope
    @ In[137]:1
  [9] eval
    @ ./boot.jl:360 [inlined]
 [10] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1094

unwrap(v) === v

true

unwrap.(v) # this will work for arrays without missing

5-element Vector{Int64}:
 1
 2
 2
 3
 3

# `$z` interpolates the global into the benchmark, so the measurement does not
# include the overhead of accessing a non-constant global variable
@btime unwrap.($z)  # also works on missing values

  2.184 μs (14 allocations: 736 bytes)

5-element Vector{Union{Missing, String}}:
 "B"
 "B"
 "B"
 missing
 missing

# `$z` interpolates the global into the benchmark, so the measurement does not
# include the overhead of accessing a non-constant global variable
@btime Vector{Union{Missing,String}}($z) # or do the conversion

  189.995 ns (1 allocation: 128 bytes)

5-element Vector{Union{Missing, String}}:
 "B"
 "B"
 "B"
 missing
 missing

Vector(x)

4-element Vector{CategoricalValue{String, UInt32}}:
 "A"
 "B"
 "B"
 "C"

Vector{String}(x)

4-element Vector{String}:
 "A"
 "B"
 "B"
 "C"

Vector{String}(x) == Vector(x)

true

Vector{String}(x) .* "aa"

4-element Vector{String}:
 "Aaa"
 "Baa"
 "Baa"
 "Caa"

Vector(x) .* "aa"

MethodError: no method matching *(::CategoricalValue{String, UInt32}, ::String)
Closest candidates are:
  *(::Any, ::Any, ::Any, ::Any...) at operators.jl:560
  *(::Union{AbstractChar, AbstractString}, ::Union{AbstractChar, AbstractString}...) at strings/basic.jl:260
  *(::Missing, ::AbstractString) at missing.jl:175
  ...

Stacktrace:
 [1] _broadcast_getindex_evalf
   @ ./broadcast.jl:648 [inlined]
 [2] _broadcast_getindex
   @ ./broadcast.jl:621 [inlined]
 [3] getindex
   @ ./broadcast.jl:575 [inlined]
 [4] copy
   @ ./broadcast.jl:922 [inlined]
 [5] materialize(bc::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(*), Tuple{Vector{CategoricalValue{String, UInt32}}, Base.RefValue{String}}})
   @ Base.Broadcast ./broadcast.jl:883
 [6] top-level scope
   @ In[172]:1
 [7] eval
   @ ./boot.jl:360 [inlined]
 [8] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

# recode some values in an array; there is also an in-place recode! equivalent
recode([1,2,3,4,5,missing], 1=>10)

6-element Vector{Union{Missing, Int64}}:
 10
  2
  3
  4
  5
   missing

# here we provide a default value for values that are not mapped
# the default is not applied to missing
recode([1,2,3,4,5,missing],"a",1=>10,2=>20)

6-element Vector{Union{Missing, Int64, String}}:
 10
 20
   "a"
   "a"
   "a"
   missing

# to recode missing you have to map it explicitly to a new value, as below
recode([1,2,3,4,5,missing],1=>10,missing=>-1)

6-element Vector{Int64}:
 10
  2
  3
  4
  5
 -1

t = categorical([1:5; missing])

6-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 1
 2
 3
 4
 5
 missing

levels(t)

5-element Vector{Int64}:
 1
 2
 3
 4
 5

t[1]=20; t[2]=30; t[3]=4; push!(t,22)

7-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 20
 30
 4
 4
 5
 missing
 22

levels(t)

8-element Vector{Int64}:
  1
  2
  3
  4
  5
 20
 30
 22

unwrap.(t)

7-element Vector{Union{Missing, Int64}}:
 20
 30
  4
  4
  5
   missing
 22

droplevels!(t)

7-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 20
 30
 4
 4
 5
 missing
 22

levels(t)

5-element Vector{Int64}:
  4
  5
 20
 30
 22

t = categorical([1:5;missing])

6-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 1
 2
 3
 4
 5
 missing

recode!(t,[1,3]=>2)

6-element CategoricalArray{Union{Missing, Int64},1,UInt32}:
 2
 2
 2
 4
 5
 missing

levels(t)

3-element Vector{Int64}:
 2
 4
 5

t = categorical([3,1,2], ordered=true)

3-element CategoricalArray{Int64,1,UInt32}:
 3
 1
 2

levels(t)

3-element Vector{Int64}:
 1
 2
 3

# and if you introduce new levels they are added at the end in the order of appearance
levels(recode(t,2=>0,1=>-1))

3-element Vector{Int64}:
  3
  0
 -1

# when using default it becomes the last level
t = categorical([1,2,3,4,5],ordered=true)
levels(recode(t,300,[1,2]=>100,3=>200))

3-element Vector{Int64}:
 100
 200
 300

Comparisons¶

x = categorical([1,2,3])
xs = [x, categorical(x), categorical(x,ordered=true), categorical(x,ordered=true)]

4-element Vector{CategoricalVector{Int64, UInt32, Int64, CategoricalValue{Int64, UInt32}, Union{}}}:
 [1, 2, 3]
 [1, 2, 3]
 [1, 2, 3]
 [1, 2, 3]

levels(xs[2])

3-element Vector{Int64}:
 1
 2
 3

levels!(xs[2],[3,2,1])

3-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 3

levels!(xs[4],[2,3,1])

3-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 3

[a == b for a in xs, b in xs] # all are equal - comparison only by contents

4×4 Matrix{Bool}:
 1  1  1  1
 1  1  1  1
 1  1  1  1
 1  1  1  1

xs[1] == xs[2]

true

# the full "signature" of a CategoricalArray: the array itself (its contents),
# its levels in their current order, and whether it is ordered
function signature(x::CategoricalArray)
    return (x, levels(x), isordered(x))
end

# all are different, notice that xs[1] and xs[2] are unordered but have a 
# different order of levels
[signature(a) == signature(b) for a in xs, b in xs]

4×4 Matrix{Bool}:
 1  0  0  0
 0  1  0  0
 0  0  1  0
 0  0  0  1

# you cannot compare elements of unordered CategoricalArray
x[1] < x[2]

ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this

Stacktrace:
 [1] <(x::CategoricalValue{Int64, UInt32}, y::CategoricalValue{Int64, UInt32})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/value.jl:150
 [2] top-level scope
   @ In[240]:2
 [3] eval
   @ ./boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

t[1] < t[2] # but you can do it for an ordered one

true

# isless works within the same CategoricalArray even if it is not ordered
isless(x[1],x[2])

true

y = deepcopy(x)

3-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 3

x === y, x==y

(false, true)

# but not across categorical arrays: `y` is a deep copy of `x`, so the two values
# come from different pools and `isless` refuses to order them
isless(x[1], y[2])

ArgumentError: CategoricalValue objects with different pools cannot be tested for order

Stacktrace:
 [1] <(x::CategoricalValue{Int64, UInt32}, y::CategoricalValue{Int64, UInt32})
   @ CategoricalArrays ~/.julia/packages/CategoricalArrays/Fr04b/src/value.jl:148
 [2] top-level scope
   @ In[248]:2
 [3] eval
   @ ./boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

# you can use unwrap to make a comparison of the contents of CategoricalArray
isless(unwrap(x[1]), unwrap(y[2]))

true

# equality tests work OK across CategoricalArrays
x[1]  == y[2]

false

Categorical columns in a DataFrame¶

df = DataFrame(x=1:3,y='a':'c',z=["a","b","c"])

Convert all string columns to categorical (`transform` returns a new DataFrame; `transform!` further below does it in-place)

# Broadcast the pair (`.=>`) so that `categorical` is applied to each String column
# separately; a plain `=>` passes all matched columns to a single `categorical`
# call, which only works when exactly one column matches.
transform(df, names(df, String) .=> categorical, renamecols=false)

names(df, Union{Int,Char})

2-element Vector{String}:
 "x"
 "y"

Converting multiple columns to categorical

transform(df, names(df,Union{String,Char}) .=> categorical, renamecols=false)

transform(df, [:x,:z] .=> categorical, renamecols=false)

# In-place version: broadcast the pair (`.=>`) so that `categorical` is applied to
# each String column separately; a plain `=>` passes all matched columns to a
# single `categorical` call, which only works when exactly one column matches.
transform!(df, names(df, String) .=> categorical, renamecols=false)

describe(df)

	variable	mean	min	median	max	nmissing	eltype
	Symbol	Union…	Any	Union…	Any	Int64	DataType
1	x	2.0	1	2.0	3	0	Int64
2	y		a		c	0	Char
3	z		a		c	0	CategoricalValue{String, UInt32}

	x	nrow
	Cat…	Int64
1	Q1: [-4.60933, -1.27672)	10000
2	Q2: [-1.27672, -0.837586)	10000
3	Q3: [-0.837586, -0.522596)	10000
4	Q4: [-0.522596, -0.252172)	10000
5	Q5: [-0.252172, -0.00358539)	10000
6	Q6: [-0.00358539, 0.24998)	10000
7	Q7: [0.24998, 0.522463)	10000
8	Q8: [0.522463, 0.837398)	10000
9	Q9: [0.837398, 1.28127)	10000
10	Q10: [1.28127, 4.83148]	10000