using DataFrames
# CategoricalArrays.jl is independent from DataFrames.jl but
# it is often used in combination
using CategoricalArrays
using Pipe, BenchmarkTools
# --- Constructing categorical arrays ---------------------------------------
x = categorical(["A", "B", "B", "C"]) # unordered
# ordered, by default order is sorting order
y = categorical(["A", "B", "B", "C"]; ordered=true)
z = categorical(["A", "B", "B", "C", missing]) # unordered with missing
# ordered, into equal counts, possible to rename labels and give
# custom break
# Here the number of breaks is 5: the data is split into 5 intervals.
# NOTE(review): the original note says the upper bound of an interval is
# returned in two forms, open and closed - confirm against the printed output.
c = cut(1:10, 5)
# (we will cover grouping later, but let us here use it to analyze the
# results, we use Chain.jl for chaining)
# six evenly spaced points between 1 and 10, built in two equivalent ways
LinRange(1, 10, 6)
collect(range(1, 10; length=6))
using Chain
@chain DataFrame(x=cut(randn(100000), 10)) begin
    groupby(:x)
    combine(nrow) # just to make sure cut works right
end
# --- Inspecting order, levels and comparisons ------------------------------
v = categorical([1, 2, 2, 3, 3]) # contains integers not strings
# sometimes you need to convert back to a standard vector
Vector{Union{String,Missing}}(z)
arr = [x, y, z, c, v]
isordered.(arr)
ordered!(x, true), isordered(x) # make x ordered
ordered!(x, false), isordered(x) # and unordered again
# missing is not included in the levels
levels.(arr) # list levels
# missing is included in the unique values
unique.(arr) # missing will be included
isordered(y)
y[1] < y[2] # can compare as y is ordered
isordered(v)
# NOTE: the next line is a deliberate demonstration of a failing comparison
v[1] < v[2] # not comparable, v is unordered although it contains integers
y[2] < "A" # compare an element of the ordered array against a plain string
levels(y)
# --- Reordering and dropping levels ----------------------------------------
# Levels can be reordered; this matters mostly for ordered CategoricalArrays.
levels!(y, ["C", "B", "A"])
isordered(y)
y[1] < y[2]
# Every level that is present in the data has to be listed ...
levels!(z, ["A", "B"])
# ... unless the underlying array allows missings and removal of the
# unlisted levels is forced with `allowmissing=true`.
levels!(z, ["A", "B"]; allowmissing=true)
z[1] = "B"
# now z has only "B" entries
z
# but it remembers the levels it has (the reason is mostly performance)
levels(z)
# this way we can clean it up
droplevels!(z)
levels(z)
# for reference: the contents and levels of x and v
x
levels(x)
v
levels(v)
# --- Getting the underlying data back --------------------------------------
# even though the underlying data is Int, we cannot operate on it
# (the next line is a deliberate demonstration of that failure)
v[1] + v[2]
# you have either to retrieve the data by conversion (may be expensive)
Vector{Int}(v)
unwrap(v[1]) + unwrap(v[2]) # or get a single value
# unwrap applied to the whole array (not broadcast) - contrast with the
# broadcast form on the following line
unwrap(v) === v
unwrap.(v) # this will work for arrays without missing
# `$z` interpolates the global into the benchmark so that access to an
# untyped global variable is not part of what gets measured
@btime unwrap.($z) # also works on missing values
@btime Vector{Union{Missing,String}}($z) # or do the conversion
# converting with and without an explicit element type
Vector(x)
Vector{String}(x)
Vector{String}(x) == Vector(x)
Vector{String}(x) .* "aa"
Vector(x) .* "aa"
# --- Recoding values -------------------------------------------------------
# recode replaces selected values of an array; recode! is its in-place
# equivalent
recode([1, 2, 3, 4, 5, missing], 1 => 10)
# here a default value ("a") is provided for values without a mapping;
# the default is not applied to missing
recode([1, 2, 3, 4, 5, missing], "a", 1 => 10, 2 => 20)
# to change missing values, the replacement has to be specified explicitly,
# as below
recode([1, 2, 3, 4, 5, missing], 1 => 10, missing => -1)

# assigning and pushing values that are not yet levels
t = categorical([1:5; missing])
levels(t)
t[1] = 20
t[2] = 30
t[3] = 4
push!(t, 22)
levels(t)
unwrap.(t)
droplevels!(t)
levels(t)

# in-place recode that maps several values onto one
t = categorical([1:5; missing])
recode!(t, [1, 3] => 2)
levels(t)

# recoding an ordered array
t = categorical([3, 1, 2]; ordered=true)
levels(t)
# and if you introduce new levels they are added at the end in the order of
# appearance
levels(recode(t, 2 => 0, 1 => -1))
# when using default it becomes the last level
t = categorical([1, 2, 3, 4, 5]; ordered=true)
levels(recode(t, 300, [1, 2] => 100, 3 => 200))
# --- Equality of categorical arrays ----------------------------------------
x = categorical([1, 2, 3])
# four arrays with the same contents: x itself, an unordered copy and two
# ordered copies
xs = [
    x,
    categorical(x),
    categorical(x; ordered=true),
    categorical(x; ordered=true),
]
levels(xs[2])
# give the second and the fourth array a different order of levels
levels!(xs[2], [3, 2, 1])
levels!(xs[4], [2, 3, 1])
# all are equal - comparison only by contents
[xs[i] == xs[j] for i in eachindex(xs), j in eachindex(xs)]
xs[1] == xs[2]
"""
    signature(x::CategoricalArray)

Return the tuple `(x, levels(x), isordered(x))`, i.e. the array together with
its levels and its ordered flag. Comparing these tuples distinguishes arrays
that `==` alone reports as equal.
"""
function signature(x::CategoricalArray)
    lvls = levels(x)
    ordered = isordered(x)
    return (x, lvls, ordered)
end
# --- Comparing elements ----------------------------------------------------
# all are different, notice that xs[1] and xs[2] are unordered but have a
# different order of levels
[signature(a) == signature(b) for a in xs, b in xs]
# you cannot compare elements of unordered CategoricalArray
# (the next line is a deliberate demonstration of that failure)
x[1] < x[2]
t[1] < t[2] # but you can do it for an ordered one
# isless works within the same CategoricalArray even if it is not ordered
isless(x[1], x[2])
y = deepcopy(x) # an independent array with the same contents
x === y, x == y
# but not across categorical arrays
# NOTE(review): whether this call throws may depend on the CategoricalArrays
# version and on whether both arrays have equal levels - confirm.
isless(x[1], y[2])
# you can use unwrap to make a comparison of the contents of CategoricalArray
isless(unwrap(x[1]), unwrap(y[2]))
# equality tests work OK across CategoricalArrays
x[1] == y[2]
# --- Categorical columns in a DataFrame ------------------------------------
df = DataFrame(x=1:3, y='a':'c', z=["a", "b", "c"])
# Convert all String columns to categorical. `transform` returns a new data
# frame; the in-place variant `transform!` is used further below.
# `.=>` applies `categorical` to each selected column separately, so this
# also works when more than one column matches.
transform(df, names(df, String) .=> categorical; renamecols=false)
names(df, Union{Int,Char})
# Converting multiple columns to categorical
transform(df, names(df, Union{String,Char}) .=> categorical; renamecols=false)
transform(df, [:x, :z] .=> categorical; renamecols=false)
# Convert all String columns to categorical in-place
transform!(df, names(df, String) .=> categorical; renamecols=false)
describe(df)