using DataFrames
using Pipe
using FreqTables
using CategoricalArrays
using BenchmarkTools
# keep printed output short
ENV["LINES"] = 15
df = DataFrame(; a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000))
# cross-tabulate :a against :b; dimension names are sorted when possible
ft = freqtable(df, :a, :b)
# cells can be addressed either by position or by name
ft[1, 1], ft['b', "z"]
# proportions with margins=1 are computed within rows (first dimension),
# so the row sums checked below should be 1.0
pr = prop(ft; margins=1)
sum(pr[1, :]), sum(pr[4, :])
# with margins=2 the columns are normalized to 1.0 instead
pc = prop(ft; margins=2)
sum(pc[:, "x"]), sum(pc[:, "y"])
# no margins given: proportions of the whole table (total checked below)
p = prop(ft)
sum(p)
x = categorical(rand(1:3, 10))
# reorder the levels and add an extra, unused level
levels!(x, [3, 1, 2, 4])
# the level order is preserved and the unused level is shown
freqtable(x)
# by default missing values are listed
freqtable([1, 1, 2, 3, missing])
# but they can be skipped
freqtable([1, 1, 2, 3, missing]; skipmissing=true)
df = DataFrame(; a=rand(3:4, 1000), b=rand(5:6, 1000))
# both dimensions now carry numeric names
ft = freqtable(df, :a, :b)
# this is an error: standard (positional) array indexing takes precedence
ft[3, 5]
# wrap the values in Name() to look cells up by name
ft[FreqTables.Name.((3, 5))...]
DataFrame
# DataFramesMeta.jl provides a terser syntax thanks to metaprogramming.
using DataFramesMeta
df = DataFrame(x=1:8, y='a':'h', z=repeat([true, false], outer=4))
# expressions with columns of a DataFrame
@btime DataFramesMeta.@with($df, :x + :z)
# plain-Julia equivalent; every use of the global `df` is interpolated with `$`
# so that the benchmark does not include access to an untyped global variable
@btime +($df.x, $df.z)
# @with also accepts a whole code block
@with df begin
    a = :x[:z]      # 1,3,5,7
    b = :x[.!:z]    # 2,4,6,8
    :y + vcat(a, b) # vcat(a, b): 1,3,5,7,2,4,6,8
end
# @with creates a hard scope, so variables such as `a` do not leak out
a
df2 = DataFrame(; a=[:a, :b, :c])
# to work with a raw Symbol instead of a column reference, escape it with ^()
@with(df2, :a .== ^(:a))
@with(df2, :a .== Symbol("a"))
x_str, y_str = "x", "y"
df2 = DataFrame(; x=1:3, y=4:6, z=7:9)
# cols(expression) selects the column given by the expression
@with(df2, cols(x_str) + cols(y_str))
# the same lookup written with $ interpolation
@with(df2, $(x_str) + $(y_str))
df
# a very useful macro for filtering rows
DataFramesMeta.@subset(df, :x .< 4, :z .== true)
# create a new DataFrame based on the old one; new columns are named with the
# `:name = expr` form, the same form the @transform call below uses
DataFramesMeta.@select(df, :x, :y = 2 * :x, :z = :y)
# create a new DataFrame adding columns based on the old one
@btime DataFramesMeta.@transform($df, :x = 2 * :x, :y = :x)
@btime transform($df, :x => ByRow(x -> 2 * x) => :x, :x => :y)
# sorting into a new data frame, less powerful than sort,
# but lightweight
@btime DataFramesMeta.@orderby($df, :z, -:x)
@btime sort($df, [:z, :x], rev=[false, true])
using Chain
# chaining of operations on a DataFrame: each step receives the result of
# the previous one
@chain df begin
    @subset(:x .< 5)
    @orderby(:z)
    @transform(:x² = :x .^ 2)
    @select(:z, :x, :x²)
end
DataFrame
df = DataFrame(a=1:12, b=repeat('a':'d', outer=3))
g = groupby(df, :b)
using Statistics
# groupby + combine in one shot; `df` is interpolated with `$` so the
# benchmark does not include access to an untyped global variable
@btime DataFramesMeta.@by($df, :b, :first = first(:a), :last = last(:a), :mean = mean(:a))
# the same aggregation written as an explicit groupby -> combine chain
@btime @chain $df begin
    groupby(:b)
    combine(:a => first => :first, :a => last => :last, :a => mean => :mean)
end
# same as @by, but applied to an already grouped DataFrame
@btime @combine($g, :first = first(:a), :last = last(:a), :mean = mean(:a))
# the similar call in DataFrames.jl
@btime combine($g, :a .=> [first, last, mean] .=> [:first, :last, :mean])
# perform operations within each group and return an ungrouped DataFrame
@btime @transform($g, :center = mean(:a), :centered = :a .- mean(:a))
@btime transform($g, :a .=> [mean, a -> a .- mean(a)] .=> [:center, :centered])
# this constructor is defined in DataFrames.jl
DataFrame(g)
# @transform(g) is actually not the same as DataFrame(g),
# as it preserves the original row order
@transform(g)
DataFrame
df = DataFrame(a=1:12, b=repeat(1:4, outer=3))
# such conditions are often needed but are complex to write
@btime @transform($df, :x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no"))
# the DataFrames.jl version, applying the condition row by row
@btime transform(
    $df, [:a, :b] => ByRow((a, b) -> ifelse((a > 6) & (b == 4), "yes", "no")) => :x
)
# one option is to write a function that handles a single observation and
# then broadcast it: returns "yes" when a > 6 and b == 4, "no" otherwise
function myfunc(a, b)
    if a > 6 && b == 4
        return "yes"
    end
    return "no"
end
@btime @transform($df, :x = myfunc.(:a, :b))
# alternatively, the @eachrow macro lets you process the DataFrame row by row
@btime @eachrow $df begin
    @newcol x::Vector{String}
    :x = (:a > 6 && :b == 4) ? "yes" : "no"
end
# Very fast!!!
# assign column x directly on the data frame passed through the chain
@btime @chain $df begin
    _.x = ifelse.((_.a .> 6) .& (_.b .== 4), "yes", "no")
end
df
# the same condition evaluated row by row with ByRow
@btime transform(
    $df, [:a, :b] => ByRow((a, b) -> ifelse(a > 6 && b == 4, "yes", "no")) => :x
)
# You can also use eachrow from DataFrames.jl
# to perform the same transformation.
# Very fast!!!
df2 = copy(df)
# replace column x with an uninitialized String vector; the loop fills it
df2.x = Vector{String}(undef, nrow(df2))
@btime for row in eachrow($df2)
    row.x = if row.a > 6 && row.b == 4
        "yes"
    else
        "no"
    end
end
df2
using StatsPlots
# set the default figure format to PNG
default(; fmt=:png)
# We present only a minimal subset of the package's functionality.
using Random
# fix the seed so the random data is reproducible
Random.seed!(1)
df = DataFrame(
    x=sort(randn(1_000)),
    y=randn(1_000),
    z=vcat(fill("b", 500), fill("a", 500)),
)
# @df lets the plotting calls refer to columns of df by Symbol
@df df plot(:x, :y, legend=:topleft, label="y(x)")
@df df density(:x, label="")
@df df histogram(:y, label="y")
# plots of :x and :y grouped by the labels in :z
@df df boxplot(:z, :x, label="x")
@df df violin(:z, :y, label="y")