Introduction to DataFrames¶

DataFrames v1.2, Julia 1.6.1

Extras - selected functionalities of selected packages¶

using DataFrames

using Pipe

FreqTables

using FreqTables

using CategoricalArrays

using BenchmarkTools

ENV["LINES"] = 15

15

df = DataFrame(a=rand('a':'d',1000), b=rand(["x","y","z"],1000))
# observe that dimensions are sorted if possible
ft = freqtable(df, :a, :b)

4×3 Named Matrix{Int64}
a ╲ b │  x   y   z
──────┼───────────
'a'   │ 78  86  80
'b'   │ 87  87  92
'c'   │ 79  86  84
'd'   │ 80  79  82

# you can index the result using numbers (positions) or names
ft[1,1], ft['b',"z"]

(78, 92)

# getting proportions - 1 means we want to calculate them in rows
# (first dimension)
pr = prop(ft, margins=1)

4×3 Named Matrix{Float64}
a ╲ b │        x         y         z
──────┼─────────────────────────────
'a'   │ 0.319672  0.352459  0.327869
'b'   │ 0.327068  0.327068  0.345865
'c'   │ 0.317269  0.345382  0.337349
'd'   │  0.33195  0.327801  0.340249

pr[1,:] |> sum,pr[4,:] |> sum

(1.0, 1.0)

# and columns are normalized to 1.0 now
pc = prop(ft, margins=2)

4×3 Named Matrix{Float64}
a ╲ b │        x         y         z
──────┼─────────────────────────────
'a'   │ 0.240741  0.254438  0.236686
'b'   │ 0.268519  0.257396  0.272189
'c'   │ 0.243827  0.254438  0.248521
'd'   │ 0.246914  0.233728  0.242604

pc[:,"x"] |> sum,pc[:,"y"] |> sum

(1.0, 1.0)

p = prop(ft)

4×3 Named Matrix{Float64}
a ╲ b │     x      y      z
──────┼────────────────────
'a'   │ 0.078  0.086   0.08
'b'   │ 0.087  0.087  0.092
'c'   │ 0.079  0.086  0.084
'd'   │  0.08  0.079  0.082

p |> sum

0.9999999999999998

x = categorical(rand(1:3,10))

10-element CategoricalArray{Int64,1,UInt32}:
 2
 2
 1
 2
 1
 1
 3
 3
 2
 1

# reordering levels and adding an extra level
levels!(x,[3,1,2,4])

# order is preserved and not-used level is shown
freqtable(x)

4-element Named Vector{Int64}
Dim1                              │ 
──────────────────────────────────┼──
CategoricalValue{Int64, UInt32} 3 │ 2
CategoricalValue{Int64, UInt32} 1 │ 4
CategoricalValue{Int64, UInt32} 2 │ 4
CategoricalValue{Int64, UInt32} 4 │ 0

# by default missings are listed
freqtable([1,1,2,3,missing])

4-element Named Vector{Int64}
Dim1    │ 
────────┼──
1       │ 2
2       │ 1
3       │ 1
missing │ 1

# but we can skip them
freqtable([1,1,2,3,missing],skipmissing=true)

3-element Named Vector{Int64}
Dim1  │ 
──────┼──
1     │ 2
2     │ 1
3     │ 1

df = DataFrame(a=rand(3:4,1_000),b=rand(5:6,1_000))
# now dimensions are numbers
ft = freqtable(df, :a, :b)

2×2 Named Matrix{Int64}
a ╲ b │   5    6
──────┼─────────
3     │ 265  249
4     │ 242  244

# this is an error - standard array indexing takes precedence
ft[3,5]

BoundsError: attempt to access 2×2 Matrix{Int64} at index [3, 5]

Stacktrace:
 [1] getindex
   @ ./array.jl:802 [inlined]
 [2] getindex(::NamedArrays.NamedMatrix{Int64, Matrix{Int64}, Tuple{OrderedCollections.OrderedDict{Int64, Int64}, OrderedCollections.OrderedDict{Int64, Int64}}}, ::Int64, ::Int64)
   @ NamedArrays ~/.julia/packages/NamedArrays/TuJLn/src/index.jl:17
 [3] top-level scope
   @ In[19]:2
 [4] eval
   @ ./boot.jl:360 [inlined]
 [5] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

# you have to use Name() wrapper
ft[FreqTables.Name(3),FreqTables.Name(5)]

265

DataFramesMeta.jl - working on `DataFrame`¶

DataFramesMeta.jl provides a terser syntax for working with data frames, which it implements with macros (metaprogramming).

using DataFramesMeta

df = DataFrame(x=1:8, y='a':'h',z=repeat([true,false],outer=4))

# expressions with columns of DataFrame
@btime DataFramesMeta.@with($df, :x + :z)

  653.782 ns (7 allocations: 576 bytes)

8-element Vector{Int64}:
 2
 2
 4
 4
 6
 6
 8
 8

# the same computation without the macro; both columns are interpolated with `$`
# so that the lookup of the non-constant global `df` is not part of the timing
@btime +($df.x, $df.z)

  134.350 ns (1 allocation: 144 bytes)

8-element Vector{Int64}:
 2
 2
 4
 4
 6
 6
 8
 8

# you can define code blocks
# (inside @with, symbols such as :x, :y and :z refer to the columns of df)
@with df begin
  a = :x[:z] # 1,3,5,7
  b = :x[.!:z] # 2,4,6,8
  :y + [a;b] # [a;b] : 1,3,5,7,2,4,6,8
end

8-element Vector{Char}:
 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase)
 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)
 'h': ASCII/Unicode U+0068 (category Ll: Letter, lowercase)
 'k': ASCII/Unicode U+006B (category Ll: Letter, lowercase)
 'g': ASCII/Unicode U+0067 (category Ll: Letter, lowercase)
 'j': ASCII/Unicode U+006A (category Ll: Letter, lowercase)
 'm': ASCII/Unicode U+006D (category Ll: Letter, lowercase)
 'p': ASCII/Unicode U+0070 (category Ll: Letter, lowercase)

# @with creates hard scope so variables do not leak out
a

UndefVarError: a not defined

Stacktrace:
 [1] top-level scope
   @ :0
 [2] eval
   @ ./boot.jl:360 [inlined]
 [3] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base ./loading.jl:1094

df2 = DataFrame(a=[:a,:b,:c])

# sometimes we want to work on a raw Symbol, ^() escapes it
@with(df2, :a .== ^(:a))

3-element BitVector:
 1
 0
 0

@with(df2, :a .== Symbol("a"))

3-element BitVector:
 1
 0
 0

x_str = "x"
y_str = "y"
df2 = DataFrame(x=1:3, y=4:6, z=7:9)

# cols(expression) selects given columns (deprecated - use $ instead, as shown below)
@with(df2,cols(x_str) + cols(y_str))

┌ Warning: cols(x) is deprecated, use $x instead
└ @ DataFramesMeta /home/shpark/.julia/packages/DataFramesMeta/mHJrB/src/parsing.jl:62
┌ Warning: cols(x) is deprecated, use $x instead
└ @ DataFramesMeta /home/shpark/.julia/packages/DataFramesMeta/mHJrB/src/parsing.jl:62

3-element Vector{Int64}:
 5
 7
 9

@with(df2,$(x_str) + $(y_str))

3-element Vector{Int64}:
 5
 7
 9

df

# a very useful macro for filtering
DataFramesMeta.@subset(df, :x .< 4, :z .== true)

# create a new DataFrame based on the old one
DataFramesMeta.@select(df, :x, y = 2*:x,z=:y)

# create a new DataFrame adding columns based on old one
@btime DataFramesMeta.@transform($df,:x = 2*:x, :y=:x)

  49.959 μs (228 allocations: 12.59 KiB)

@btime transform($df,:x=>ByRow(x->2*x)=>:x,:x=>:y)

  45.753 μs (214 allocations: 11.95 KiB)

# sorting into a new data frame, less powerful than sort,
# but lightweight
@btime DataFramesMeta.@orderby($df,:z,-:x)

  38.649 μs (183 allocations: 10.83 KiB)

@btime sort($df,[:z,:x],rev=[false,true])

  4.551 μs (49 allocations: 3.98 KiB)

using Chain

# chaining of operations on DataFrame
@chain df begin
  @subset(:x .< 5)
  @orderby(:z)
  @transform(:x²= :x .^2)
  @select(:z,:x,:x²)
end

DataFramesMeta - working on grouped `DataFrame`¶

df = DataFrame(a=1:12, b=repeat('a':'d', outer=3))

g = groupby(df, :b)

using Statistics

# groupby + combine in one shot
# (`df` is interpolated with `$`, as in the other benchmarks, so that access
# to the non-constant global variable is not included in the timing)
@btime DataFramesMeta.@by($df,:b,:first=first(:a),:last=last(:a),:mean=mean(:a))

  173.221 μs (486 allocations: 30.22 KiB)

# the same pipeline written with @chain; `df` is interpolated with `$`
# so that the global variable lookup is not included in the measurement
@btime @chain $df begin
  groupby(:b)
  combine(:a=>first=>:first,:a=>last=>:last,:a=>mean=>:mean)
end

  157.729 μs (444 allocations: 28.30 KiB)

# the same as @by but on a grouped DataFrame
@btime @combine($g,:first=first(:a),:last=last(:a), :mean=mean(:a))

  170.003 μs (485 allocations: 29.92 KiB)

# similar in DataFrames.jl 
@btime combine($g,:a .=> [first,last, mean] .=> [:first,:last,:mean])

  142.343 μs (411 allocations: 27.36 KiB)

# perform operations within a group and return ungrouped DataFrame
@btime @transform($g, :center=mean(:a), :centered = :a .- mean(:a))

  217.743 μs (622 allocations: 37.77 KiB)

@btime transform($g,:a.=>[mean,(a-> (a .- mean(a)))] .=> [:center, :centered])

  217.313 μs (592 allocations: 37.11 KiB)

# this is defined in DataFrames.jl
DataFrame(g)

# actually this is not the same as DataFrame()
# as it preserves the original row order
@transform(g)

DataFramesMeta - rowwise operations on `DataFrame`¶

df = DataFrame(a=1:12, b=repeat(1:4, outer=3))

# such conditions are often needed but are complex to write
@btime @transform($df, :x=ifelse.((:a .> 6) .& (:b .== 4),"yes","no"))

  38.167 μs (162 allocations: 9.62 KiB)

@btime transform($df, [:a,:b]=>
  ByRow((a,b)->ifelse((a > 6) & (b == 4),"yes","no"))=>:x)

  40.235 μs (165 allocations: 9.72 KiB)

# one option is to use a function that works on a single observation and 
# broadcast it 
"""
    myfunc(a, b)

Classify a single observation: return `"yes"` when `a > 6` and `b == 4`,
and `"no"` otherwise. Intended to be broadcast over columns with `myfunc.(…)`.
"""
function myfunc(a, b)
    if a > 6 && b == 4
        return "yes"
    end
    return "no"
end
@btime @transform($df, :x=myfunc.(:a,:b))

  39.701 μs (165 allocations: 9.72 KiB)

# or you can use @eachrow macro that allows you to process DataFrame rowwise
@btime @eachrow $df begin
  @newcol x::Vector{String}
    :x = :a > 6 && :b == 4 ? "yes" : "no"
end

  40.225 μs (171 allocations: 10.16 KiB)

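Very fast: the new column is assigned directly to `df` in place (note that this mutates `df`), so no new data frame is created.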

@btime @chain $df begin
  _.x = ifelse.((_.a .> 6) .&  (_.b .== 4) ,"yes","no")
end
df

  2.396 μs (12 allocations: 592 bytes)

@btime transform($df,[:a,:b]=>ByRow((a,b)->ifelse(a > 6 && b == 4,"yes","no"))=>:x)

  40.337 μs (169 allocations: 9.98 KiB)

You can also use eachrow from DataFrames to perform the same transformation.

Very fast: the loop writes into the preallocated column `x` of `df2` in place, so no new data frame is created.

df2 = copy(df) 
df2.x = Vector{String}(undef,nrow(df2))
@btime begin
  for row in eachrow($df2)
    row[:x] = row[:a] > 6 && row[:b] == 4 ? "yes" : "no"
  end
end
df2

  1.825 μs (0 allocations: 0 bytes)

Visualizing data with StatsPlots¶

using StatsPlots
default(fmt=:png)

We present only minimal functionality of the package.

using Random
Random.seed!(1)
df = DataFrame(x=sort(randn(1_000)), y=randn(1_000),
        z=[fill("b",500);fill("a",500)])

@df df plot(:x,:y, legend=:topleft,label="y(x)")

@df df density(:x,label="")

@df df histogram(:y, label="y")

@df df boxplot(:z,:x,label="x")

@df df violin(:z,:y,label="y")

	a	b	center	centered
	Int64	Char	Float64	Float64
1	1	a	5.0	-4.0
2	2	b	6.0	-4.0
3	3	c	7.0	-4.0
4	4	d	8.0	-4.0
5	5	a	5.0	0.0
6	6	b	6.0	0.0
7	7	c	7.0	0.0
8	8	d	8.0	0.0
9	9	a	5.0	4.0
10	10	b	6.0	4.0
11	11	c	7.0	4.0
12	12	d	8.0	4.0

	a	b	center	centered
	Int64	Char	Float64	Float64
1	1	a	5.0	-4.0
2	2	b	6.0	-4.0
3	3	c	7.0	-4.0
4	4	d	8.0	-4.0
5	5	a	5.0	0.0
6	6	b	6.0	0.0
7	7	c	7.0	0.0
8	8	d	8.0	0.0
9	9	a	5.0	4.0
10	10	b	6.0	4.0
11	11	c	7.0	4.0
12	12	d	8.0	4.0

	x	y	z
	Float64	Float64	String
1	-4.72636	-1.6144	b
2	-3.29987	0.683288	b
3	-2.7486	-1.95519	b
4	-2.63399	1.18536	b
5	-2.6274	1.58449	b
6	-2.62056	-0.144271	b
7	-2.59918	0.010576	b
8	-2.54967	0.019725	b
9	-2.52371	-0.588736	b
10	-2.46726	-0.40365	b
11	-2.46364	1.24812	b
12	-2.38138	-0.533295	b
13	-2.37316	1.05646	b
14	-2.28959	0.626849	b
15	-2.26709	-0.102321	b
⋮	⋮	⋮	⋮

	a
	Symbol
1	a
2	b
3	c

	z	x	x²
	Bool	Int64	Int64
1	0	2	4
2	0	4	16
3	1	1	1
4	1	3	9

Introduction to DataFrames¶

Extras - selected functionalities of selected packages¶

DataFramesMeta.jl - working on DataFrame¶

DataFramesMeta - working on grouped DataFrame¶

DataFramesMeta - rowwise operations on DataFrame¶

Visualizing data with StatsPlots¶

DataFramesMeta.jl - working on `DataFrame`¶

DataFramesMeta - working on grouped `DataFrame`¶

DataFramesMeta - rowwise operations on `DataFrame`¶