Introduction to DataFrames

11_performance

DataFrame v1.2, Julia 1.6.1

In [11]:
using DataFrames
using Pipe
using Chain
using CategoricalArrays
using PooledArrays
using BenchmarkTools
using Random
using LinearAlgebra
In [3]:
# row가 10줄만 표시 되게 설정
ENV["LINES"] = 10
Out[3]:
10

Performance tips

Access by column number is faster than by name

In [2]:
x = DataFrame(rand(5,1000), :auto)
Out[2]:

5 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.03979150.4158820.976530.2533790.2196310.6188820.4361540.895649
20.1175710.4069940.9951620.1696460.08415980.08574340.2380270.870607
30.6518420.6018160.8073950.9462910.8879060.5580830.3885920.255522
40.5118180.8167550.3224320.7739360.3893020.5986620.9245420.114845
50.4481170.07196080.4241920.8269590.691310.2634920.5595470.677479
In [6]:
@btime $x[!,500]
@btime $x.x500;
  3.970 ns (0 allocations: 0 bytes)
  13.518 ns (0 allocations: 0 bytes)

When working with a DataFrame, use barrier functions or type annotations

In [23]:
xx = DataFrame(rand(1_000_000,2),:auto)
yy ,zz = xx[!,1],xx[!,2]
zz
Out[23]:
1000000-element Vector{Float64}:
 0.6589244269414078
 0.42719594738951416
 0.03351411006209215
 ⋮
 0.8169215747911429
 0.8637662308693475
In [8]:
"""
    f_bad()

Deliberately slow reference implementation: extract two columns from a
`DataFrame` and sum their elementwise products inside the same function.
Because `x[!, i]` on a `DataFrame` is only inferred as
`AbstractVector{T} where T`, the loop variables `y`, `z`, and `p` are
boxed as `Any` and every iteration allocates (see the `@code_warntype`
output below).
"""
function f_bad() # this function will be slow
  Random.seed!(1)
  # 1,000,000×2 data frame of random Float64 values; seeded so the
  # result is reproducible across the benchmark cells
  x = DataFrame(rand(1_000_000,2), :auto)
  # column types are not statically known here — this is the source of
  # the type instability
  y,z = x[!,1],x[!,2]
  p = 0.0
  for i in 1:nrow(x)
    p += y[i]*z[i]
  end
  p
end
Out[8]:
f_bad (generic function with 1 method)
In [10]:
@btime f_bad()
# if you run @code_warntype f_bad() then you notice
# that Julia does not know column types of `DataFrame`
  94.372 ms (5999013 allocations: 122.06 MiB)
Out[10]:
249788.77018817046

아래 그림에서 p,y,z 변수가 Any type으로 처리 되어 매우 느리게 된다.

image.png

In [24]:
@code_warntype f_bad()
Variables
  #self#::Core.Const(f_bad)
  @_2::Union{Nothing, Tuple{Int64, Int64}}
  p::Any
  z::AbstractVector{T} where T
  y::AbstractVector{T} where T
  x::DataFrame
  i::Int64

Body::Any
1 ─ %1  = Random.seed!::Core.Const(Random.seed!)
       (%1)(1)
 %3  = Main.rand(1000000, 2)::Matrix{Float64}
       (x = Main.DataFrame(%3, :auto))
 %5  = Base.getindex(x, Main.:!, 1)::AbstractVector{T} where T
 %6  = Base.getindex(x, Main.:!, 2)::AbstractVector{T} where T
       (y = %5)
       (z = %6)
       (p = 0.0)
 %10 = Main.nrow(x)::Int64
 %11 = (1:%10)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
       (@_2 = Base.iterate(%11))
 %13 = (@_2 === nothing)::Bool
 %14 = Base.not_int(%13)::Bool
└──       goto #4 if not %14
2 ┄ %16 = @_2::Tuple{Int64, Int64}::Tuple{Int64, Int64}
       (i = Core.getfield(%16, 1))
 %18 = Core.getfield(%16, 2)::Int64
 %19 = p::Any
 %20 = Base.getindex(y, i)::Any
 %21 = Base.getindex(z, i)::Any
 %22 = (%20 * %21)::Any
       (p = %19 + %22)
       (@_2 = Base.iterate(%11, %18))
 %25 = (@_2 === nothing)::Bool
 %26 = Base.not_int(%25)::Bool
└──       goto #4 if not %26
3 ─       goto #2
4 ┄       return p

Solution 1 is to use a barrier function (it should be possible to use it in almost any code)

In [12]:
"""
    f_inner(y, z)

Function barrier: sum the elementwise products of `y` and `z`
(an unrolled dot product). Because the columns are passed as arguments,
Julia compiles a specialized, type-stable method for their concrete types.

Throws `DimensionMismatch` if `y` and `z` do not share the same indices.
"""
function f_inner(y,z)
  p = 0.0
  # eachindex(y, z) is the idiomatic replacement for 1:length(y): it is
  # correct for any index style (e.g. offset arrays) and fails fast when
  # the two arrays disagree in length instead of hitting a BoundsError
  # mid-loop.
  for i in eachindex(y, z)
    p += y[i]*z[i]
  end
  return p
end

"""
    f_barrier()

Build a seeded 1,000,000×2 random data frame and delegate the hot loop
to `f_inner`, so the kernel is compiled for the concrete column types.
"""
function f_barrier() # extract the work to an inner function
  Random.seed!(1)
  df = DataFrame(rand(1_000_000, 2), :auto)
  return f_inner(df[!, 1], df[!, 2])
end

"""
    f_inbuilt()

Same computation as `f_barrier`, but using `LinearAlgebra.dot` instead
of a hand-written loop — the built-in acts as the function barrier.
"""
function f_inbuilt() # or use inbuilt function if possible
  Random.seed!(1)
  df = DataFrame(rand(1_000_000, 2), :auto)
  return dot(df[!, 1], df[!, 2])
end
Out[12]:
f_inbuilt (generic function with 1 method)
In [25]:
@code_warntype f_barrier()
Variables
  #self#::Core.Const(f_barrier)
  x::DataFrame

Body::Any
1 ─ %1 = Random.seed!::Core.Const(Random.seed!)
      (%1)(1)
 %3 = Main.rand(1000000, 2)::Matrix{Float64}
      (x = Main.DataFrame(%3, :auto))
 %5 = Base.getindex(x, Main.:!, 1)::AbstractVector{T} where T
 %6 = Base.getindex(x, Main.:!, 2)::AbstractVector{T} where T
 %7 = Main.f_inner(%5, %6)::Any
└──      return %7

아래 f_inbuilt는 warning이 없다.

In [26]:
@code_warntype f_inbuilt()
In [19]:
@btime f_barrier()
@btime f_inbuilt()
  5.981 ms (35 allocations: 30.52 MiB)
  5.161 ms (35 allocations: 30.52 MiB)
Out[19]:
249788.7701881836

Solution 2 is to provide the types of the extracted columns. It is simpler, but there are cases in which you will not know these types. This example assumes that you have a DataFrames master at least from August 31, 2018.

In [16]:
"""
    f_typed()

Same computation as `f_bad`, but the extracted columns are annotated as
`Vector{Float64}`, which pins the types of `y`, `z`, and `p` so the loop
compiles to fast, type-stable code (see the `@code_warntype` output
below). Simpler than a function barrier, but only usable when the column
types are known in advance.
"""
function f_typed()
  Random.seed!(1)
  x = DataFrame(rand(1_000_000,2),:auto)
  # the ::Vector{Float64} annotations insert convert/typeassert calls,
  # after which the compiler knows the concrete element types
  y::Vector{Float64}, z::Vector{Float64} = x[!,1], x[!,2]
  p = 0.0
  for i in 1:nrow(x)
    p += y[i]*z[i]
  end
  p
end
Out[16]:
f_typed (generic function with 1 method)
In [18]:
@btime f_typed()
  5.968 ms (35 allocations: 30.52 MiB)
Out[18]:
249788.77018817046

아래 그림에서 보면 p,y,z가 Float64 type으로 변수 타입이 확정되어 있다.\ 따라서 Any type 보다는 처리가 훨씬 빠르다\ image.png

In [27]:
@code_warntype f_typed()
Variables
  #self#::Core.Const(f_typed)
  @_2::Union{Nothing, Tuple{Int64, Int64}}
  p::Float64
  z::Vector{Float64}
  y::Vector{Float64}
  x::DataFrame
  i::Int64

Body::Float64
1 ─ %1  = Random.seed!::Core.Const(Random.seed!)
       (%1)(1)
 %3  = Main.rand(1000000, 2)::Matrix{Float64}
       (x = Main.DataFrame(%3, :auto))
 %5  = Base.getindex(x, Main.:!, 1)::AbstractVector{T} where T
 %6  = Base.getindex(x, Main.:!, 2)::AbstractVector{T} where T
 %7  = Core.apply_type(Main.Vector, Main.Float64)::Core.Const(Vector{Float64})
 %8  = Base.convert(%7, %5)::Any
       (y = Core.typeassert(%8, %7))
 %10 = Core.apply_type(Main.Vector, Main.Float64)::Core.Const(Vector{Float64})
 %11 = Base.convert(%10, %6)::Any
       (z = Core.typeassert(%11, %10))
       (p = 0.0)
 %14 = Main.nrow(x)::Int64
 %15 = (1:%14)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
       (@_2 = Base.iterate(%15))
 %17 = (@_2 === nothing)::Bool
 %18 = Base.not_int(%17)::Bool
└──       goto #4 if not %18
2 ┄ %20 = @_2::Tuple{Int64, Int64}::Tuple{Int64, Int64}
       (i = Core.getfield(%20, 1))
 %22 = Core.getfield(%20, 2)::Int64
 %23 = p::Float64
 %24 = Base.getindex(y, i)::Float64
 %25 = Base.getindex(z, i)::Float64
 %26 = (%24 * %25)::Float64
       (p = %23 + %26)
       (@_2 = Base.iterate(%15, %22))
 %29 = (@_2 === nothing)::Bool
 %30 = Base.not_int(%29)::Bool
└──       goto #4 if not %30
3 ─       goto #2
4 ┄       return p

In general, for tall and narrow tables it is often useful to use Tables.rowtable, Tables.columntable, or Tables.namedtupleiterator for intermediate processing of data in a type-stable way.

Consider using delayed DataFrame creation technique

also notice the difference in performance between copying vs non-copying data frame creation

In [30]:
"""
    f1()

Fill a 10^4×100 `DataFrame` in place, column by column, operating on the
data frame directly. `copycols=false` reuses the freshly allocated
vectors instead of copying them. Returns the filled data frame.
"""
function f1()
  # we work with a DataFrame directly
  df = DataFrame([Vector{Float64}(undef, 10^4) for _ in 1:100], :auto,
    copycols=false)
  for col in eachcol(df)
    for idx in eachindex(col)
      col[idx] = rand()
    end
  end
  return df
end

"""
    f1a()

Same as `f1` but with the default `copycols=true`, so the constructor
copies every column vector (visible as the extra allocations in the
benchmark). Returns the filled data frame.
"""
function f1a()
  # we work with a DataFrame directly
  x = DataFrame([Vector{Float64}(undef, 10^4) for i in 1:100], :auto)
  for c in 1:ncol(x)
    d = x[!,c]
    for r in 1:nrow(x)
      d[r] = rand()
    end
  end
  # bug fix: the original fell off the end of the loop and returned
  # `nothing`; return the data frame to mirror f1()
  x
end

"""
    f2()

Delayed-creation variant: fill plain vectors first, and only build the
`DataFrame` once the work is done, so the hot loops run on concretely
typed `Vector{Float64}`s. `copycols=false` avoids copying the columns.
"""
function f2()
  cols = Vector{Any}(undef, 100)
  for j in eachindex(cols)
    buf = Vector{Float64}(undef, 10^4)
    for k in eachindex(buf)
      buf[k] = rand()
    end
    cols[j] = buf
  end
  # we delay creation of DataFrame after we have our job done
  return DataFrame(cols, :auto, copycols=false)
end

"""
    f2a()

Same delayed-creation pattern as `f2`, but with the default
`copycols=true`, so the constructor copies each column (the extra
allocations show up in the benchmark).
"""
function f2a()
  cols = Vector{Any}(undef, 100)
  for j in eachindex(cols)
    buf = Vector{Float64}(undef, 10^4)
    for k in eachindex(buf)
      buf[k] = rand()
    end
    cols[j] = buf
  end
  # we delay creation of DataFrame after we have our job done
  return DataFrame(cols, :auto)
end
Out[30]:
f2a (generic function with 1 method)
In [31]:
@btime f1();
@btime f1a();
@btime f2();
@btime f2a();
  28.599 ms (1949523 allocations: 37.40 MiB)
  30.679 ms (1949723 allocations: 45.04 MiB)
  4.525 ms (623 allocations: 7.66 MiB)
  5.422 ms (823 allocations: 15.30 MiB)

You can add rows to a DataFrame in place and it is fast

In [99]:
x = DataFrame(rand(10^6,5), :auto)
y = DataFrame(transpose(1.0:5.0), :auto)
z = [1.0:5.0;];
# creates a new DataFrame - slow
@btime vcat($x, $y)
# in place - fast
@btime append!($x,$y)

# reset to the same starting point
x = DataFrame(rand(10^6,5), :auto)
# add a single row in place - fast
@btime push!($x, $z)
  5.954 ms (153 allocations: 38.16 MiB)
  1.200 μs (17 allocations: 832 bytes)
  507.891 ns (16 allocations: 256 bytes)
Out[99]:

3,430,502 rows × 5 columns

x1x2x3x4x5
Float64Float64Float64Float64Float64
10.2259140.3703050.8493160.6037130.246239
20.5723760.07019890.2642310.713880.00452865
30.8963160.2385640.02156010.09499870.441804
40.02426790.06489940.06661140.465190.437336
50.02724420.6994690.7735810.385710.349716
60.1352650.8605760.03897260.2570760.19818
70.7843730.7647540.5802830.7008250.319894
80.2834320.9782560.4583690.3182210.572936
90.5929830.048140.5273760.0699380.5806
100.9377420.09430590.7709030.8581740.227221
In [100]:
transpose(1.0:5.0)
Out[100]:
1×5 transpose(::StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}}) with eltype Float64:
 1.0  2.0  3.0  4.0  5.0
In [101]:
DataFrame(transpose(1:10), :auto)
Out[101]:

1 rows × 10 columns

x1x2x3x4x5x6x7x8x9x10
Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64
112345678910
In [102]:
DataFrame([1:10...]', :auto)
Out[102]:

1 rows × 10 columns

x1x2x3x4x5x6x7x8x9x10
Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64
112345678910
In [103]:
[1:10...]'
Out[103]:
1×10 adjoint(::Vector{Int64}) with eltype Int64:
 1  2  3  4  5  6  7  8  9  10
In [104]:
[1:5...]'
Out[104]:
1×5 adjoint(::Vector{Int64}) with eltype Int64:
 1  2  3  4  5

Insert row in specific index

In [105]:
insert!.(eachcol(x),1,[1:5;]);
x
Out[105]:

3,430,503 rows × 5 columns

x1x2x3x4x5
Float64Float64Float64Float64Float64
11.02.03.04.05.0
20.2259140.3703050.8493160.6037130.246239
30.5723760.07019890.2642310.713880.00452865
40.8963160.2385640.02156010.09499870.441804
50.02426790.06489940.06661140.465190.437336
60.02724420.6994690.7735810.385710.349716
70.1352650.8605760.03897260.2570760.19818
80.7843730.7647540.5802830.7008250.319894
90.2834320.9782560.4583690.3182210.572936
100.5929830.048140.5273760.0699380.5806

Allowing missing as well as categorical slows down computations

In [106]:
using StatsBase

Uses countmap function to test performance

In [135]:
countmap(rand([1:10;],10000))
Out[135]:
Dict{Int64, Int64} with 10 entries:
  5  => 1015
  4  => 910
  6  => 1037
  7  => 1015
  2  => 959
  ⋮  => ⋮
In [107]:
"""
    test(data)

Benchmark `countmap` over 10^6 values sampled from `data`, comparing a
raw vector against its `categorical` conversion. Prints the element type
and the two `@btime` results; returns `nothing`.
"""
function test(data)
  println(eltype(data))
  raw = rand(data, 10^6)
  cat = categorical(raw)
  println(" raw:")
  @btime countmap($raw)
  println(" categorical:")
  @btime countmap($cat)
  return nothing
end
Out[107]:
test (generic function with 1 method)
In [108]:
test(1:10)
Int64
 raw:
  2.847 ms (7 allocations: 7.63 MiB)
 categorical:
  11.628 ms (4 allocations: 608 bytes)
In [109]:
test([randstring() for i in 1:10])
String
 raw:
  34.625 ms (4 allocations: 608 bytes)
 categorical:
  33.851 ms (4 allocations: 608 bytes)
In [136]:
test(allowmissing(1:10))
Union{Missing, Int64}
 raw:
  7.576 ms (4 allocations: 624 bytes)
 categorical:
  12.582 ms (4 allocations: 608 bytes)
In [137]:
test(allowmissing([randstring() for i in 1:10]))
Union{Missing, String}
 raw:
  28.941 ms (4 allocations: 608 bytes)
 categorical:
  30.359 ms (4 allocations: 608 bytes)

When aggregating use column selector and prefer integer, categorical, or pooled array grouping variable

In [138]:
df = DataFrame(x=rand('a':'d', 10^7), y=1)
Out[138]:

10,000,000 rows × 2 columns

xy
CharInt64
1d1
2c1
3c1
4b1
5c1
6c1
7c1
8d1
9a1
10c1
In [139]:
gdf = groupby(df,:x)
Out[139]:

GroupedDataFrame with 4 groups based on key: x

First Group (2498628 rows): x = 'd'

xy
CharInt64
1d1
2d1
3d1
4d1
5d1
6d1
7d1
8d1
9d1
10d1

Last Group (2500375 rows): x = 'a'

xy
CharInt64
1a1
2a1
3a1
4a1
5a1
6a1
7a1
8a1
9a1
10a1

Traditional syntax, slow

In [141]:
@btime combine(v->sum(v.y), $gdf)
  36.798 ms (369 allocations: 19.08 MiB)
Out[141]:

4 rows × 2 columns

xx1
CharInt64
1d2498628
2c2500812
3b2500185
4a2500375

Use column selector to speed up

In [142]:
@btime combine($gdf,:y=>sum)
  13.680 ms (234 allocations: 15.00 KiB)
Out[142]:

4 rows × 2 columns

xy_sum
CharInt64
1d2498628
2c2500812
3b2500185
4a2500375
In [143]:
transform!(df, :x=>categorical=>:x)
Out[143]:

10,000,000 rows × 2 columns

xy
Cat…Int64
1d1
2c1
3c1
4b1
5c1
6c1
7c1
8d1
9a1
10c1
In [144]:
gdf = groupby(df,:x)
Out[144]:

GroupedDataFrame with 4 groups based on key: x

First Group (2500375 rows): x = CategoricalValue{Char, UInt32} 'a'

xy
Cat…Int64
1a1
2a1
3a1
4a1
5a1
6a1
7a1
8a1
9a1
10a1

Last Group (2498628 rows): x = CategoricalValue{Char, UInt32} 'd'

xy
Cat…Int64
1d1
2d1
3d1
4d1
5d1
6d1
7d1
8d1
9d1
10d1
In [146]:
@btime combine($gdf, :y=>sum)
  13.754 ms (246 allocations: 15.88 KiB)
Out[146]:

4 rows × 2 columns

xy_sum
Cat…Int64
1a2500375
2b2500185
3c2500812
4d2498628
In [147]:
transform!(df, :x=>PooledArray{Char}=>:x)
Out[147]:

10,000,000 rows × 2 columns

xy
CharInt64
1d1
2c1
3c1
4b1
5c1
6c1
7c1
8d1
9a1
10c1
In [148]:
gdf = groupby(df,:x)
Out[148]:

GroupedDataFrame with 4 groups based on key: x

First Group (2498628 rows): x = 'd'

xy
CharInt64
1d1
2d1
3d1
4d1
5d1
6d1
7d1
8d1
9d1
10d1

Last Group (2500375 rows): x = 'a'

xy
CharInt64
1a1
2a1
3a1
4a1
5a1
6a1
7a1
8a1
9a1
10a1
In [149]:
@btime combine($gdf, :y=>sum)
  13.876 ms (236 allocations: 15.06 KiB)
Out[149]:

4 rows × 2 columns

xy_sum
CharInt64
1d2498628
2c2500812
3b2500185
4a2500375

Use views instead of materializing a new DataFrame

In [150]:
x = DataFrame(rand(100,1000), :auto)
Out[150]:

100 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
20.3511530.9582210.3587030.821720.1190420.3536340.205540.500595
30.1709140.5867710.5148040.9977870.4388340.4434330.7633810.785214
40.1775390.4232230.3278470.379910.316230.6006510.1262990.541596
50.1179020.4535150.750050.3571060.2808860.9316370.1279270.705181
60.6279090.00436420.6782120.1201330.04027480.7968370.01859350.577507
70.717590.3862770.09209440.4122150.8468340.8224710.8767060.552205
80.1066080.2425730.1689480.9847320.6180670.9995770.5374930.659392
90.7309680.267080.9817830.8914470.2521160.08549780.7941240.730028
100.7289560.6694610.3597740.41710.2956720.7952750.706950.864416
In [151]:
@btime $x[1:1,:]
  179.630 μs (2985 allocations: 190.69 KiB)
Out[151]:

1 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [152]:
@btime $x[1,:]
  22.695 ns (0 allocations: 0 bytes)
Out[152]:

DataFrameRow (1000 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [153]:
@btime view($x,1:1,:)
  22.620 ns (0 allocations: 0 bytes)
Out[153]:

1 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [154]:
@btime $x[1:1,1:20]
  4.047 μs (50 allocations: 4.16 KiB)
Out[154]:

1 rows × 20 columns (omitted printing of 12 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [155]:
@btime $x[1,1:20]
  24.756 ns (0 allocations: 0 bytes)
Out[155]:

DataFrameRow (20 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [156]:
@btime view($x,1:1,1:20)
  24.784 ns (0 allocations: 0 bytes)
Out[156]:

1 rows × 20 columns (omitted printing of 12 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [ ]: