Introduction to DataFrames

11_performance

DataFrame v1.2, Julia 1.6.1

In [11]:
using DataFrames
using Pipe
using Chain
using CategoricalArrays
using PooledArrays
using BenchmarkTools
using Random
using LinearAlgebra
In [3]:
# row가 10줄만 표시 되게 설정
ENV["LINES"] = 10
Out[3]:
10

Performance tips

Access by column number is faster than by name

In [2]:
x = DataFrame(rand(5,1000), :auto)
Out[2]:

5 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.03979150.4158820.976530.2533790.2196310.6188820.4361540.895649
20.1175710.4069940.9951620.1696460.08415980.08574340.2380270.870607
30.6518420.6018160.8073950.9462910.8879060.5580830.3885920.255522
40.5118180.8167550.3224320.7739360.3893020.5986620.9245420.114845
50.4481170.07196080.4241920.8269590.691310.2634920.5595470.677479
In [6]:
@btime $x[!,500]
@btime $x.x500;
  3.970 ns (0 allocations: 0 bytes)
  13.518 ns (0 allocations: 0 bytes)

When working with a DataFrame, use barrier functions or type annotations

In [23]:
xx = DataFrame(rand(1_000_000,2),:auto)
yy ,zz = xx[!,1],xx[!,2]
zz
Out[23]:
1000000-element Vector{Float64}:
 0.6589244269414078
 0.42719594738951416
 0.03351411006209215
 ⋮
 0.8169215747911429
 0.8637662308693475
In [8]:
"""
    f_bad()

Deliberately slow reference implementation: extract two columns from a
`DataFrame` and sum their elementwise products inside the same function.
Because `x[!, i]` on a `DataFrame` is only inferred as
`AbstractVector{T} where T`, the loop variables `y`, `z`, and `p` are
boxed as `Any` and every iteration allocates (see the `@code_warntype`
output below).
"""
function f_bad() # this function will be slow
  Random.seed!(1)
  # 1,000,000×2 data frame of random Float64 values; seeded so the
  # result is reproducible across the benchmark cells
  x = DataFrame(rand(1_000_000,2), :auto)
  # column types are not statically known here — this is the source of
  # the type instability
  y,z = x[!,1],x[!,2]
  p = 0.0
  for i in 1:nrow(x)
    p += y[i]*z[i]
  end
  p
end
Out[8]:
f_bad (generic function with 1 method)
In [10]:
@btime f_bad()
# if you run @code_warntype f_bad() then you notice
# that Julia does not know column types of `DataFrame`
  94.372 ms (5999013 allocations: 122.06 MiB)
Out[10]:
249788.77018817046

아래 그림에서 p,y,z 변수가 Any type으로 처리 되어 매우 느리게 된다.

image.png

In [24]:
@code_warntype f_bad()
Variables
  #self#::Core.Const(f_bad)
  @_2::Union{Nothing, Tuple{Int64, Int64}}
  p::Any
  z::AbstractVector{T} where T
  y::AbstractVector{T} where T
  x::DataFrame
  i::Int64

Body::Any
1 ─ %1  = Random.seed!::Core.Const(Random.seed!)
       (%1)(1)
 %3  = Main.rand(1000000, 2)::Matrix{Float64}
       (x = Main.DataFrame(%3, :auto))
 %5  = Base.getindex(x, Main.:!, 1)::AbstractVector{T} where T
 %6  = Base.getindex(x, Main.:!, 2)::AbstractVector{T} where T
       (y = %5)
       (z = %6)
       (p = 0.0)
 %10 = Main.nrow(x)::Int64
 %11 = (1:%10)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
       (@_2 = Base.iterate(%11))
 %13 = (@_2 === nothing)::Bool
 %14 = Base.not_int(%13)::Bool
└──       goto #4 if not %14
2 ┄ %16 = @_2::Tuple{Int64, Int64}::Tuple{Int64, Int64}
       (i = Core.getfield(%16, 1))
 %18 = Core.getfield(%16, 2)::Int64
 %19 = p::Any
 %20 = Base.getindex(y, i)::Any
 %21 = Base.getindex(z, i)::Any
 %22 = (%20 * %21)::Any
       (p = %19 + %22)
       (@_2 = Base.iterate(%11, %18))
 %25 = (@_2 === nothing)::Bool
 %26 = Base.not_int(%25)::Bool
└──       goto #4 if not %26
3 ─       goto #2
4 ┄       return p

Solution 1 is to use a barrier function (it should be possible to use it in almost any code)

In [12]:
"""
    f_inner(y, z)

Function barrier: sum the elementwise products of `y` and `z`
(an unrolled dot product). Because the columns are passed as arguments,
Julia compiles a specialized, type-stable method for their concrete types.

Throws `DimensionMismatch` if `y` and `z` do not share the same indices.
"""
function f_inner(y,z)
  p = 0.0
  # eachindex(y, z) is the idiomatic replacement for 1:length(y): it is
  # correct for any index style (e.g. offset arrays) and fails fast when
  # the two arrays disagree in length instead of hitting a BoundsError
  # mid-loop.
  for i in eachindex(y, z)
    p += y[i]*z[i]
  end
  return p
end

"""
    f_barrier()

Build a seeded 1,000,000×2 random data frame and delegate the hot loop
to `f_inner`, so the kernel is compiled for the concrete column types.
"""
function f_barrier() # extract the work to an inner function
  Random.seed!(1)
  df = DataFrame(rand(1_000_000, 2), :auto)
  return f_inner(df[!, 1], df[!, 2])
end

"""
    f_inbuilt()

Same computation as `f_barrier`, but using `LinearAlgebra.dot` instead
of a hand-written loop — the built-in acts as the function barrier.
"""
function f_inbuilt() # or use inbuilt function if possible
  Random.seed!(1)
  df = DataFrame(rand(1_000_000, 2), :auto)
  return dot(df[!, 1], df[!, 2])
end
Out[12]:
f_inbuilt (generic function with 1 method)
In [25]:
@code_warntype f_barrier()
Variables
  #self#::Core.Const(f_barrier)
  x::DataFrame

Body::Any
1 ─ %1 = Random.seed!::Core.Const(Random.seed!)
      (%1)(1)
 %3 = Main.rand(1000000, 2)::Matrix{Float64}
      (x = Main.DataFrame(%3, :auto))
 %5 = Base.getindex(x, Main.:!, 1)::AbstractVector{T} where T
 %6 = Base.getindex(x, Main.:!, 2)::AbstractVector{T} where T
 %7 = Main.f_inner(%5, %6)::Any
└──      return %7

아래 f_inbuilt는 warning이 없다.

In [26]:
@code_warntype f_inbuilt()
In [19]:
@btime f_barrier()
@btime f_inbuilt()
  5.981 ms (35 allocations: 30.52 MiB)
  5.161 ms (35 allocations: 30.52 MiB)
Out[19]:
249788.7701881836

Solution 2 is to provide the types of the extracted columns. It is simpler, but there are cases in which you will not know these types. This example assumes that you have a DataFrames master at least from August 31, 2018.

In [16]:
"""
    f_typed()

Same computation as `f_bad`, but the extracted columns are annotated as
`Vector{Float64}`, which pins the types of `y`, `z`, and `p` so the loop
compiles to fast, type-stable code (see the `@code_warntype` output
below). Simpler than a function barrier, but only usable when the column
types are known in advance.
"""
function f_typed()
  Random.seed!(1)
  x = DataFrame(rand(1_000_000,2),:auto)
  # the ::Vector{Float64} annotations insert convert/typeassert calls,
  # after which the compiler knows the concrete element types
  y::Vector{Float64}, z::Vector{Float64} = x[!,1], x[!,2]
  p = 0.0
  for i in 1:nrow(x)
    p += y[i]*z[i]
  end
  p
end
Out[16]:
f_typed (generic function with 1 method)
In [18]:
@btime f_typed()
  5.968 ms (35 allocations: 30.52 MiB)
Out[18]:
249788.77018817046

아래 그림에서 보면 p,y,z가 Float64 type으로 변수 타입이 확정되어 있다.\ 따라서 Any type 보다는 처리가 훨씬 빠르다\ image.png

In [27]:
@code_warntype f_typed()
Variables
  #self#::Core.Const(f_typed)
  @_2::Union{Nothing, Tuple{Int64, Int64}}
  p::Float64
  z::Vector{Float64}
  y::Vector{Float64}
  x::DataFrame
  i::Int64

Body::Float64
1 ─ %1  = Random.seed!::Core.Const(Random.seed!)
       (%1)(1)
 %3  = Main.rand(1000000, 2)::Matrix{Float64}
       (x = Main.DataFrame(%3, :auto))
 %5  = Base.getindex(x, Main.:!, 1)::AbstractVector{T} where T
 %6  = Base.getindex(x, Main.:!, 2)::AbstractVector{T} where T
 %7  = Core.apply_type(Main.Vector, Main.Float64)::Core.Const(Vector{Float64})
 %8  = Base.convert(%7, %5)::Any
       (y = Core.typeassert(%8, %7))
 %10 = Core.apply_type(Main.Vector, Main.Float64)::Core.Const(Vector{Float64})
 %11 = Base.convert(%10, %6)::Any
       (z = Core.typeassert(%11, %10))
       (p = 0.0)
 %14 = Main.nrow(x)::Int64
 %15 = (1:%14)::Core.PartialStruct(UnitRange{Int64}, Any[Core.Const(1), Int64])
       (@_2 = Base.iterate(%15))
 %17 = (@_2 === nothing)::Bool
 %18 = Base.not_int(%17)::Bool
└──       goto #4 if not %18
2 ┄ %20 = @_2::Tuple{Int64, Int64}::Tuple{Int64, Int64}
       (i = Core.getfield(%20, 1))
 %22 = Core.getfield(%20, 2)::Int64
 %23 = p::Float64
 %24 = Base.getindex(y, i)::Float64
 %25 = Base.getindex(z, i)::Float64
 %26 = (%24 * %25)::Float64
       (p = %23 + %26)
       (@_2 = Base.iterate(%15, %22))
 %29 = (@_2 === nothing)::Bool
 %30 = Base.not_int(%29)::Bool
└──       goto #4 if not %30
3 ─       goto #2
4 ┄       return p

In general, for tall and narrow tables it is often useful to use Tables.rowtable, Tables.columntable, or Tables.namedtupleiterator for intermediate processing of data in a type-stable way.

Consider using delayed DataFrame creation technique

also notice the difference in performance between copying vs non-copying data frame creation

In [30]:
"""
    f1()

Fill a 10^4×100 `DataFrame` in place, column by column, operating on the
data frame directly. `copycols=false` reuses the freshly allocated
vectors instead of copying them. Returns the filled data frame.
"""
function f1()
  # we work with a DataFrame directly
  df = DataFrame([Vector{Float64}(undef, 10^4) for _ in 1:100], :auto,
    copycols=false)
  for col in eachcol(df)
    for idx in eachindex(col)
      col[idx] = rand()
    end
  end
  return df
end

"""
    f1a()

Same as `f1` but with the default `copycols=true`, so the constructor
copies every column vector (visible as the extra allocations in the
benchmark). Returns the filled data frame.
"""
function f1a()
  # we work with a DataFrame directly
  x = DataFrame([Vector{Float64}(undef, 10^4) for i in 1:100], :auto)
  for c in 1:ncol(x)
    d = x[!,c]
    for r in 1:nrow(x)
      d[r] = rand()
    end
  end
  # bug fix: the original fell off the end of the loop and returned
  # `nothing`; return the data frame to mirror f1()
  x
end

"""
    f2()

Delayed-creation variant: fill plain vectors first, and only build the
`DataFrame` once the work is done, so the hot loops run on concretely
typed `Vector{Float64}`s. `copycols=false` avoids copying the columns.
"""
function f2()
  cols = Vector{Any}(undef, 100)
  for j in eachindex(cols)
    buf = Vector{Float64}(undef, 10^4)
    for k in eachindex(buf)
      buf[k] = rand()
    end
    cols[j] = buf
  end
  # we delay creation of DataFrame after we have our job done
  return DataFrame(cols, :auto, copycols=false)
end

"""
    f2a()

Same delayed-creation pattern as `f2`, but with the default
`copycols=true`, so the constructor copies each column (the extra
allocations show up in the benchmark).
"""
function f2a()
  cols = Vector{Any}(undef, 100)
  for j in eachindex(cols)
    buf = Vector{Float64}(undef, 10^4)
    for k in eachindex(buf)
      buf[k] = rand()
    end
    cols[j] = buf
  end
  # we delay creation of DataFrame after we have our job done
  return DataFrame(cols, :auto)
end
Out[30]:
f2a (generic function with 1 method)
In [31]:
@btime f1();
@btime f1a();
@btime f2();
@btime f2a();
  28.599 ms (1949523 allocations: 37.40 MiB)
  30.679 ms (1949723 allocations: 45.04 MiB)
  4.525 ms (623 allocations: 7.66 MiB)
  5.422 ms (823 allocations: 15.30 MiB)

You can add rows to a DataFrame in place and it is fast

In [99]:
x = DataFrame(rand(10^6,5), :auto)
y = DataFrame(transpose(1.0:5.0), :auto)
z = [1.0:5.0;];
# creates a new DataFrame - slow
@btime vcat($x, $y)
# in place - fast
@btime append!($x,$y)

# reset to the same starting point
x = DataFrame(rand(10^6,5), :auto)
# add a single row in place - fast
@btime push!($x, $z)
  5.954 ms (153 allocations: 38.16 MiB)
  1.200 μs (17 allocations: 832 bytes)
  507.891 ns (16 allocations: 256 bytes)
Out[99]:

3,430,502 rows × 5 columns

x1x2x3x4x5
Float64Float64Float64Float64Float64
10.2259140.3703050.8493160.6037130.246239
20.5723760.07019890.2642310.713880.00452865
30.8963160.2385640.02156010.09499870.441804
40.02426790.06489940.06661140.465190.437336
50.02724420.6994690.7735810.385710.349716
60.1352650.8605760.03897260.2570760.19818
70.7843730.7647540.5802830.7008250.319894
80.2834320.9782560.4583690.3182210.572936
90.5929830.048140.5273760.0699380.5806
100.9377420.09430590.7709030.8581740.227221
In [100]:
transpose(1.0:5.0)
Out[100]:
1×5 transpose(::StepRangeLen{Float64, Base.TwicePrecision{Float64}, Base.TwicePrecision{Float64}}) with eltype Float64:
 1.0  2.0  3.0  4.0  5.0
In [101]:
DataFrame(transpose(1:10), :auto)
Out[101]:

1 rows × 10 columns

x1x2x3x4x5x6x7x8x9x10
Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64
112345678910
In [102]:
DataFrame([1:10...]', :auto)
Out[102]:

1 rows × 10 columns

x1x2x3x4x5x6x7x8x9x10
Int64Int64Int64Int64Int64Int64Int64Int64Int64Int64
112345678910
In [103]:
[1:10...]'
Out[103]:
1×10 adjoint(::Vector{Int64}) with eltype Int64:
 1  2  3  4  5  6  7  8  9  10
In [104]:
[1:5...]'
Out[104]:
1×5 adjoint(::Vector{Int64}) with eltype Int64:
 1  2  3  4  5

Insert row in specific index

In [105]:
insert!.(eachcol(x),1,[1:5;]);
x
Out[105]:

3,430,503 rows × 5 columns

x1x2x3x4x5
Float64Float64Float64Float64Float64
11.02.03.04.05.0
20.2259140.3703050.8493160.6037130.246239
30.5723760.07019890.2642310.713880.00452865
40.8963160.2385640.02156010.09499870.441804
50.02426790.06489940.06661140.465190.437336
60.02724420.6994690.7735810.385710.349716
70.1352650.8605760.03897260.2570760.19818
80.7843730.7647540.5802830.7008250.319894
90.2834320.9782560.4583690.3182210.572936
100.5929830.048140.5273760.0699380.5806

Allowing missing as well as categorical slows down computations

In [106]:
using StatsBase

Uses countmap function to test performance

In [135]:
countmap(rand([1:10;],10000))
Out[135]:
Dict{Int64, Int64} with 10 entries:
  5  => 1015
  4  => 910
  6  => 1037
  7  => 1015
  2  => 959
  ⋮  => ⋮
In [107]:
"""
    test(data)

Benchmark `countmap` over 10^6 values sampled from `data`, comparing a
raw vector against its `categorical` conversion. Prints the element type
and the two `@btime` results; returns `nothing`.
"""
function test(data)
  println(eltype(data))
  raw = rand(data, 10^6)
  cat = categorical(raw)
  println(" raw:")
  @btime countmap($raw)
  println(" categorical:")
  @btime countmap($cat)
  return nothing
end
Out[107]:
test (generic function with 1 method)
In [108]:
test(1:10)
Int64
 raw:
  2.847 ms (7 allocations: 7.63 MiB)
 categorical:
  11.628 ms (4 allocations: 608 bytes)
In [109]:
test([randstring() for i in 1:10])
String
 raw:
  34.625 ms (4 allocations: 608 bytes)
 categorical:
  33.851 ms (4 allocations: 608 bytes)
In [136]:
test(allowmissing(1:10))
Union{Missing, Int64}
 raw:
  7.576 ms (4 allocations: 624 bytes)
 categorical:
  12.582 ms (4 allocations: 608 bytes)
In [137]:
test(allowmissing([randstring() for i in 1:10]))
Union{Missing, String}
 raw:
  28.941 ms (4 allocations: 608 bytes)
 categorical:
  30.359 ms (4 allocations: 608 bytes)

When aggregating use column selector and prefer integer, categorical, or pooled array grouping variable

In [138]:
df = DataFrame(x=rand('a':'d', 10^7), y=1)
Out[138]:

10,000,000 rows × 2 columns

xy
CharInt64
1d1
2c1
3c1
4b1
5c1
6c1
7c1
8d1
9a1
10c1
In [139]:
gdf = groupby(df,:x)
Out[139]:

GroupedDataFrame with 4 groups based on key: x

First Group (2498628 rows): x = 'd'

xy
CharInt64
1d1
2d1
3d1
4d1
5d1
6d1
7d1
8d1
9d1
10d1

Last Group (2500375 rows): x = 'a'

xy
CharInt64
1a1
2a1
3a1
4a1
5a1
6a1
7a1
8a1
9a1
10a1

Traditional syntax, slow

In [141]:
@btime combine(v->sum(v.y), $gdf)
  36.798 ms (369 allocations: 19.08 MiB)
Out[141]:

4 rows × 2 columns

xx1
CharInt64
1d2498628
2c2500812
3b2500185
4a2500375

Use column selector to speed up

In [142]:
@btime combine($gdf,:y=>sum)
  13.680 ms (234 allocations: 15.00 KiB)
Out[142]:

4 rows × 2 columns

xy_sum
CharInt64
1d2498628
2c2500812
3b2500185
4a2500375
In [143]:
transform!(df, :x=>categorical=>:x)
Out[143]:

10,000,000 rows × 2 columns

xy
Cat…Int64
1d1
2c1
3c1
4b1
5c1
6c1
7c1
8d1
9a1
10c1
In [144]:
gdf = groupby(df,:x)
Out[144]:

GroupedDataFrame with 4 groups based on key: x

First Group (2500375 rows): x = CategoricalValue{Char, UInt32} 'a'

xy
Cat…Int64
1a1
2a1
3a1
4a1
5a1
6a1
7a1
8a1
9a1
10a1

Last Group (2498628 rows): x = CategoricalValue{Char, UInt32} 'd'

xy
Cat…Int64
1d1
2d1
3d1
4d1
5d1
6d1
7d1
8d1
9d1
10d1
In [146]:
@btime combine($gdf, :y=>sum)
  13.754 ms (246 allocations: 15.88 KiB)
Out[146]:

4 rows × 2 columns

xy_sum
Cat…Int64
1a2500375
2b2500185
3c2500812
4d2498628
In [147]:
transform!(df, :x=>PooledArray{Char}=>:x)
Out[147]:

10,000,000 rows × 2 columns

xy
CharInt64
1d1
2c1
3c1
4b1
5c1
6c1
7c1
8d1
9a1
10c1
In [148]:
gdf = groupby(df,:x)
Out[148]:

GroupedDataFrame with 4 groups based on key: x

First Group (2498628 rows): x = 'd'

xy
CharInt64
1d1
2d1
3d1
4d1
5d1
6d1
7d1
8d1
9d1
10d1

Last Group (2500375 rows): x = 'a'

xy
CharInt64
1a1
2a1
3a1
4a1
5a1
6a1
7a1
8a1
9a1
10a1
In [149]:
@btime combine($gdf, :y=>sum)
  13.876 ms (236 allocations: 15.06 KiB)
Out[149]:

4 rows × 2 columns

xy_sum
CharInt64
1d2498628
2c2500812
3b2500185
4a2500375

Use views instead of materializing a new DataFrame

In [150]:
x = DataFrame(rand(100,1000), :auto)
Out[150]:

100 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
20.3511530.9582210.3587030.821720.1190420.3536340.205540.500595
30.1709140.5867710.5148040.9977870.4388340.4434330.7633810.785214
40.1775390.4232230.3278470.379910.316230.6006510.1262990.541596
50.1179020.4535150.750050.3571060.2808860.9316370.1279270.705181
60.6279090.00436420.6782120.1201330.04027480.7968370.01859350.577507
70.717590.3862770.09209440.4122150.8468340.8224710.8767060.552205
80.1066080.2425730.1689480.9847320.6180670.9995770.5374930.659392
90.7309680.267080.9817830.8914470.2521160.08549780.7941240.730028
100.7289560.6694610.3597740.41710.2956720.7952750.706950.864416
In [151]:
@btime $x[1:1,:]
  179.630 μs (2985 allocations: 190.69 KiB)
Out[151]:

1 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [152]:
@btime $x[1,:]
  22.695 ns (0 allocations: 0 bytes)
Out[152]:

DataFrameRow (1000 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [153]:
@btime view($x,1:1,:)
  22.620 ns (0 allocations: 0 bytes)
Out[153]:

1 rows × 1,000 columns (omitted printing of 992 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [154]:
@btime $x[1:1,1:20]
  4.047 μs (50 allocations: 4.16 KiB)
Out[154]:

1 rows × 20 columns (omitted printing of 12 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [155]:
@btime $x[1,1:20]
  24.756 ns (0 allocations: 0 bytes)
Out[155]:

DataFrameRow (20 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [156]:
@btime view($x,1:1,1:20)
  24.784 ns (0 allocations: 0 bytes)
Out[156]:

1 rows × 20 columns (omitted printing of 12 columns)

x1x2x3x4x5x6x7x8
Float64Float64Float64Float64Float64Float64Float64Float64
10.6118640.9341230.02884920.8245650.3027440.499220.8730750.946012
In [ ]: