using DataFrames
using Pipe
using Chain
using CategoricalArrays
using PooledArrays
using BenchmarkTools
using Random
using LinearAlgebra
# Limit the display to 10 rows
ENV["LINES"] = 10
# Wide table (5 rows, 1000 columns) used to compare two ways of reading a column
x = DataFrame(rand(5,1000), :auto)
# access the column by its number
@btime $x[!,500]
# access the same column by its name
@btime $x.x500;
# With a `DataFrame`, use barrier functions or type annotation
# Two-column table of random numbers; the columns are pulled out as globals for inspection
xx = DataFrame(rand(1_000_000, 2), :auto)
yy, zz = xx[!, 1], xx[!, 2]
zz
"""
    f_bad()

Deliberately slow example. Seed the global RNG with 1, build a 1_000_000×2
`DataFrame` of random numbers and return the sum of the element-wise products
of its first two columns.

The columns are extracted and looped over inside the same function, so Julia
does not know the column types of the `DataFrame` while compiling the loop.
"""
function f_bad() # this function will be slow
    Random.seed!(1)
    x = DataFrame(rand(1_000_000, 2), :auto)
    y = x[!, 1]
    z = x[!, 2]
    p = 0.0
    for i in 1:size(x, 1)
        p += y[i] * z[i]
    end
    return p
end
@btime f_bad()
# if you run @code_warntype f_bad() then you notice
# that Julia does not know column types of `DataFrame`
# In the output below the variables p, y and z are treated as type `Any`,
# which makes the function very slow.
@code_warntype f_bad()
# Solution 1 is to use a barrier function (it should be possible to use it in almost any code)
"""
    f_inner(y, z)

Return the sum of `y[i] * z[i]` over the positions of `y`, accumulated from
left to right starting at `0.0`. Elements of `z` beyond the length of `y`
are ignored.
"""
function f_inner(y, z)
    acc = 0.0
    for (i, yi) in enumerate(y)
        acc += yi * z[i]
    end
    return acc
end
"""
    f_barrier()

Seed the global RNG with 1, build a 1_000_000×2 `DataFrame` of random numbers
and hand its first two columns to `f_inner`, returning that result.
"""
function f_barrier() # extract the work to an inner function
    Random.seed!(1)
    x = DataFrame(rand(1_000_000, 2), :auto)
    first_col = x[!, 1]
    second_col = x[!, 2]
    return f_inner(first_col, second_col)
end
"""
    f_inbuilt()

Seed the global RNG with 1, build a 1_000_000×2 `DataFrame` of random numbers
and return `dot` of its first two columns.
"""
function f_inbuilt() # or use inbuilt function if possible
    Random.seed!(1)
    x = DataFrame(rand(1_000_000, 2), :auto)
    first_col = x[!, 1]
    second_col = x[!, 2]
    return dot(first_col, second_col)
end
@code_warntype f_barrier()
# NOTE: the original notebook states that f_inbuilt (below) shows no warnings;
# confirm against the actual output.
# `f_inner` takes two arguments, so it cannot be inspected with a zero-argument call;
# the zero-argument function the note refers to is inspected instead.
@code_warntype f_inbuilt()
@btime f_barrier()
@btime f_inbuilt()
# Solution 2 is to provide the types of the extracted columns.
# It is simpler, but there are cases in which you will not know these types.
# This example assumes that you have DataFrames master at least from August 31, 2018.
"""
    f_typed()

Seed the global RNG with 1, build a 1_000_000×2 `DataFrame` of random numbers
and return the sum of the element-wise products of its first two columns.

The extracted columns are declared as `Vector{Float64}` before the loop runs.
"""
function f_typed()
    Random.seed!(1)
    x = DataFrame(rand(1_000_000, 2), :auto)
    # annotate each extracted column with its concrete type
    y::Vector{Float64} = x[!, 1]
    z::Vector{Float64} = x[!, 2]
    p = 0.0
    for i in 1:size(x, 1)
        p += y[i] * z[i]
    end
    return p
end
@btime f_typed()
# In the output below the types of p, y and z are fixed to Float64-based types,
# so processing is much faster than with type `Any`.
@code_warntype f_typed()
# In general, for tall and narrow tables it is often useful to use `Tables.rowtable`,
# `Tables.columntable` or `Tables.namedtupleiterator` for intermediate processing
# of data in a type-stable way.
# `DataFrame` creation technique
# Also notice the difference in performance between copying and non-copying data frame creation.
"""
    f1()

Create a 100-column `DataFrame` from uninitialised length-`10^4` `Float64`
vectors (passing `copycols=false`), then fill every cell with `rand()` by
going through the data frame column by column. Return the data frame.
"""
function f1()
    # we work with a DataFrame directly
    columns = [Vector{Float64}(undef, 10^4) for _ in 1:100]
    x = DataFrame(columns, :auto; copycols=false)
    for col in 1:ncol(x)
        target = x[!, col]
        for row in 1:nrow(x)
            target[row] = rand()
        end
    end
    return x
end
"""
    f1a()

Create a 100-column `DataFrame` from uninitialised length-`10^4` `Float64`
vectors using the constructor's default `copycols` setting, then fill every
cell with `rand()` by going through the data frame column by column.

Return the filled data frame, matching `f1`, `f2` and `f2a`, so that all four
benchmarked variants produce the same kind of result.
"""
function f1a()
    # we work with a DataFrame directly
    x = DataFrame([Vector{Float64}(undef, 10^4) for i in 1:100], :auto)
    for c in 1:ncol(x)
        d = x[!, c]
        for r in 1:nrow(x)
            d[r] = rand()
        end
    end
    # without this the function would return `nothing` (the value of the `for` loop)
    return x
end
"""
    f2()

Fill 100 `Float64` vectors of length `10^4` with `rand()` first, collecting
them in a `Vector{Any}`, and only then wrap them in a `DataFrame` with
`copycols=false`. Return the data frame.
"""
function f2()
    filled = Vector{Any}(undef, 100)
    for col in eachindex(filled)
        column = Vector{Float64}(undef, 10^4)
        for row in eachindex(column)
            column[row] = rand()
        end
        filled[col] = column
    end
    # we delay creation of DataFrame after we have our job done
    return DataFrame(filled, :auto; copycols=false)
end
"""
    f2a()

Fill 100 `Float64` vectors of length `10^4` with `rand()` first, collecting
them in a `Vector{Any}`, and only then wrap them in a `DataFrame` using the
constructor's default `copycols` setting. Return the data frame.
"""
function f2a()
    filled = Vector{Any}(undef, 100)
    for col in eachindex(filled)
        column = Vector{Float64}(undef, 10^4)
        for row in eachindex(column)
            column[row] = rand()
        end
        filled[col] = column
    end
    # we delay creation of DataFrame after we have our job done
    return DataFrame(filled, :auto)
end
# Time the four creation variants.
# f1 / f1a fill the columns through an existing DataFrame;
# f2 / f2a fill plain vectors and build the DataFrame at the end.
# f1 and f2 pass copycols=false; f1a and f2a use the constructor default.
@btime f1();
@btime f1a();
@btime f2();
@btime f2a();
# `DataFrame`: rows can be added in place and it is fast
x = DataFrame(rand(10^6, 5), :auto)
y = DataFrame(transpose(1.0:5.0), :auto)
z = [1.0:5.0;];
# creates a new DataFrame - slow
@btime vcat($x, $y)
# in place - fast
# (each benchmark evaluation really appends, so `x` keeps growing while this runs)
@btime append!($x, $y)
# reset to the same starting point
x = DataFrame(rand(10^6, 5), :auto)
# add a single row in place - fast
@btime push!($x, $z)
# Scratch expressions: different ways to turn a range into a single-row matrix / DataFrame
transpose(1.0:5.0)
DataFrame(transpose(1:10), :auto)
DataFrame(collect(1:10)', :auto)
collect(1:10)'
collect(1:5)'
# Insert a row at a specific index
# Broadcast `insert!` over the columns: the k-th column gets the value k at position 1
insert!.(eachcol(x), 1, 1:5);
x
# `missing` as well as `categorical` slows down computations
using StatsBase
# Uses the `countmap` function to test performance
countmap(rand([1:10;], 10000))
"""
    test(data)

Print the element type of `data`, draw `10^6` random elements from it, and
benchmark `countmap` on the raw sample and on its `categorical` version.
Timings are printed; the function returns `nothing`.
"""
function test(data)
    println(eltype(data))
    raw = rand(data, 10^6)
    cat = categorical(raw)
    println(" raw:")
    @btime countmap($raw)
    println(" categorical:")
    @btime countmap($cat)
    return nothing
end
# Run the countmap comparison on four kinds of input.
# integer range
test(1:10)
# ten random strings
test([randstring() for i in 1:10])
# the same integer range passed through `allowmissing`
test(allowmissing(1:10))
# ten random strings passed through `allowmissing`
test(allowmissing([randstring() for i in 1:10]))
# 10^7 rows grouped by a Char column with values 'a' to 'd'
df = DataFrame(x=rand('a':'d', 10^7), y=1)
gdf = groupby(df, :x)
# Traditional syntax, slow
@btime combine(v -> sum(v.y), $gdf)
# Use column selector to speed up
@btime combine($gdf, :y => sum)
# same aggregation after converting the grouping column with `categorical`
transform!(df, :x => categorical => :x)
gdf = groupby(df, :x)
@btime combine($gdf, :y => sum)
# same aggregation after converting the grouping column to a `PooledArray{Char}`
transform!(df, :x => PooledArray{Char} => :x)
gdf = groupby(df, :x)
@btime combine($gdf, :y => sum)
# Compare indexing with `view` on a 100×1000 table
x = DataFrame(rand(100,1000), :auto)
# all columns: row range, single row index, explicit view
@btime $x[1:1,:]
@btime $x[1,:]
@btime view($x,1:1,:)
# first 20 columns only: row range, single row index, explicit view
@btime $x[1:1,1:20]
@btime $x[1,1:20]
@btime view($x,1:1,1:20)