Introduction to DataFrames¶

DataFrame v1.2, Julia 1.6.1

Possible pitfalls¶

Know what is copied when creating a `DataFrame`¶

using DataFrames,Pipe, Chain
using BenchmarkTools

# 3×5 data frame of random numbers; `:auto` generates the column names x1, …, x5
x = DataFrame(rand(3, 5), :auto)

y = copy(x)
x === y # not the same object

false

# building a DataFrame from another DataFrame produces a new object ...
y = DataFrame(x)

x === y # ... so the identity check is false

false

# compare every column: iterate over 1:ncol(x), not over the single number ncol(x)
any(x[!, i] === y[!, i] for i in 1:ncol(x)) # the columns are also not the same

false

# with copycols=false the result is still a distinct DataFrame object
y = DataFrame(x,copycols=false)
x === y

false

# compare every column: iterate over 1:ncol(x), not over the single number ncol(x)
all(x[!, i] === y[!, i] for i in 1:ncol(x)) # the columns are the same

true

# compare every column: iterate over 1:ncol(x), not over the single number ncol(x)
any(x[!, i] === y[!, i] for i in 1:ncol(x)) # at least one column is shared (here all of them are)

true

# the same applies when a data frame is created with keyword-argument syntax
x = 1:3
y = [1, 2, 3]
df = DataFrame(x = x, y = y)

y === df.y # different object

false

typeof(x), typeof(df.x) # range is converted to vector

(UnitRange{Int64}, Vector{Int64})

Slicing rows always creates a copy

y === df[:,:y]

false

You can avoid copying by passing the `copycols=false` keyword argument to functions that support it.

df = DataFrame(x=x, y=y, copycols=false)

y === df.y # now it is the same

true

# `$df` interpolates the global into the benchmark so that access to an untyped
# global variable is not part of the measurement
@btime select($df,:y)[!,1]

  671.210 ns (17 allocations: 1.70 KiB)

3-element Vector{Int64}:
 1
 2
 3

# the same benchmark written with `@pipe`; `_` marks where the piped value is inserted
@btime @pipe select($df, :y) |> _[!,1]

  671.630 ns (17 allocations: 1.70 KiB)

3-element Vector{Int64}:
 1
 2
 3

select(df,:y)[!,1] === y # not the same

false

select(df, :y, copycols=false)[!,1] === y # the same

true

Do not modify the parent of `GroupedDataFrame` or `view`¶

# inner=3 repeats each element in place: id = 1,1,1,2,2,2
x = DataFrame(id=repeat([1,2],inner=3),x=1:6)

# outer=3 repeats the whole vector: id = 1,2,1,2,1,2 (this definition replaces the one above)
x = DataFrame(id=repeat([1,2],outer=3),x=1:6)

# group by :id using the values x holds at this point
g = groupby(x, :id)

# overwrite the first three values of the grouping column in the parent, after grouping
x[1:3,1] = [2,2,2]
x

Well, `g` is wrong now: it is only a view of `x`, so it still uses the grouping computed before `x` was modified

g

# s refers to rows 5:6 of the parent x
s = view(x,5:6,:)

# removing rows 3:6 leaves x with only 2 rows, so the rows s points at no longer exist
delete!(x,3:6)

# displaying s now fails with a BoundsError
s

BoundsError: attempt to access 2-element Vector{Int64} at index [5:6]

Stacktrace:
  [1] throw_boundserror(A::Vector{Int64}, I::Tuple{UnitRange{Int64}})
    @ Base ./abstractarray.jl:651
  [2] checkbounds
    @ ./abstractarray.jl:616 [inlined]
  [3] view(A::Vector{Int64}, I::UnitRange{Int64})
    @ Base ./subarray.jl:177
  [4] view
    @ ~/.julia/packages/DataFrames/vuMM8/src/subdataframe/subdataframe.jl:129 [inlined]
  [5] getindex
    @ ~/.julia/packages/DataFrames/vuMM8/src/subdataframe/subdataframe.jl:166 [inlined]
  [6] getindex
    @ ~/.julia/packages/DataFrames/vuMM8/src/abstractdataframe/iteration.jl:200 [inlined]
  [7] iterate(itr::DataFrames.DataFrameColumns{SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}}}, i::Int64) (repeats 2 times)
    @ DataFrames ~/.julia/packages/DataFrames/vuMM8/src/abstractdataframe/iteration.jl:198
  [8] _show(io::IOContext{IOBuffer}, df::SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}}; allrows::Bool, allcols::Bool, rowlabel::Symbol, summary::Bool, eltypes::Bool, rowid::Nothing, truncate::Int64, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ DataFrames ~/.julia/packages/DataFrames/vuMM8/src/abstractdataframe/show.jl:167
  [9] #show#692
    @ ~/.julia/packages/DataFrames/vuMM8/src/abstractdataframe/show.jl:348 [inlined]
 [10] show(io::IOContext{IOBuffer}, df::SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}})
    @ DataFrames ~/.julia/packages/DataFrames/vuMM8/src/abstractdataframe/show.jl:348
 [11] #show#707
    @ ~/.julia/packages/DataFrames/vuMM8/src/abstractdataframe/io.jl:138 [inlined]
 [12] show
    @ ~/.julia/packages/DataFrames/vuMM8/src/abstractdataframe/io.jl:138 [inlined]
 [13] limitstringmime(mime::MIME{Symbol("text/plain")}, x::SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}})
    @ IJulia ~/.julia/packages/IJulia/e8kqU/src/inline.jl:43
 [14] display_mimestring
    @ ~/.julia/packages/IJulia/e8kqU/src/display.jl:71 [inlined]
 [15] display_dict(x::SubDataFrame{DataFrame, DataFrames.Index, UnitRange{Int64}})
    @ IJulia ~/.julia/packages/IJulia/e8kqU/src/display.jl:102
 [16] #invokelatest#2
    @ ./essentials.jl:708 [inlined]
 [17] invokelatest
    @ ./essentials.jl:706 [inlined]
 [18] execute_request(socket::ZMQ.Socket, msg::IJulia.Msg)
    @ IJulia ~/.julia/packages/IJulia/e8kqU/src/execute_request.jl:112
 [19] #invokelatest#2
    @ ./essentials.jl:708 [inlined]
 [20] invokelatest
    @ ./essentials.jl:706 [inlined]
 [21] eventloop(socket::ZMQ.Socket)
    @ IJulia ~/.julia/packages/IJulia/e8kqU/src/eventloop.jl:8
 [22] (::IJulia.var"#15#18")()
    @ IJulia ./task.jl:411

Single column selection for `DataFrame` creates aliases with `!` and `getproperty` syntax and copies with `:`¶

x = DataFrame(a=1:3)
x.b = x[!,1] # alias: shares its data with column a
x.c = x[:,1] # copy
x.d = x[!,1][:] # copy
x.e = copy(x[!,1]) # explicit copy
display(x)
x[1,1] = 100
# The message printed below is Korean for: "column x.b is an alias of x.a, so it is
# also changed when x[1,1] is set to 100".
"x.b column은 x.a의 alias로써 x[1,1]=100 으로 변경된 경우 x.b도 변경됨" |> println
display(x)

x.b column은 x.a의 alias로써 x[1,1]=100 으로 변경된 경우 x.b도 변경됨

When iterating rows of a data frame use `eachrow` to avoid compilation cost(wide tables), but `Tables.namedtupleiterator` for fast execution(tall table)¶

this table is wide

# 900 columns of 2 rows each; every column is picked at random from four ranges with
# different element types, so the table is wide and heterogeneous
df1 = DataFrame([rand([1:2,'a':'b',false:true,1.0:2.0]) for i in 1:900], :auto)

# collect the rows as DataFrameRow objects
@time collect(eachrow(df1))

  0.000013 seconds (4 allocations: 256 bytes)

2-element Vector{DataFrameRow}:
 DataFrameRow
 Row │ x1       x2       x3     x4       x5    x6     x7     x8     x9       x ⋯
     │ Float64  Float64  Int64  Float64  Char  Int64  Int64  Bool   Float64  C ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │     1.0      1.0      1      1.0  a         1      1  false      1.0  a ⋯
                                                             891 columns omitted
 DataFrameRow
 Row │ x1       x2       x3     x4       x5    x6     x7     x8    x9       x1 ⋯
     │ Float64  Float64  Int64  Float64  Char  Int64  Int64  Bool  Float64  Ch ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   2 │     2.0      2.0      2      2.0  b         2      2  true      2.0  b  ⋯
                                                             891 columns omitted

@time collect(Tables.namedtupleiterator(df1));

  0.004972 seconds (5.28 k allocations: 12.925 MiB)

As you can see, the time to compile `Tables.namedtupleiterator` is very large in this case, and it would get much worse if the table were wider (that is why we include this tip in the pitfalls notebook).

the table below is tall

# 10^6 rows and 10 Float64 columns: a tall, homogeneous table
df2 = DataFrame(rand(10^6, 10),:auto)

# row-wise sum via eachrow; the cell is run twice so the second timing excludes compilation
@time map(sum,eachrow(df2))

  2.338953 seconds (60.18 M allocations: 1.061 GiB, 12.63% gc time, 5.80% compilation time)

1000000-element Vector{Float64}:
 2.704071523015175
 4.411682447713391
 5.921627951565904
 3.643846052647391
 5.511158451988555
 5.238299046906927
 5.361935745226174
 4.07640255971654
 4.188105980469235
 4.420129273862483
 4.577410055812212
 5.077443374765336
 5.199538366982413
 ⋮
 5.143906356321334
 3.1391696688346715
 6.28315535477638
 5.5665712737566295
 3.2598167419720845
 5.491525479182397
 5.781255375508597
 2.4901351267964995
 5.751675463507496
 4.2993837933271
 4.888116627432323
 4.240389765946817

@time map(sum,eachrow(df2))

  1.956384 seconds (59.99 M allocations: 1.050 GiB, 4.18% gc time)

1000000-element Vector{Float64}:
 2.704071523015175
 4.411682447713391
 5.921627951565904
 3.643846052647391
 5.511158451988555
 5.238299046906927
 5.361935745226174
 4.07640255971654
 4.188105980469235
 4.420129273862483
 4.577410055812212
 5.077443374765336
 5.199538366982413
 ⋮
 5.143906356321334
 3.1391696688346715
 6.28315535477638
 5.5665712737566295
 3.2598167419720845
 5.491525479182397
 5.781255375508597
 2.4901351267964995
 5.751675463507496
 4.2993837933271
 4.888116627432323
 4.240389765946817

@time map(sum,Tables.namedtupleiterator(df2))

  0.280857 seconds (510.88 k allocations: 38.592 MiB, 3.12% gc time, 94.90% compilation time)

1000000-element Vector{Float64}:
 2.704071523015175
 4.411682447713391
 5.921627951565904
 3.643846052647391
 5.511158451988555
 5.238299046906927
 5.361935745226174
 4.07640255971654
 4.188105980469235
 4.420129273862483
 4.577410055812212
 5.077443374765336
 5.199538366982413
 ⋮
 5.143906356321334
 3.1391696688346715
 6.28315535477638
 5.5665712737566295
 3.2598167419720845
 5.491525479182397
 5.781255375508597
 2.4901351267964995
 5.751675463507496
 4.2993837933271
 4.888116627432323
 4.240389765946817

@time map(sum,Tables.namedtupleiterator(df2))

  0.016954 seconds (17 allocations: 7.631 MiB)

1000000-element Vector{Float64}:
 2.704071523015175
 4.411682447713391
 5.921627951565904
 3.643846052647391
 5.511158451988555
 5.238299046906927
 5.361935745226174
 4.07640255971654
 4.188105980469235
 4.420129273862483
 4.577410055812212
 5.077443374765336
 5.199538366982413
 ⋮
 5.143906356321334
 3.1391696688346715
 6.28315535477638
 5.5665712737566295
 3.2598167419720845
 5.491525479182397
 5.781255375508597
 2.4901351267964995
 5.751675463507496
 4.2993837933271
 4.888116627432323
 4.240389765946817

As you can see, this time it is much faster to iterate a type-stable container.

	x1	x2	x3	x4	x5
	Float64	Float64	Float64	Float64	Float64
1	0.517607	0.996842	0.603599	0.231084	0.537229
2	0.853228	0.35896	0.473605	0.0554781	0.196862
3	0.867323	0.982273	0.515127	0.829447	0.0107014

	x1	x2	x3	x4	x5
	Float64	Float64	Float64	Float64	Float64
1	0.517607	0.996842	0.603599	0.231084	0.537229
2	0.853228	0.35896	0.473605	0.0554781	0.196862
3	0.867323	0.982273	0.515127	0.829447	0.0107014

	x1	x2	x3	x4	x5	x6	x7
	Float64	Float64	Float64	Float64	Float64	Float64	Float64
1	0.193108	0.322283	0.232487	0.0667451	0.099417	0.0999391	0.219174
2	0.0980823	0.372884	0.427947	0.765323	0.642446	0.898155	0.412915
3	0.911935	0.604847	0.0628406	0.87118	0.333064	0.94344	0.636854
4	0.208131	0.975856	0.41342	0.271623	0.549517	0.443459	0.299266
5	0.708572	0.7562	0.664232	0.260509	0.826777	0.369361	0.213487
6	0.426392	0.961442	0.0728744	0.535804	0.374699	0.750084	0.132052
7	0.853799	0.593091	0.724057	0.597194	0.513078	0.702625	0.374183
8	0.338775	0.763346	0.792493	0.72292	0.283221	0.116449	0.103614
9	0.222792	0.715321	0.135947	0.214149	0.927037	0.730422	0.660307
10	0.044216	0.152689	0.949124	0.266609	0.941206	0.98287	0.0150329
11	0.614781	0.00373096	0.678052	0.269892	0.0722659	0.263693	0.879764
12	0.526007	0.0669536	0.290049	0.39458	0.487458	0.841616	0.961109
13	0.463543	0.195005	0.920027	0.395063	0.661153	0.659275	0.449859
14	0.467721	0.209189	0.0156128	0.889284	0.23422	0.0532921	0.596516
15	0.845852	0.338196	0.00677797	0.00759356	0.00205555	0.140243	0.616174
16	0.575429	0.331434	0.88557	0.598161	0.78553	0.0635005	0.189848
17	0.518441	0.167293	0.802451	0.0745907	0.66592	0.172504	0.82874
18	0.577898	0.832001	0.247709	0.431077	0.872255	0.627813	0.259124
19	0.0260839	0.265003	0.457474	0.914795	0.581529	0.627005	0.105869
20	0.688403	0.871378	0.0505463	0.729807	0.984956	0.659802	0.358253
21	0.651455	0.177631	0.989661	0.566473	0.386359	0.394267	0.828212
22	0.951893	0.204018	0.66019	0.787345	0.253557	0.0197126	0.516339
23	0.181479	0.747834	0.715575	0.661484	0.557111	0.713604	0.320488
24	0.387263	0.954548	0.644909	0.62192	0.588239	0.991944	0.468822
25	0.155932	0.440701	0.294963	0.899941	0.523053	0.213067	0.219863
26	0.763775	0.419595	0.00734479	0.0496831	0.943962	0.445321	0.307321
27	0.539755	0.718269	0.491847	0.515454	0.82954	0.705841	0.0535696
28	0.209254	0.539045	0.851256	0.814385	0.546947	0.451169	0.878354
29	0.940529	0.049092	0.779194	0.0537909	0.434473	0.319948	0.287719
30	0.476081	0.369854	0.959269	0.815822	0.137305	0.20307	0.987843
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮

	x1	x2	x3	x4	x5	x6	x7	x8	x9	x10	x11
	Float64	Float64	Int64	Float64	Char	Int64	Int64	Bool	Float64	Char	Char
1	1.0	1.0	1	1.0	a	1	1	0	1.0	a	a
2	2.0	2.0	2	2.0	b	2	2	1	2.0	b	b

Introduction to DataFrames¶

Possible pitfalls¶

Know what is copied when creating a DataFrame¶

Do not modify the parent of GroupedDataFrame or view¶

Single column selection for DataFrame creates aliases with ! and getproperty syntax and copies with :¶

When iterating rows of a data frame use eachrow to avoid compilation cost(wide tables), but Tables.namedtupleiterator for fast execution(tall table)¶

Know what is copied when creating a `DataFrame`¶

Do not modify the parent of `GroupedDataFrame` or `view`¶

Single column selection for `DataFrame` creates aliases with `!` and `getproperty` syntax and copies with `:`¶

When iterating rows of a data frame use `eachrow` to avoid compilation cost(wide tables), but `Tables.namedtupleiterator` for fast execution(tall table)¶