DataFrames.jl Cheat Sheet¶

DataFrames.jl v1.x cheat sheet 예제

using Pkg

# Pkg.add("RollingFunctions")

using DataFrames
using CSV
using Statistics
using RollingFunctions

Create DataFrame¶

DataFrame(x=[1,2,3],y=4:6,z=9)

DataFrame([(x=1,y=2),(x=3,y=4)])

DataFrame("x"=>[1,2],"y"=>[3,4])

df = DataFrame(rand(5,3),[:x,:y,:z])

DataFrame(rand(5,3),:auto)

Describe DataFrame¶

df = DataFrame(x=[2,1,2,4,3],y=[1,2,3,4,5],z=[5,4,3,2,1])

describe(df)

describe(df,:mean,:std)

"""
    zsq(x)

Return the element-wise square of `x`.

Used below as a custom summary statistic passed to `describe`.
"""
zsq(x) = x .^ 2

zsq (generic function with 1 method)

describe(df,zsq=>:k)

describe(df,extrema=>:extrema)

Sort Data¶

sort(df,:x)

sort(df,:x,rev = true)

x에 대해 오름차순으로 정렬하고, x 값이 같은 행은 y에 대해 내림차순(역순)으로 정렬

sort(df,[:x,order(:y,rev=true)])

Select Observations(rows)¶

first(df,2)

last(df,1)

unique(df)

unique(DataFrame(x=[2,1,2,4,3],y=[1,2,1,4,5],z=[5,4,5,2,1]))

unique(df,[:x])

df = CSV.read("data/participation.csv",DataFrame)

filter(:lfp=> !=("no"),df)

@time filter(row->row.noc > 3,df)

  0.139630 seconds (322.82 k allocations: 17.273 MiB, 99.46% compilation time)

@time filter(:noc=> >(3),df)

  0.000078 seconds (38 allocations: 5.016 KiB)

subset(df,:age => x-> x .> 6.0)

Select Variables (Columns)¶

정규식을 사용하여 컬럼 필터링
- 아래 예는 첫번째 글자가 n인 컬럼을 선택한다

first(select(df,r"^n"),2)

nyc, noc 컬럼이 아닌 컬럼을 선택한다.

first(select(df,Not([:nyc,:noc])))

df를 df1으로 복사

df1 = df[:,:]

df1.lfp[1]="yes"

"yes"

df1의 lfp 컬럼 주소를 df2에 복사 (뷰 관점)
- !를 사용하면 컬럼 벡터를 복사하지 않고 그대로 공유한다

df2 = df1[!,[:lfp]]

df2의 lfp[1] 값을 변경하면, df2가 df1의 lfp 컬럼 주소를 참조하고 있기 때문에 df1의 lfp 값도 함께 변경된다.

df2.lfp[1] = "ko"

"ko"

위에서 변경한 값을 확인 할 수 있다

first(df1)

df의 컬럼명을 가져 온다

View Metadata¶

names(df)

8-element Vector{String}:
 "Column1"
 "lfp"
 "lnnlinc"
 "age"
 "educ"
 "nyc"
 "noc"
 "foreign"

df의 row 갯수

nrow(df)

872

df의 column 갯수

ncol(df)

8

df의 컬럼명을 symbol로 가져온다

propertynames(df)

8-element Vector{Symbol}:
 :Column1
 :lfp
 :lnnlinc
 :age
 :educ
 :nyc
 :noc
 :foreign

컬럼명을 symbol로 변환하여 비교함

Symbol.(names(df)) == propertynames(df)

true

지정한 컬럼의 index를 가져온다.

columnindex(df,:nyc),columnindex(df,"nyc")

(6, 6)

Handle Missing Data¶

df에서 "missing"을 제외 한다.

df3 = DataFrame(x1=[1,2,3,4],x2=[10,missing,30,40],
                x3=[100,200,missing,400])

dropmissing(df3)

allowmissing(df3)

dropmissing!(df3)
df3

allowmissing(df3)

dropmissing(df)

allowmissing(df)

row의 모든 column에 missing이 없는 경우만 1, 아니면 0을 돌려 준다.

completecases(df)

872-element BitVector:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

Cumulative and Moving Stats¶

Cumulative Stats¶

nyc 컬럼의 누적 합계를 돌려 준다.

select(df,:nyc=>cumsum)

df의 nyc 컬럼의 누적 곱하기를 돌려 준다

select(df,:nyc=>cumprod)

age컬럼의 누적 min값을 돌려 준다

select(df,:age=>(v->accumulate(min,v)))

age컬럼의 누적 max값을 돌려 준다

select(df,:age=>(v->accumulate(max,v)))

누적합계에 대한 평균
- $\frac{1}{i}\sum_{j=1}^{i} \mathrm{age}_j$

select(df,:age=>v->cumsum(v) ./(1:length(v)))

Moving Stats(a.k.a Rolling Stats)¶

5 이동평균
- 5 미만의 데이터는 누적합계의 평균과 동일하게 구하고
- 5개 이상의 데이터부터는 현재 데이터를 포함한 이전 5개의 데이터에 대한 이동 평균을 구함

select(df,:age=>(v->runmean(v,5)))

이동 평균과 동일한 개념으로 현재 데이터 포함 5개의 데이터에 대한 최소값을 구함

select(df,:age=>(v->runmin(v,5)))

Ranking and Lead/Lag Functions¶

- rank functions come from the StatsBase.jl package
- lead and lag functions come from the ShiftedArrays.jl package

using StatsBase

using ShiftedArrays

tmp01 =[30,4,4,2,4,2,11]

7-element Vector{Int64}:
 30
  4
  4
  2
  4
  2
 11

Rank

각 원소의 오름차순 정렬된 index를 돌려 준다

tmp01 |> ordinalrank

7-element Vector{Int64}:
 7
 3
 4
 1
 5
 2
 6

tmp01 |> competerank

7-element Vector{Int64}:
 7
 3
 3
 1
 3
 1
 6

결번이 없는 순차 index - 단 동일값에 대해서 동일 index를 부여

tmp01 |> denserank

7-element Vector{Int64}:
 4
 2
 2
 1
 2
 1
 3

# In the ordinal ranking the three 4s occupy ranks 3, 4 and 5, so their mean rank is 4.0;
# the two 2s occupy ranks 1 and 2, so their mean rank is 1.5.
# tiedrank returns this mean rank for each group of tied values.
tmp01 |> tiedrank

7-element Vector{Float64}:
 7.0
 4.0
 4.0
 1.5
 4.0
 1.5
 6.0

df_m = DataFrame([tmp01],[:x])

select(df_m,:x=>tiedrank)

lead, lag

tmp01을 위로 1칸 shift한다. 아래 빈곳은 missing으로 채운다.

tmp01 |> lead

7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
  4
  4
  2
  4
  2
 11
   missing

tmp01을 위로 2칸 shift한다. 아래 빈곳은 missing으로 채운다.

tmp01 |> x->lead(x,2)

7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
  4
  2
  4
  2
 11
   missing
   missing

tmp01을 아래로 1칸 shift한다. 위의 빈곳은 missing으로 채운다.

tmp01 |> lag

7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
   missing
 30
  4
  4
  2
  4
  2

tmp01을 아래로 2칸 shift한다. 위의 빈곳은 missing으로 채운다.

tmp01 |> x->lag(x,2)

7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
   missing
   missing
 30
  4
  4
  2
  4

select(df_m,:x=>lag)

Build Data Pipeline¶

@chain과 @pipe는 동일하게 동작하나 @chain을 사용하는 것이 좋음
@chain 또는 @pipe 안에 처리되는 DataFrame을 순차적으로 처리한 결과를 다음으로 넘겨 준다
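_ 는 바로 위 단계에서 넘겨받은 결과(DataFrame)를 나타냄. @chain에서는 _ 를 생략하면 이전 결과가 함수의 첫번째 인자로 전달됨 (아래 예의 groupby, combine). @pipe에서는 _ 를 항상 명시해야 함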
아래 예는 순차적으로 lfp가 yes인것만 필터링한 결과를 groupby로 foreign에 대해서 그룹으로 묶은 결과를 다음으로 넘겨 주고 combine을 사용해서 educ의 평균을 구한 결과를 최종적으로 돌려 준다.

using Chain

@chain df begin
  filter(:lfp => ==("yes"),_)
  groupby(:foreign)
  combine(:educ=>mean)
end

import Pipe

Pipe.@pipe df |>
  filter(:lfp => ==("yes"),_) |>
  groupby(_,:foreign) |>
  combine(_,:educ=>mean)

Summarize Data¶

Aggregating variables¶

combine(df,:lnnlinc=>sum)

combine(df,:lnnlinc=>sum=>:lnnlinc)

combine(df,:age=>(x->mean(skipmissing(x)))=>:mean_age)

# List the columns whose maximum is wanted;
# the broadcast pair computes the maximum of :age and the maximum of :lnnlinc.
combine(df,[:age,:lnnlinc].=>maximum)

# Rows whose age equals 6.2
filter(:age=> ==(6.2),df)

# Rows whose lnnlinc is at least 12.375
# NOTE(review): 12.375 is presumably the (rounded) maximum reported by the combine call above — confirm
filter(:lnnlinc=> >=(12.375),df)

Adding variables with aggregation results¶

transform이 combine과 다른점은 df에 function을 적용한 새로운 컬럼을 추가 하는 것

first(
  transform(df,:lnnlinc=>mean=>:average_lnnlinc),
  5)

# Compute the 5-observation running mean of lnnlinc, append it as the
# moving_average_lnnlinc column, and return it together with all original columns.
# Rows 10 through 15 of the result are shown.
transform(df,:lnnlinc=>(x->runmean(x,5))=>:moving_average_lnnlinc)[10:15,:]

# select returns only the listed columns plus the computed one:
# here lfp, lnnlinc and the 5-observation running mean of lnnlinc,
# stored in the moving_average_lnnlinc column (rows 10 through 15 shown).
select(df,:lfp,:lnnlinc,:lnnlinc => (x->runmean(x,5))=> :moving_average_lnnlinc )[10:15,:]

Adding variables by row¶

# Apply a function row by row to the chosen columns, store the result in a new
# column, and return it with all original columns: nc = nyc + noc (rows 1-5 shown).
transform(df,[:nyc,:noc]=>ByRow(+)=>:nc)[1:5,:]

# Same row-wise sum, but select keeps only lfp, age and the new nc column.
select(df,:lfp,:age,[:nyc,:noc]=>ByRow(+)=>:nc)[1:5,:]

# Split the string form of each age at "." and spread the two pieces into columns f and s.
transform(df,:age=>ByRow(x->split(string(x),"."))=>[:f,:s])[1:5,:]

Group Data Sets¶

gdf1 = groupby(df,:educ)

keys(gdf1)

20-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
 GroupKey: (educ = 1,)
 GroupKey: (educ = 2,)
 GroupKey: (educ = 3,)
 GroupKey: (educ = 4,)
 GroupKey: (educ = 5,)
 GroupKey: (educ = 6,)
 GroupKey: (educ = 7,)
 GroupKey: (educ = 8,)
 GroupKey: (educ = 9,)
 GroupKey: (educ = 10,)
 GroupKey: (educ = 11,)
 GroupKey: (educ = 12,)
 GroupKey: (educ = 13,)
 GroupKey: (educ = 14,)
 GroupKey: (educ = 15,)
 GroupKey: (educ = 16,)
 GroupKey: (educ = 17,)
 GroupKey: (educ = 18,)
 GroupKey: (educ = 19,)
 GroupKey: (educ = 21,)

# Look up a group by its key value, passed as a tuple.
# gdf1 was grouped on a single column, so the key has one element.
# Select the group whose key value (educ) is 2.
gdf1[(2,)]

gdf2 = groupby(df,[:lfp,:educ])

keys(gdf2)

36-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
 GroupKey: (lfp = "yes", educ = 1)
 GroupKey: (lfp = "yes", educ = 2)
 GroupKey: (lfp = "yes", educ = 3)
 GroupKey: (lfp = "yes", educ = 4)
 GroupKey: (lfp = "yes", educ = 5)
 GroupKey: (lfp = "yes", educ = 6)
 GroupKey: (lfp = "yes", educ = 7)
 GroupKey: (lfp = "yes", educ = 8)
 GroupKey: (lfp = "yes", educ = 9)
 GroupKey: (lfp = "yes", educ = 10)
 GroupKey: (lfp = "yes", educ = 11)
 GroupKey: (lfp = "yes", educ = 12)
 GroupKey: (lfp = "yes", educ = 13)
 ⋮
 GroupKey: (lfp = "no", educ = 8)
 GroupKey: (lfp = "no", educ = 9)
 GroupKey: (lfp = "no", educ = 10)
 GroupKey: (lfp = "no", educ = 11)
 GroupKey: (lfp = "no", educ = 12)
 GroupKey: (lfp = "no", educ = 13)
 GroupKey: (lfp = "no", educ = 14)
 GroupKey: (lfp = "no", educ = 15)
 GroupKey: (lfp = "no", educ = 16)
 GroupKey: (lfp = "no", educ = 17)
 GroupKey: (lfp = "no", educ = 19)
 GroupKey: (lfp = "no", educ = 21)

# Look up a group by its key value, passed as a tuple.
# gdf2 was grouped on two columns, so the key has two elements.
# Select the group with lfp == "yes" and educ == 2.
gdf2[("yes",2)]

# Run combine on each group of gdf1: mean of age per group, stored as average_age
combine(gdf1,:age=>mean=>:average_age)

# Same result as above, written with a do-block that receives each group as sdf
combine(gdf1) do sdf
  DataFrame(average_age=mean(sdf.age))
end

# Same aggregation again; AsTable(:) hands the group's columns to the function so it can read x.age
combine(gdf1,AsTable(:)=>(x->mean(x.age))=>:average_age)

# Run combine on each group of gdf2: mean of age per (lfp, educ) group
combine(gdf2,:age=>mean=>:average_age)

Combine Data Sets¶

df1 = DataFrame(id1=[1,2,3],x=[4,5,6],y=[7,8,9])

df2 = DataFrame(id2=[1,2,4,5],z=[10,11,12,13])

innerjoin(df1,df2,on=[:id1=>:id2])

leftjoin(df1,df2,on=[:id1=>:id2])

rightjoin(df1,df2,on=[:id1=>:id2])

outerjoin(df1,df2,on=[:id1=>:id2])

# Return the rows of df1 whose id1 has a matching id2 in df2
semijoin(df1,df2,on=[:id1=>:id2])

# Compare df1's id1 with df2's id2 and return the rows of df1
# whose id1 has no match in df2
antijoin(df1,df2,on=[:id1=>:id2])

vcat

df1 = DataFrame(id=[1,2],x=[4,5],y=[7,8])

df2 = DataFrame(id=[3,4],x=[10,11],y=[12,13])

vcat(df1,df2)

hcat

df1 = DataFrame(id=[1,2],x=[4,5])

df2 = DataFrame(y=[7,8])

hcat(df1,df2)

	x	y	z
	Float64	Float64	Float64
1	0.0592809	0.0598258	0.0709284
2	0.279996	0.68633	0.149333
3	0.012851	0.595739	0.695829
4	0.749047	0.715371	0.462145
5	0.614465	0.340671	0.970881

	x1	x2	x3
	Float64	Float64	Float64
1	0.58368	0.36564	0.460039
2	0.249831	0.744277	0.557417
3	0.725893	0.272886	0.394803
4	0.826081	0.185661	0.455548
5	0.960124	0.0255188	0.702536

	variable	k
	Symbol	Array…
1	x	[4, 1, 4, 16, 9]
2	y	[1, 4, 9, 16, 25]
3	z	[25, 16, 9, 4, 1]

	variable	extrema
	Symbol	Tuple…
1	x	(1, 4)
2	y	(1, 5)
3	z	(1, 5)

	age_function
	Float64
1	3.0
2	3.0
3	3.0
4	3.0
5	3.0
6	3.0
7	3.0
8	3.0
9	3.0
10	3.0
11	3.0
12	3.0
13	3.0
14	3.0
15	3.0
16	3.0
17	3.0
18	3.0
19	3.0
20	3.0
21	2.9
22	2.9
23	2.9
24	2.0
25	2.0
26	2.0
27	2.0
28	2.0
29	2.0
30	2.0
⋮	⋮

	variable	mean	min	median	max	nmissing	eltype
	Symbol	Float64	Int64	Float64	Int64	Int64	DataType
1	x	2.4	1	2.0	4	0	Int64
2	y	3.0	1	3.0	5	0	Int64
3	z	3.0	1	3.0	5	0	Int64

	Column1	lfp	lnnlinc	age	educ	nyc	noc	foreign
	Int64	String	Float64	Float64	Int64	Int64	Int64	String
1	1	no	10.7875	3.0	8	1	1	no
2	2	yes	10.5243	4.5	8	0	1	no
3	3	no	10.9686	4.6	9	0	0	no
4	4	no	11.105	3.1	11	2	0	no
5	5	no	11.1085	4.4	12	0	2	no
6	6	yes	11.0283	4.2	12	0	1	no
7	7	no	11.4547	5.1	8	0	0	no
8	8	yes	10.4909	3.2	8	0	2	no
9	9	no	10.6247	3.9	12	0	0	no
10	10	no	10.4864	4.3	11	0	2	no
11	11	no	10.6606	4.5	11	0	2	no
12	12	no	10.4676	6.0	12	0	0	no
13	13	no	11.2296	3.3	11	2	0	no
14	14	no	11.9065	5.6	14	0	0	no
15	15	no	11.5016	5.6	11	0	0	no
16	16	no	11.2935	4.7	11	0	1	no
17	17	no	10.8613	5.0	8	0	0	no
18	18	yes	11.844	3.9	12	0	0	no
19	19	no	11.0486	4.7	8	0	1	no
20	20	yes	10.9578	5.3	11	0	0	no
21	21	no	10.8601	2.9	19	0	0	no
22	22	yes	11.1081	4.6	11	0	1	no
23	23	no	10.8486	4.4	8	0	2	no
24	24	no	10.4824	2.0	12	1	0	no
25	25	yes	10.484	5.4	8	0	0	no
26	26	yes	10.6012	4.7	8	0	0	no
27	27	no	10.5406	2.8	9	2	0	no
28	28	no	11.2243	4.7	8	0	1	no
29	29	yes	10.8411	4.9	12	0	0	no
30	30	no	10.5663	3.0	11	0	1	no
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮

	Column1	lfp	lnnlinc	age	educ	nyc	noc	foreign
	Int64?	String?	Float64?	Float64?	Int64?	Int64?	Int64?	String?
1	1	no	10.7875	3.0	8	1	1	no
2	2	yes	10.5243	4.5	8	0	1	no
3	3	no	10.9686	4.6	9	0	0	no
4	4	no	11.105	3.1	11	2	0	no
5	5	no	11.1085	4.4	12	0	2	no
6	6	yes	11.0283	4.2	12	0	1	no
7	7	no	11.4547	5.1	8	0	0	no
8	8	yes	10.4909	3.2	8	0	2	no
9	9	no	10.6247	3.9	12	0	0	no
10	10	no	10.4864	4.3	11	0	2	no
11	11	no	10.6606	4.5	11	0	2	no
12	12	no	10.4676	6.0	12	0	0	no
13	13	no	11.2296	3.3	11	2	0	no
14	14	no	11.9065	5.6	14	0	0	no
15	15	no	11.5016	5.6	11	0	0	no
16	16	no	11.2935	4.7	11	0	1	no
17	17	no	10.8613	5.0	8	0	0	no
18	18	yes	11.844	3.9	12	0	0	no
19	19	no	11.0486	4.7	8	0	1	no
20	20	yes	10.9578	5.3	11	0	0	no
21	21	no	10.8601	2.9	19	0	0	no
22	22	yes	11.1081	4.6	11	0	1	no
23	23	no	10.8486	4.4	8	0	2	no
24	24	no	10.4824	2.0	12	1	0	no
25	25	yes	10.484	5.4	8	0	0	no
26	26	yes	10.6012	4.7	8	0	0	no
27	27	no	10.5406	2.8	9	2	0	no
28	28	no	11.2243	4.7	8	0	1	no
29	29	yes	10.8411	4.9	12	0	0	no
30	30	no	10.5663	3.0	11	0	1	no
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮

	age_function
	Float64
1	3.0
2	3.75
3	4.03333
4	3.8
5	3.92
6	3.96667
7	4.12857
8	4.0125
9	4.0
10	4.03
11	4.07273
12	4.23333
13	4.16154
14	4.26429
15	4.35333
16	4.375
17	4.41176
18	4.38333
19	4.4
20	4.445
21	4.37143
22	4.38182
23	4.38261
24	4.28333
25	4.328
26	4.34231
27	4.28519
28	4.3
29	4.32069
30	4.27667
⋮	⋮

	educ	average_age
	Int64	Float64
1	1	2.9
2	2	3.73333
3	3	4.32727
4	4	4.426
5	5	4.19552
6	6	3.74286
7	7	4.11935
8	8	4.23121
9	9	4.02193
10	10	3.83
11	11	3.821
12	12	3.84758
13	13	3.69535
14	14	3.76
15	15	4.00667
16	16	3.57
17	17	3.66667
18	18	5.1
19	19	4.0
20	21	3.0

	Column1	lfp	lnnlinc	age	educ	nyc	noc	foreign
	Int64	String	Float64	Float64	Int64	Int64	Int64	String
1	63	no	12.0191	6.1	13	0	0	no
2	68	no	11.3369	6.1	12	0	0	no
3	98	no	10.3385	6.2	7	0	0	no
4	185	no	10.6599	6.1	9	0	0	no
5	349	no	9.99962	6.1	11	0	0	no
6	378	no	10.6445	6.1	8	0	0	no
7	416	yes	10.8196	6.1	4	0	0	no
8	431	no	9.74372	6.1	4	0	0	no
9	568	no	10.0098	6.2	11	0	0	no
10	610	no	10.9624	6.1	5	0	0	no
11	614	no	9.85599	6.2	11	0	0	no

	x1	x2	x3
	Int64	Int64?	Int64?
1	1	10	100
2	2	missing	200
3	3	30	missing
4	4	40	400

	x1	x2	x3
	Int64?	Int64?	Int64?
1	1	10	100
2	2	missing	200
3	3	30	missing
4	4	40	400