DataFrames.jl Cheat Sheet

In [1]:
using Pkg
In [2]:
# Pkg.add("RollingFunctions")
In [3]:
using DataFrames
using CSV
using Statistics
using RollingFunctions

Create DataFrame

In [86]:
DataFrame(x=[1,2,3],y=4:6,z=9)
Out[86]:

3 rows × 3 columns

xyz
Int64Int64Int64
1149
2259
3369
In [87]:
DataFrame([(x=1,y=2),(x=3,y=4)])
Out[87]:

2 rows × 2 columns

xy
Int64Int64
112
234
In [88]:
DataFrame("x"=>[1,2],"y"=>[3,4])
Out[88]:

2 rows × 2 columns

xy
Int64Int64
113
224
In [89]:
df = DataFrame(rand(5,3),[:x,:y,:z])
Out[89]:

5 rows × 3 columns

xyz
Float64Float64Float64
10.05928090.05982580.0709284
20.2799960.686330.149333
30.0128510.5957390.695829
40.7490470.7153710.462145
50.6144650.3406710.970881
In [90]:
DataFrame(rand(5,3),:auto)
Out[90]:

5 rows × 3 columns

x1x2x3
Float64Float64Float64
10.583680.365640.460039
20.2498310.7442770.557417
30.7258930.2728860.394803
40.8260810.1856610.455548
50.9601240.02551880.702536

Describe DataFrame

In [94]:
df = DataFrame(x=[2,1,2,4,3],y=[1,2,3,4,5],z=[5,4,3,2,1])
Out[94]:

5 rows × 3 columns

xyz
Int64Int64Int64
1215
2124
3233
4442
5351
In [95]:
describe(df)
Out[95]:

3 rows × 7 columns

variablemeanminmedianmaxnmissingeltype
SymbolFloat64Int64Float64Int64Int64DataType
1x2.412.040Int64
2y3.013.050Int64
3z3.013.050Int64
In [100]:
describe(df,:mean,:std)
Out[100]:

3 rows × 3 columns

variablemeanstd
SymbolFloat64Float64
1x2.41.14018
2y3.01.58114
3z3.01.58114
In [101]:
function zsq(x)
  x.^2
end
Out[101]:
zsq (generic function with 1 method)
In [102]:
describe(df,zsq=>:k)
Out[102]:

3 rows × 2 columns

variablek
SymbolArray…
1x[4, 1, 4, 16, 9]
2y[1, 4, 9, 16, 25]
3z[25, 16, 9, 4, 1]
In [103]:
describe(df,extrema=>:extrema)
Out[103]:

3 rows × 2 columns

variableextrema
SymbolTuple…
1x(1, 4)
2y(1, 5)
3z(1, 5)

Sort Data

In [96]:
sort(df,:x)
Out[96]:

5 rows × 3 columns

xyz
Int64Int64Int64
1124
2215
3233
4351
5442
In [97]:
sort(df,:x,rev = true)
Out[97]:

5 rows × 3 columns

xyz
Int64Int64Int64
1442
2351
3215
4233
5124
  • x에 대해 오르차순 정렬 후 y에 대해서 역순 정렬
In [98]:
sort(df,[:x,order(:y,rev=true)])
Out[98]:

5 rows × 3 columns

xyz
Int64Int64Int64
1124
2233
3215
4351
5442

Select Observations(rows)

In [104]:
first(df,2)
Out[104]:

2 rows × 3 columns

xyz
Int64Int64Int64
1215
2124
In [105]:
last(df,1)
Out[105]:

1 rows × 3 columns

xyz
Int64Int64Int64
1351
In [106]:
unique(df)
Out[106]:

5 rows × 3 columns

xyz
Int64Int64Int64
1215
2124
3233
4442
5351
In [107]:
unique(DataFrame(x=[2,1,2,4,3],y=[1,2,1,4,5],z=[5,4,5,2,1]))
Out[107]:

4 rows × 3 columns

xyz
Int64Int64Int64
1215
2124
3442
4351
In [108]:
unique(df,[:x])
Out[108]:

4 rows × 3 columns

xyz
Int64Int64Int64
1215
2124
3442
4351
In [109]:
df = CSV.read("data/participation.csv",DataFrame)
Out[109]:

872 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
11no10.78753.0811no
22yes10.52434.5801no
33no10.96864.6900no
44no11.1053.11120no
55no11.10854.41202no
66yes11.02834.21201no
77no11.45475.1800no
88yes10.49093.2802no
99no10.62473.91200no
1010no10.48644.31102no
1111no10.66064.51102no
1212no10.46766.01200no
1313no11.22963.31120no
1414no11.90655.61400no
1515no11.50165.61100no
1616no11.29354.71101no
1717no10.86135.0800no
1818yes11.8443.91200no
1919no11.04864.7801no
2020yes10.95785.31100no
2121no10.86012.91900no
2222yes11.10814.61101no
2323no10.84864.4802no
2424no10.48242.01210no
2525yes10.4845.4800no
2626yes10.60124.7800no
2727no10.54062.8920no
2828no11.22434.7801no
2929yes10.84114.91200no
3030no10.56633.01101no
In [110]:
filter(:lfp=> !=("no"),df)
Out[110]:

401 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
12yes10.52434.5801no
26yes11.02834.21201no
38yes10.49093.2802no
418yes11.8443.91200no
520yes10.95785.31100no
622yes11.10814.61101no
725yes10.4845.4800no
826yes10.60124.7800no
929yes10.84114.91200no
1039yes11.15663.61302no
1141yes11.10634.11202no
1250yes10.64464.21002no
1352yes10.4673.2801no
1453yes10.47823.4902no
1554yes10.43554.4901no
1655yes10.42735.81200no
1756yes10.47184.4902no
1864yes11.41124.01302no
1966yes10.49333.5402no
2067yes10.71863.11011no
2169yes10.81475.51001no
2272yes10.47293.5902no
2376yes10.51765.6900no
2478yes10.90994.3800no
2579yes10.36362.41200no
2680yes9.646155.4800no
2782yes10.27985.2702no
2885yes10.54093.41000no
2990yes11.23584.91202no
30100yes10.34824.8800no
In [111]:
@time filter(row->row.noc > 3,df)
  0.139630 seconds (322.82 k allocations: 17.273 MiB, 99.46% compilation time)
Out[111]:

18 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
175no10.6194.41704no
281no10.15713.4804no
389no11.33894.41304no
496no11.05254.8805no
5130yes10.92574.7804no
6164yes11.21624.01304no
7178no10.9333.5804no
8187no10.923.9805no
9223yes10.70094.0904no
10231yes10.80064.5806no
11338yes10.71994.5304no
12388yes10.95994.51104no
13394yes10.61863.61004no
14436yes10.24564.0906no
15484no11.12154.01104no
16624no12.37573.91204no
17655no10.75953.71204no
18724yes10.47283.8904yes
In [112]:
@time filter(:noc=> >(3),df)
  0.000078 seconds (38 allocations: 5.016 KiB)
Out[112]:

18 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
175no10.6194.41704no
281no10.15713.4804no
389no11.33894.41304no
496no11.05254.8805no
5130yes10.92574.7804no
6164yes11.21624.01304no
7178no10.9333.5804no
8187no10.923.9805no
9223yes10.70094.0904no
10231yes10.80064.5806no
11338yes10.71994.5304no
12388yes10.95994.51104no
13394yes10.61863.61004no
14436yes10.24564.0906no
15484no11.12154.01104no
16624no12.37573.91204no
17655no10.75953.71204no
18724yes10.47283.8904yes
In [113]:
subset(df,:age => x-> x .> 6.0)
Out[113]:

11 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
163no12.01916.11300no
268no11.33696.11200no
398no10.33856.2700no
4185no10.65996.1900no
5349no9.999626.11100no
6378no10.64456.1800no
7416yes10.81966.1400no
8431no9.743726.1400no
9568no10.00986.21100no
10610no10.96246.1500no
11614no9.855996.21100no

Select Variables (Columns)

  • 정규식을 사용하여 컬럼 필터링
    • 아래 예는 첫번째 글자가 n인 컬럼을 선택한다
In [114]:
first(select(df,r"^n"),2)
Out[114]:

2 rows × 2 columns

nycnoc
Int64Int64
111
201
  • nyc, noc 컬럼이 아닌 컬럼을 선택한다.
In [115]:
first(select(df,Not([:nyc,:noc])))
Out[115]:

DataFrameRow (6 columns)

Column1lfplnnlincageeducforeign
Int64StringFloat64Float64Int64String
11no10.78753.08no
  • df를 df1으로 복사
In [116]:
df1 = df[:,:]
Out[116]:

872 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
11no10.78753.0811no
22yes10.52434.5801no
33no10.96864.6900no
44no11.1053.11120no
55no11.10854.41202no
66yes11.02834.21201no
77no11.45475.1800no
88yes10.49093.2802no
99no10.62473.91200no
1010no10.48644.31102no
1111no10.66064.51102no
1212no10.46766.01200no
1313no11.22963.31120no
1414no11.90655.61400no
1515no11.50165.61100no
1616no11.29354.71101no
1717no10.86135.0800no
1818yes11.8443.91200no
1919no11.04864.7801no
2020yes10.95785.31100no
2121no10.86012.91900no
2222yes11.10814.61101no
2323no10.84864.4802no
2424no10.48242.01210no
2525yes10.4845.4800no
2626yes10.60124.7800no
2727no10.54062.8920no
2828no11.22434.7801no
2929yes10.84114.91200no
3030no10.56633.01101no
In [117]:
df1.lfp[1]="yes"
Out[117]:
"yes"
  • df1의 lft컬럼 주소를 df2에 복사 (뷰 관점)
    • !를 사용함
In [118]:
df2 = df1[!,[:lfp]]
Out[118]:

872 rows × 1 columns

lfp
String
1yes
2yes
3no
4no
5no
6yes
7no
8yes
9no
10no
11no
12no
13no
14no
15no
16no
17no
18yes
19no
20yes
21no
22yes
23no
24no
25yes
26yes
27no
28no
29yes
30no
  • df2의 lfp[1]의 값을 변경하는 경우 df1의 lfp컬럼 주소를 참조 하고 있기 때문에 df1의lfp값이 변경된다.
In [119]:
df2.lfp[1] = "ko"
Out[119]:
"ko"
  • 위에서 변경한 값을 확인 할 수 있다
In [120]:
first(df1)
Out[120]:

DataFrameRow (8 columns)

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
11ko10.78753.0811no
  • df의 컬럼명을 가져 온다

View Metadata

In [121]:
names(df)
Out[121]:
8-element Vector{String}:
 "Column1"
 "lfp"
 "lnnlinc"
 "age"
 "educ"
 "nyc"
 "noc"
 "foreign"
  • df의 row 갯수
In [122]:
nrow(df)
Out[122]:
872
  • df의 column 갯수
In [123]:
ncol(df)
Out[123]:
8
  • df의 컬럼명을 symbol로 가져온다
In [124]:
propertynames(df)
Out[124]:
8-element Vector{Symbol}:
 :Column1
 :lfp
 :lnnlinc
 :age
 :educ
 :nyc
 :noc
 :foreign
  • 컬럼명을 symbol로 변환하여 비교함
In [125]:
Symbol.(names(df)) == propertynames(df)
Out[125]:
true
  • 지정한 컬럼의 index를 가져온다.
In [126]:
columnindex(df,:nyc),columnindex(df,"nyc")
Out[126]:
(6, 6)

Handle Missing Data

  • df에서 "missing"을 제외 한다.
In [127]:
df3 = DataFrame(x1=[1,2,3,4],x2=[10,missing,30,40],
                x3=[100,200,missing,400])
Out[127]:

4 rows × 3 columns

x1x2x3
Int64Int64?Int64?
1110100
22missing200
3330missing
4440400
In [128]:
dropmissing(df3)
Out[128]:

2 rows × 3 columns

x1x2x3
Int64Int64Int64
1110100
2440400
In [129]:
allowmissing(df3)
Out[129]:

4 rows × 3 columns

x1x2x3
Int64?Int64?Int64?
1110100
22missing200
3330missing
4440400
In [130]:
dropmissing!(df3)
df3
Out[130]:

2 rows × 3 columns

x1x2x3
Int64Int64Int64
1110100
2440400
In [131]:
allowmissing(df3)
Out[131]:

2 rows × 3 columns

x1x2x3
Int64?Int64?Int64?
1110100
2440400
In [132]:
dropmissing(df)
Out[132]:

872 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
11no10.78753.0811no
22yes10.52434.5801no
33no10.96864.6900no
44no11.1053.11120no
55no11.10854.41202no
66yes11.02834.21201no
77no11.45475.1800no
88yes10.49093.2802no
99no10.62473.91200no
1010no10.48644.31102no
1111no10.66064.51102no
1212no10.46766.01200no
1313no11.22963.31120no
1414no11.90655.61400no
1515no11.50165.61100no
1616no11.29354.71101no
1717no10.86135.0800no
1818yes11.8443.91200no
1919no11.04864.7801no
2020yes10.95785.31100no
2121no10.86012.91900no
2222yes11.10814.61101no
2323no10.84864.4802no
2424no10.48242.01210no
2525yes10.4845.4800no
2626yes10.60124.7800no
2727no10.54062.8920no
2828no11.22434.7801no
2929yes10.84114.91200no
3030no10.56633.01101no
In [133]:
allowmissing(df)
Out[133]:

872 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64?String?Float64?Float64?Int64?Int64?Int64?String?
11no10.78753.0811no
22yes10.52434.5801no
33no10.96864.6900no
44no11.1053.11120no
55no11.10854.41202no
66yes11.02834.21201no
77no11.45475.1800no
88yes10.49093.2802no
99no10.62473.91200no
1010no10.48644.31102no
1111no10.66064.51102no
1212no10.46766.01200no
1313no11.22963.31120no
1414no11.90655.61400no
1515no11.50165.61100no
1616no11.29354.71101no
1717no10.86135.0800no
1818yes11.8443.91200no
1919no11.04864.7801no
2020yes10.95785.31100no
2121no10.86012.91900no
2222yes11.10814.61101no
2323no10.84864.4802no
2424no10.48242.01210no
2525yes10.4845.4800no
2626yes10.60124.7800no
2727no10.54062.8920no
2828no11.22434.7801no
2929yes10.84114.91200no
3030no10.56633.01101no

row의 모든 column에 missing이 없는 경우만 1, 아니면 0을 돌려 준다.

In [134]:
completecases(df)
Out[134]:
872-element BitVector:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1

Cumulative and Moving Stats

Cumulative Stats

  • nyc 컬럼의 누적 합계를 돌려 준다.
In [135]:
select(df,:nyc=>cumsum)
Out[135]:

872 rows × 1 columns

nyc_cumsum
Int64
11
21
31
43
53
63
73
83
93
103
113
123
135
145
155
165
175
185
195
205
215
225
235
246
256
266
278
288
298
308
  • df의 nyc 컬럼의 누적 곱하기를 돌려 준다
In [136]:
select(df,:nyc=>cumprod)
Out[136]:

872 rows × 1 columns

nyc_cumprod
Int64
11
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
  • age컬럼의 누적 min값을 돌려 준다
In [137]:
select(df,:age=>(v->accumulate(min,v)))
Out[137]:

872 rows × 1 columns

age_function
Float64
13.0
23.0
33.0
43.0
53.0
63.0
73.0
83.0
93.0
103.0
113.0
123.0
133.0
143.0
153.0
163.0
173.0
183.0
193.0
203.0
212.9
222.9
232.9
242.0
252.0
262.0
272.0
282.0
292.0
302.0
  • age컬럼의 누적 max값을 돌려 준다
In [138]:
select(df,:age=>(v->accumulate(max,v)))
Out[138]:

872 rows × 1 columns

age_function
Float64
13.0
24.5
34.6
44.6
54.6
64.6
75.1
85.1
95.1
105.1
115.1
126.0
136.0
146.0
156.0
166.0
176.0
186.0
196.0
206.0
216.0
226.0
236.0
246.0
256.0
266.0
276.0
286.0
296.0
306.0
  • 누적합계에 대한 평균
    • $\frac{1}{i}\sum_{j=1}^i age_j $
In [139]:
select(df,:age=>v->cumsum(v) ./(1:length(v)))
Out[139]:

872 rows × 1 columns

age_function
Float64
13.0
23.75
34.03333
43.8
53.92
63.96667
74.12857
84.0125
94.0
104.03
114.07273
124.23333
134.16154
144.26429
154.35333
164.375
174.41176
184.38333
194.4
204.445
214.37143
224.38182
234.38261
244.28333
254.328
264.34231
274.28519
284.3
294.32069
304.27667

Moving Stats(a.k.a Rolling Stats)

  • 5 이동평균
    • 5 미만의 데이터는 누적합계의 평균과 동일하게 구하고
    • 5개 이상의 데이터 부터는 현재 데이터 포함 이전 5개의 데이터 대한 이동 평균을 구함
In [140]:
select(df,:age=>(v->runmean(v,5)))
Out[140]:

872 rows × 1 columns

age_function
Float64
13.0
23.75
34.03333
43.8
53.92
64.16
74.28
84.0
94.16
104.14
114.2
124.38
134.4
144.74
155.0
165.04
174.84
184.96
194.78
204.72
214.36
224.28
234.38
243.84
253.86
264.22
273.86
283.92
294.5
304.02
  • 이동 평균과 동일한 개념으로 현재 데이터 포함 5개의 데이터에 대한 최소값을 구함
In [141]:
select(df,:age=>(v->runmin(v,5)))
Out[141]:

872 rows × 1 columns

age_function
Float64
13.0
23.0
33.0
43.0
53.0
63.1
73.1
83.1
93.2
103.2
113.2
123.2
133.3
143.3
153.3
163.3
173.3
183.9
193.9
203.9
212.9
222.9
232.9
242.0
252.0
262.0
272.0
282.0
292.8
302.8

Ranking and Lead/Lag Functions

  • *rank functions come from StatsBase.jl package
  • lead and lag functions come from ShifftedArrays.jl package
In [142]:
using StatsBase
In [143]:
using ShiftedArrays
In [144]:
tmp01 =[30,4,4,2,4,2,11]
Out[144]:
7-element Vector{Int64}:
 30
  4
  4
  2
  4
  2
 11
  • Rank

각 원소의 오름차순 정렬된 index를 돌려 준다

In [145]:
tmp01 |> ordinalrank
Out[145]:
7-element Vector{Int64}:
 7
 3
 4
 1
 5
 2
 6
In [146]:
tmp01 |> competerank
Out[146]:
7-element Vector{Int64}:
 7
 3
 3
 1
 3
 1
 6

결번이 없는 순차 index - 단 동일값에 대해서 동일 index를 부여

In [147]:
tmp01 |> denserank
Out[147]:
7-element Vector{Int64}:
 4
 2
 2
 1
 2
 1
 3
In [148]:
# ordinal 순위에서 4의 순위가 3,4,5 이고 평균값이 4.0, 
# 1의 순위가 1,2 이고 평균값이 1.5
# 순위 index의 평균값을 돌려 준다
tmp01 |> tiedrank
Out[148]:
7-element Vector{Float64}:
 7.0
 4.0
 4.0
 1.5
 4.0
 1.5
 6.0
In [149]:
df_m = DataFrame([tmp01],[:x])
Out[149]:

7 rows × 1 columns

x
Int64
130
24
34
42
54
62
711
In [150]:
select(df_m,:x=>tiedrank)
Out[150]:

7 rows × 1 columns

x_tiedrank
Float64
17.0
24.0
34.0
41.5
54.0
61.5
76.0
  • lead, lag

tmp01을 위로 1칸 shift한다. 아래 빈곳은 missing으로 채운다.

In [151]:
tmp01 |> lead
Out[151]:
7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
  4
  4
  2
  4
  2
 11
   missing

tmp01을 위로 2칸 shift한다. 아래 빈곳은 missing으로 채운다.

In [152]:
tmp01 |> x->lead(x,2)
Out[152]:
7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
  4
  2
  4
  2
 11
   missing
   missing

tmp01을 아래로 1칸 shift한다. 위의 빈곳은 missing으로 채운다.

In [153]:
tmp01 |> lag
Out[153]:
7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
   missing
 30
  4
  4
  2
  4
  2

tmp01을 아래로 2칸 shift한다. 위의 빈곳은 missing으로 채운다.

In [154]:
tmp01 |> x->lag(x,2)
Out[154]:
7-element ShiftedVector{Int64, Missing, Vector{Int64}}:
   missing
   missing
 30
  4
  4
  2
  4
In [155]:
select(df_m,:x=>lag)
Out[155]:

7 rows × 1 columns

x_lag
Int64?
1missing
230
34
44
52
64
72

Build Data Pipeline

  • @chain과 @pipe는 동일하게 동작하나 @chain을 사용하는 것이 좋음
  • @chain 또는 @pipe 안에 처리되는 DataFrame을 순차적으로 처리한 결과를 다음으로 넘겨 준다
  • _ 는 바로 위에서 받은 DataFrame을 나타냄
  • 아래 예는 순차적으로 lfp가 yes인것만 필터링한 결과를 groupby로 foreign에 대해서 그룹으로 묶은 결과를 다음으로 넘겨 주고 combine을 사용해서 educ의 평균을 구한 결과를 최종적으로 돌려 준다.
In [156]:
using Chain
In [157]:
@chain df begin
  filter(:lfp => ==("yes"),_)
  groupby(:foreign)
  combine(:educ=>mean)
end
Out[157]:

2 rows × 2 columns

foreigneduc_mean
StringFloat64
1no10.1969
2yes6.85034
In [158]:
import Pipe
In [159]:
Pipe.@pipe df |>
  filter(:lfp => ==("yes"),_) |>
  groupby(_,:foreign) |>
  combine(_,:educ=>mean)
Out[159]:

2 rows × 2 columns

foreigneduc_mean
StringFloat64
1no10.1969
2yes6.85034

Summarize Data

Aggregating variables

In [160]:
combine(df,:lnnlinc=>sum)
Out[160]:

1 rows × 1 columns

lnnlinc_sum
Float64
19317.81
In [161]:
combine(df,:lnnlinc=>sum=>:lnnlinc)
Out[161]:

1 rows × 1 columns

lnnlinc
Float64
19317.81
In [169]:
combine(df,:age=>(x->mean(skipmissing(x)))=>:mean_age)
Out[169]:

1 rows × 1 columns

mean_age
Float64
13.99553
In [180]:
# 최대값을 구할 컬럼을 입력한다
# :age중의 최대값, :lnnlinc중의 최대값을 구함
combine(df,[:age,:lnnlinc].=>maximum)
Out[180]:

1 rows × 2 columns

age_maximumlnnlinc_maximum
Float64Float64
16.212.3757
In [178]:
# age가 6.2인 사람들의 정보
filter(:age=> ==(6.2),df)
Out[178]:

3 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
198no10.33856.2700no
2568no10.00986.21100no
3614no9.855996.21100no
In [179]:
# lnnlinc가 최대인 사람들의 정보
filter(:lnnlinc=> >=(12.375),df)
Out[179]:

1 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
1624no12.37573.91204no

Adding variables with aggregation results

  • transform이 combine과 다른점은 df에 function을 적용한 새로운 컬럼을 추가 하는 것
In [182]:
first(
  transform(df,:lnnlinc=>mean=>:average_lnnlinc),
  5)
Out[182]:

5 rows × 9 columns

Column1lfplnnlincageeducnycnocforeignaverage_lnnlinc
Int64StringFloat64Float64Int64Int64Int64StringFloat64
11no10.78753.0811no10.6856
22yes10.52434.5801no10.6856
33no10.96864.6900no10.6856
44no11.1053.11120no10.6856
55no11.10854.41202no10.6856
In [194]:
# lnnlinc에 대해 5개의 이동평균을 구하고 moving_average_lnnlinc 컬럼에 추가하여
# 전체를 돌려준다
# 10 ~ 20번째 행을 보여줌
transform(df,:lnnlinc=>(x->runmean(x,5))=>:moving_average_lnnlinc)[10:15,:]
Out[194]:

6 rows × 9 columns

Column1lfplnnlincageeducnycnocforeignmoving_average_lnnlinc
Int64StringFloat64Float64Int64Int64Int64StringFloat64
110no10.48644.31102no10.817
211no10.66064.51102no10.7435
312no10.46766.01200no10.5461
413no11.22963.31120no10.6938
514no11.90655.61400no10.9501
615no11.50165.61100no11.1532
In [193]:
# 선택한 항목에 대해서 처리 하고 결과를 돌려 준다.
# 선택한 항목과 특정항목 예를들어 lnnlinc에 대해 5개의 이동평균을 구하고 
# moving_average_lnnlinc 컬럼에 추가하여 돌려준다
select(df,:lfp,:lnnlinc,:lnnlinc => (x->runmean(x,5))=> :moving_average_lnnlinc )[10:15,:]
Out[193]:

6 rows × 3 columns

lfplnnlincmoving_average_lnnlinc
StringFloat64Float64
1no10.486410.817
2no10.660610.7435
3no10.467610.5461
4no11.229610.6938
5no11.906510.9501
6no11.501611.1532

Adding variables by row

In [199]:
# row별로 원하는 컬럼에 대해 function을 적용한 결과를 새로운 컬럼에 추가 하여
# 전체를 돌려 준다.
transform(df,[:nyc,:noc]=>ByRow(+)=>:nc)[1:5,:]
Out[199]:

5 rows × 9 columns

Column1lfplnnlincageeducnycnocforeignnc
Int64StringFloat64Float64Int64Int64Int64StringInt64
11no10.78753.0811no2
22yes10.52434.5801no1
33no10.96864.6900no0
44no11.1053.11120no2
55no11.10854.41202no2
In [202]:
select(df,:lfp,:age,[:nyc,:noc]=>ByRow(+)=>:nc)[1:5,:]
Out[202]:

5 rows × 3 columns

lfpagenc
StringFloat64Int64
1no3.02
2yes4.51
3no4.60
4no3.12
5no4.42
In [214]:
transform(df,:age=>ByRow(x->split(string(x),"."))=>[:f,:s])[1:5,:]
Out[214]:

5 rows × 10 columns

Column1lfplnnlincageeducnycnocforeignfs
Int64StringFloat64Float64Int64Int64Int64StringSubStri…SubStri…
11no10.78753.0811no30
22yes10.52434.5801no45
33no10.96864.6900no46
44no11.1053.11120no31
55no11.10854.41202no44

Group Data Sets

In [217]:
gdf1 = groupby(df,:educ)
Out[217]:

GroupedDataFrame with 20 groups based on key: educ

First Group (1 row): educ = 1

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
1763yes9.439712.9120yes

Last Group (1 row): educ = 21

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
1218no10.97593.02120no
In [237]:
keys(gdf1)
Out[237]:
20-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
 GroupKey: (educ = 1,)
 GroupKey: (educ = 2,)
 GroupKey: (educ = 3,)
 GroupKey: (educ = 4,)
 GroupKey: (educ = 5,)
 GroupKey: (educ = 6,)
 GroupKey: (educ = 7,)
 GroupKey: (educ = 8,)
 GroupKey: (educ = 9,)
 GroupKey: (educ = 10,)
 GroupKey: (educ = 11,)
 GroupKey: (educ = 12,)
 GroupKey: (educ = 13,)
 GroupKey: (educ = 14,)
 GroupKey: (educ = 15,)
 GroupKey: (educ = 16,)
 GroupKey: (educ = 17,)
 GroupKey: (educ = 18,)
 GroupKey: (educ = 19,)
 GroupKey: (educ = 21,)
In [235]:
# tuple을 사용하여 key값으로 검색한다.
# 위 예제는 컬럼하나에 대해서 그룹을 했기 때문에 key의 차원은 1
# key값이 2번을 선택
gdf1[(2,)]
Out[235]:

6 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
1219no10.36862.4200no
2711no10.70543.9210yes
3751yes10.5314.1202yes
4780yes10.61034.3202yes
5833no10.48364.4201yes
6852yes10.48373.3213yes
In [225]:
gdf2 = groupby(df,[:lfp,:educ])
Out[225]:

GroupedDataFrame with 36 groups based on keys: lfp, educ

First Group (1 row): lfp = "yes", educ = 1

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
1763yes9.439712.9120yes

Last Group (1 row): lfp = "no", educ = 21

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
1218no10.97593.02120no
In [226]:
keys(gdf2)
Out[226]:
36-element DataFrames.GroupKeys{GroupedDataFrame{DataFrame}}:
 GroupKey: (lfp = "yes", educ = 1)
 GroupKey: (lfp = "yes", educ = 2)
 GroupKey: (lfp = "yes", educ = 3)
 GroupKey: (lfp = "yes", educ = 4)
 GroupKey: (lfp = "yes", educ = 5)
 GroupKey: (lfp = "yes", educ = 6)
 GroupKey: (lfp = "yes", educ = 7)
 GroupKey: (lfp = "yes", educ = 8)
 GroupKey: (lfp = "yes", educ = 9)
 GroupKey: (lfp = "yes", educ = 10)
 GroupKey: (lfp = "yes", educ = 11)
 GroupKey: (lfp = "yes", educ = 12)
 GroupKey: (lfp = "yes", educ = 13)
 ⋮
 GroupKey: (lfp = "no", educ = 8)
 GroupKey: (lfp = "no", educ = 9)
 GroupKey: (lfp = "no", educ = 10)
 GroupKey: (lfp = "no", educ = 11)
 GroupKey: (lfp = "no", educ = 12)
 GroupKey: (lfp = "no", educ = 13)
 GroupKey: (lfp = "no", educ = 14)
 GroupKey: (lfp = "no", educ = 15)
 GroupKey: (lfp = "no", educ = 16)
 GroupKey: (lfp = "no", educ = 17)
 GroupKey: (lfp = "no", educ = 19)
 GroupKey: (lfp = "no", educ = 21)
In [241]:
# tuple을 사용하여 key값으로 검색한다.
# 위 예제는 컬럼 두개에 대해서 그룹을 했기 때문에 key의 차원은 2
# key값이 yes, 2 인 그룹을 선택
gdf2[("yes",2)]
Out[241]:

3 rows × 8 columns

Column1lfplnnlincageeducnycnocforeign
Int64StringFloat64Float64Int64Int64Int64String
1751yes10.5314.1202yes
2780yes10.61034.3202yes
3852yes10.48373.3213yes
In [242]:
# 각 그룹에 대해 combine 수행
combine(gdf1,:age=>mean=>:average_age)
Out[242]:

20 rows × 2 columns

educaverage_age
Int64Float64
112.9
223.73333
334.32727
444.426
554.19552
663.74286
774.11935
884.23121
994.02193
10103.83
11113.821
12123.84758
13133.69535
14143.76
15154.00667
16163.57
17173.66667
18185.1
19194.0
20213.0
In [246]:
# 위와 같은 결과
combine(gdf1) do sdf
  DataFrame(average_age=mean(sdf.age))
end
Out[246]:

20 rows × 2 columns

educaverage_age
Int64Float64
112.9
223.73333
334.32727
444.426
554.19552
663.74286
774.11935
884.23121
994.02193
10103.83
11113.821
12123.84758
13133.69535
14143.76
15154.00667
16163.57
17173.66667
18185.1
19194.0
20213.0
In [251]:
combine(gdf1,AsTable(:)=>(x->mean(x.age))=>:average_age)
Out[251]:

20 rows × 2 columns

educaverage_age
Int64Float64
112.9
223.73333
334.32727
444.426
554.19552
663.74286
774.11935
884.23121
994.02193
10103.83
11113.821
12123.84758
13133.69535
14143.76
15154.00667
16163.57
17173.66667
18185.1
19194.0
20213.0
In [243]:
# 각 그룹에 대해 combine 수행
combine(gdf2,:age=>mean=>:average_age)
Out[243]:

36 rows × 3 columns

lfpeducaverage_age
StringInt64Float64
1yes12.9
2yes23.9
3yes34.47778
4yes44.16897
5yes53.8725
6yes63.53636
7yes74.04375
8yes84.11194
9yes93.94909
10yes103.73714
11yes113.70789
12yes123.93778
13yes133.44
14yes143.38182
15yes153.9125
16yes163.53333
17yes173.2
18yes185.1
19no23.56667
20no33.65
21no44.78095
22no54.67407
23no63.97
24no74.2
25no84.32
26no94.08983
27no103.88909
28no113.89032
29no123.7962
30no134.05

Combine Data Sets

In [271]:
df1 = DataFrame(id1=[1,2,3],x=[4,5,6],y=[7,8,9])
Out[271]:

3 rows × 3 columns

id1xy
Int64Int64Int64
1147
2258
3369
In [272]:
df2 = DataFrame(id2=[1,2,4,5],z=[10,11,12,13])
Out[272]:

4 rows × 2 columns

id2z
Int64Int64
1110
2211
3412
4513
In [274]:
innerjoin(df1,df2,on=[:id1=>:id2])
Out[274]:

2 rows × 4 columns

id1xyz
Int64Int64Int64Int64
114710
225811
In [275]:
leftjoin(df1,df2,on=[:id1=>:id2])
Out[275]:

3 rows × 4 columns

id1xyz
Int64Int64Int64Int64?
114710
225811
3369missing
In [276]:
rightjoin(df1,df2,on=[:id1=>:id2])
Out[276]:

4 rows × 4 columns

id1xyz
Int64Int64?Int64?Int64
114710
225811
34missingmissing12
45missingmissing13
In [277]:
outerjoin(df1,df2,on=[:id1=>:id2])
Out[277]:

5 rows × 4 columns

id1xyz
Int64Int64?Int64?Int64?
114710
225811
3369missing
44missingmissing12
55missingmissing13
In [278]:
# df2의 id와 동일한 id를 가진 df1의 데이터를 돌려 준다
semijoin(df1,df2,on=[:id1=>:id2])
Out[278]:

2 rows × 3 columns

id1xy
Int64Int64Int64
1147
2258
In [281]:
# df1의id와 df2의id를 비교하여 df2에 없는 df1의 
# id에 해당 하는 데이터를 돌려 준다
antijoin(df1,df2,on=[:id1=>:id2])
Out[281]:

1 rows × 3 columns

id1xy
Int64Int64Int64
1369
  • vcat
In [285]:
df1 = DataFrame(id=[1,2],x=[4,5],y=[7,8])
Out[285]:

2 rows × 3 columns

idxy
Int64Int64Int64
1147
2258
In [286]:
df2 = DataFrame(id=[3,4],x=[10,11],y=[12,13])
Out[286]:

2 rows × 3 columns

idxy
Int64Int64Int64
131012
241113
In [287]:
vcat(df1,df2)
Out[287]:

4 rows × 3 columns

idxy
Int64Int64Int64
1147
2258
331012
441113
  • hcat
In [288]:
df1 = DataFrame(id=[1,2],x=[4,5])
Out[288]:

2 rows × 2 columns

idx
Int64Int64
114
225
In [289]:
df2 = DataFrame(y=[7,8])
Out[289]:

2 rows × 1 columns

y
Int64
17
28
In [290]:
hcat(df1,df2)
Out[290]:

2 rows × 3 columns

idxy
Int64Int64Int64
1147
2258
In [ ]: