using DataFrames, Pipe, BenchmarkTools
# Sample data: two integer key columns and a column of random values.
x = DataFrame(; id=repeat(1:4, 2), id2=repeat(1:2, 4), v=rand(8))
groupby(x, :id)                 # group by a single column
groupby(x, [])                  # group by an empty list of columns
gx2 = groupby(x, [:id, :id2])   # group by two columns
p1 = parent(gx2)                # get the parent DataFrame
parent returns a reference to the parent data frame itself (not a copy).
p1 === x  # identity check between the parent and `x`
# Back to a DataFrame, but with rows in a different order than the original.
vcat(gx2...)
DataFrame(gx2)  # the same
# Drop the grouping columns when creating a data frame.
DataFrame(gx2; keepkeys=false)
# Vector of names of the grouping variables ...
groupcols(gx2)
# ... and of the non-grouping variables.
valuecols(gx2)
# Group indices in parent(gx2).
groupindices(gx2)
kgx2 = keys(gx2)
You can index into a GroupedDataFrame
like into a vector or a dictionary. The second form accepts a GroupKey, NamedTuple
or Tuple.
gx2
# Take the key of the first group and convert it to the other accepted forms.
k = first(keys(gx2))
ntk = NamedTuple(k)
tk = Tuple(k)
the operations below produce the same result and are fast
# Benchmark the four equivalent ways of indexing into the GroupedDataFrame.
# `$` interpolates the global variables into the benchmarked expression so that
# BenchmarkTools times only the indexing itself, not the lookup of untyped globals.
@btime $gx2[1]      # by group position
@btime $gx2[$k]     # by GroupKey
@btime $gx2[$ntk]   # by NamedTuple
@btime $gx2[$tk]    # by Tuple
handling missing values
x = DataFrame(; id=[missing, 5, 1, 3, missing], x=1:5)
# By default groups include missing values and are not sorted ...
groupby(x, :id)
# ... but both can be changed with keyword arguments.
groupby(x, :id; sort=true, skipmissing=true)
combine, select, select!, transform and transform!
using Statistics
using Chain
Reduce the number of rows in the output
# Limit the number of rows shown in the output.
ENV["LINES"] = 15
# 100 rows: a random group label from 'a' to 'd' and a random value.
x = DataFrame(; id=rand('a':'d', 100), v=rand(100))
Apply a function to each group of a data frame.
combine keeps as many rows as are returned from the function.
# Mean of `v` within each `id` group.
@chain x begin
    groupby(:id)
    combine(:v => mean)
end
# Add a column holding the row index of each row.
x[!, :id2] = axes(x, 1)
x
axes(x)
# select and transform keep as many rows as there are in the source data frame,
# in the original order; additionally, transform keeps all columns of the source.
@chain x begin
    groupby(:id)
    transform(:v => mean)
end
# The same grouping written with Pipe's placeholder syntax.
@pipe x |> groupby(_, :id)
# Note that combine reorders rows by group of the GroupedDataFrame.
@chain x begin
    groupby(:id)
    combine(:id2, :v => mean)
end
# Give the result column a custom name.
@chain x begin
    groupby(:id)
    combine(:v => mean => :res)
end
# Several operations can be requested in a single call.
@chain x begin
    groupby(:id)
    combine(
        :v => mean => :res1,
        :v => sum => :res2,
        nrow => :n,
        ncol,
    )
end
# Pass a function that receives each group's sub data frame.
combine(groupby(x, :id)) do sdf
    rows = nrow(sdf)
    # Drop groups with a low number of rows by returning an empty data frame.
    if rows < 25
        DataFrame()
    else
        DataFrame(; n=rows)
    end
end
df = DataFrame(; id=[1, 1, 2, 2], val=[1, 2, 3, 4])
# The function wraps each group's values in a one-element vector;
# AsTable requests that the result be spread over several columns.
@chain df begin
    groupby(:id)
    combine(:val => (vals -> [vals]) => AsTable)
end
# Same transformation, but with explicit names for the output columns.
@chain df begin
    groupby(:id)
    combine(:val => (vals -> [vals]) => [:c1, :c2])
end
# A column of NamedTuples: use it with AsTable before `df` is reassigned below
# (previously this data frame was overwritten without ever being used).
df = DataFrame(; a=[(p=1, q=2), (p=3, q=4)])
select(df, :a => AsTable)
# A column of vectors.
df = DataFrame(; a=[[1, 2], [3, 4]])
select(df, :a)
select(df, :a => AsTable)      # automatic column names generated
select(df, :a => [:C1, :C2])   # explicitly chosen column names
Finally, observe that one can conveniently apply multiple transformations using broadcasting:
df = DataFrame(; id=repeat(1:10; outer=10), x1=1:100, x2=101:200)
groupby(df, :id)
# Broadcasting `=>` applies the same function to every listed column.
@chain df begin
    groupby(:id)
    combine([:x1, :x2] .=> minimum)
end
# Broadcasting over two vectors pairs them element by element:
# :x1 with minimum and :x2 with maximum.
@chain df begin
    groupby(:id)
    combine([:x1, :x2] .=> [minimum, maximum])
end
mapcols
# 11×10 data frame of random numbers with automatically generated column names.
x = DataFrame(rand(11, 10), :auto)
# Apply `mean` to every column.
mapcols(mean, x)
eachcol
and eachrow
map(mean, eachcol(x)) # map a function over each column and return a vector
# Iterating pairs(eachcol(x)) yields a Pair of column name and column values.
foreach(c -> println(c[1], ": ", mean(c[2])), pairs(eachcol(x)))
keys(pairs(eachcol(x)))
values(pairs(eachcol(x)))
Now the returned value is a DataFrameRow, which works like a NamedTuple but is a view into the parent DataFrame.
# Ratio of the first two columns, computed row by row.
[row.x1 / row.x2 for row in eachrow(x)]
# Mean of every column.
map(mean, eachcol(x))
It prints like a data frame, only the caption is different so that you know the type of the object
er = eachrow(x)
# Columns of the parent data frame can be accessed directly through `er`.
er.x1
It prints like a data frame, only the caption is different so that you know the type of the object
ec = eachcol(x)
# Column access by name, as with the row iterator.
ec.x1
# First element of each iterator: a row and a column, respectively.
first(er)
first(ec)
You can transpose a data frame using permutedims:
df = DataFrame(reshape(1:12, 3, 4), :auto)
# Add a column of labels; it is passed to permutedims below as the source column.
df[!, :names] = ["a", "b", "c"]
df
permutedims(df, :names)