my_matrix_π = Matrix{Float64}(undef,2,2)

2×2 Matrix{Float64}:
 2.122e-314  5.0e-324
 2.122e-314  1.29994e-315


fill!(my_matrix_π,3.14)

2×2 Matrix{Float64}:
 3.14  3.14
 3.14  3.14


[ones(Int,2,2) zeros(Int,2,2)]

2×4 Matrix{Int64}:
 1  1  0  0
 1  1  0  0


[ones(Int,2,2);zeros(Int,2,2)]

4×2 Matrix{Int64}:
 1  1
 1  1
 0  0
 0  0


[x^2 for x in 1:10 if isodd(x)]

5-element Vector{Int64}:
  1
  9
 25
 49
 81


(   x->
    begin
        println(x)
        println(x)
        println(x)
    end
).("ddd")

ddd
ddd
ddd


my_example_matrix = [[1 2 3];[4 5 6];[7 8 9]]

3×3 Matrix{Int64}:
 1  2  3
 4  5  6
 7  8  9


map(x->x+1, my_example_matrix[3,:])

3-element Vector{Int64}:
  8
  9
 10


using BenchmarkTools


@benchmark mapslices(sum,my_example_matrix,dims=1)

BenchmarkTools.Trial: 
  memory estimate:  2.70 KiB
  allocs estimate:  57
  --------------
  minimum time:     5.400 μs (0.00% GC)
  median time:      6.500 μs (0.00% GC)
  mean time:        7.043 μs (2.48% GC)
  maximum time:     598.550 μs (98.14% GC)
  --------------
  samples:          10000
  evals/sample:     6


@benchmark sum(my_example_matrix,dims=1)

BenchmarkTools.Trial: 
  memory estimate:  112 bytes
  allocs estimate:  1
  --------------
  minimum time:     93.444 ns (0.00% GC)
  median time:      139.438 ns (0.00% GC)
  mean time:        137.157 ns (1.36% GC)
  maximum time:     2.402 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     961


@benchmark sort(data) setup=(data=rand(10))

BenchmarkTools.Trial: 
  memory estimate:  160 bytes
  allocs estimate:  1
  --------------
  minimum time:     94.203 ns (0.00% GC)
  median time:      113.458 ns (0.00% GC)
  mean time:        118.443 ns (1.50% GC)
  maximum time:     760.663 ns (82.34% GC)
  --------------
  samples:          10000
  evals/sample:     966


column_major = [[1 3];[2 4]]

2×2 Matrix{Int64}:
 1  3
 2  4


row_major = [[1 2];[3 4]]

2×2 Matrix{Int64}:
 1  2
 3  4


for c in eachcol(column_major)
    println(c)
end
for r in eachrow(column_major)
    println(r)
end

[1, 2]
[3, 4]
[1, 3]
[2, 4]


my_pair = "Julia"=>42

"Julia" => 42


my_pair.first

"Julia"


my_pair.second

42


first(my_pair)

"Julia"


last(my_pair)

42


name2number_map = Dict("one"=>1, "two"=>2)

Dict{String, Int64} with 2 entries:
  "two" => 2
  "one" => 1


"two" in keys(name2number_map)

true


name2number_map["three"] = 3

3


name2number_map

Dict{String, Int64} with 3 entries:
  "two"   => 2
  "one"   => 1
  "three" => 3


delete!(name2number_map,"three")

Dict{String, Int64} with 2 entries:
  "two" => 2
  "one" => 1


popped_value = pop!(name2number_map,"two")

2


A = ["one","two","three"]
B = [1,2,3]
name2number_map = Dict(zip(A,B))

Dict{String, Int64} with 3 entries:
  "two"   => 2
  "one"   => 1
  "three" => 3


pop!(name2number_map)

"two" => 2


add_elements(a,b,c) = a + b+ c

add_elements (generic function with 1 method)


my_collection = [1,2,3]
add_elements(my_collection...)

6


add_elements(1:3...)

6


dirname(@__FILE__)

""


using Dates


subtypes(Period)

2-element Vector{Any}:
 DatePeriod
 TimePeriod


subtypes(DatePeriod)

5-element Vector{Any}:
 Day
 Month
 Quarter
 Week
 Year


subtypes(TimePeriod)

6-element Vector{Any}:
 Hour
 Microsecond
 Millisecond
 Minute
 Nanosecond
 Second


methods(DateTime)


methods(Date)


Date("19870913","yyyymmdd")

1987-09-13


m_day = Date("1987-09-13")

1987-09-13


yearmonth(m_day)

(1987, 9)


monthday(m_day)

(9, 13)


yearmonthday(m_day)

(1987, 9, 13)


m_day

1987-09-13


dayofweek(m_day)

7


dayname(m_day)

"Sunday"


dayofweekofmonth(m_day)

2


dayofweekofmonth(Date("1987-08-29")+Day(1))

5


first(methodswith(Date),20)


collect(Date("2021-01-01"):Day(1):Date("2021-01-07"))

7-element Vector{Date}:
 2021-01-01
 2021-01-02
 2021-01-03
 2021-01-04
 2021-01-05
 2021-01-06
 2021-01-07


DateTime("2021-01-01T21:10:11")

2021-01-01T21:10:11


DateTime("2021-01-01")

2021-01-01T00:00:00


methods(DateTime)


rand((42,"Julia",3.14))

42


rand(Dict(:one=>1,:two=>2))

:two => 2


using Downloads


readlines(Downloads.download("https://naver.com"))[1:2]

2-element Vector{String}:
 ""
 "<!doctype html>                          <html lang=\"ko\" data-dark=\"false\"> <head> <meta charset=\"utf-8\"> <title>NAVER</title> <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\"> <meta name=\"viewport\" content=\"width=1190\"> <meta name=\"apple-mobile-web-app-title\" content=\"NAVER\"/> <meta name=\"robots\" content=\"index,nofollow\"/> <meta name=\"description\" content=\"네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요\"/> <meta property=\"og:title\" content=\"네이버\"> <meta property=\"og:url\" content=\"https://www.naver.com/\"> <meta property=\"og:image\" content=\"https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png\"> <meta property=\"og:description\" content=\"네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요\"/> <meta name=\"twitter:card\" content=\"summary\"> <meta name=\"twitter:title\" content=\"\"> <meta name=\"twitter:url\" content=\"https://www.naver.com/\"> <meta name=\"twitter:image\" content=\"https://s.pstatic.net/static/www/mobile/edit/2016/0705/mobile_212852414260.png\"> <meta name=\"twitter:description\" content=\"네이버 메인에서 다양한 정보와 유용한 컨텐츠를 만나 보세요\"/>  <link rel=\"stylesheet\" href=\"https://pm.pstatic.net/dist/css/nmain.20210826.css\"> <link rel=\"stylesheet\" href=\"https://ssl.pstatic.net/sstatic/search/pc/css/sp_autocomplete_210318.css\"> <link rel=\"shortcut icon\" type=\"image/x-icon\" href=\"/favicon.ico?1\"/>  <link rel=\"apple-touch-icon\" sizes=\"114x114\" href=\"https://s.pstatic.net/static/www/u/2014/0328/mma_204243574.png\"/> <link rel=\"apple-touch-icon\" href=\"https://s.pstatic.net/static/www/u/2014/0328/mma_20432863.png\"/> <script>document.domain=\"naver.com\",window.nmain=window.nmain||{},window.nmain.supportFlicking=!1;var nsc=\"navertop.v4\",ua=navigator.userAgent;window.nmain.isIE=navigator.appName&&navigator.appName.indexOf(\"Explorer\")>0&&ua.toLocaleLowerCase().indexOf(\"msie 10.0\")<0,document.getElementsByTagName(\"html\")[0].setAttribute(\"data-useragent\",ua),window.nmain.isIE&&(Object.create=function(n){function a(){}return a.prototype=n,new a})</script> <script>var darkmode= false;window.naver_corp_da=window.naver_corp_da||{main:{}},window.naver_corp_da.main=window.naver_corp_da.main||{},window.naver_corp_da.main.darkmode=darkmode,window.gladsdk=window.gladsdk||{cmd:[]},window.gladsdk.cmd.push((function(){window.gladsdk.setHostMeta(\"theme\",darkmode?\"dark\":\"light\")}))</script> <script> window.nmain.gv = {  isLogin: false,"


using DataFrames


using XLSX:
    eachtablerow,
    readxlsx,
    writetable


function write_xlsx(name,df::DataFrame)
    path = "$name.xlsx"
    data = collect(eachcol(df))
    cols = names(df)
    writetable(path,data,cols)
end

write_xlsx (generic function with 1 method)


function write_grades_xlsx(df::DataFrame)
    path = "grades"
    write_xlsx(path,df)
    "$path.xlsx"
end

write_grades_xlsx (generic function with 1 method)


df = DataFrame(name=["sally","Bob","Alice","Hank"],grade_2020=[1.0,5.0,8.5,4.0])


path = write_grades_xlsx(df)
path

"grades.xlsx"


xf = readxlsx(path)
xf

XLSXFile("grades.xlsx") containing 1 Worksheet
            sheetname size          range        
-------------------------------------------------
               Sheet1 5x2           A1:B5


sheet = xf["Sheet1"]
sheet

5×2 XLSX.Worksheet: ["Sheet1"](A1:B5)


eachtablerow(sheet) |> DataFrame


df.name

4-element Vector{String}:
 "sally"
 "Bob"
 "Alice"
 "Hank"


df[!,:name]

4-element Vector{String}:
 "sally"
 "Bob"
 "Alice"
 "Hank"


df2 = DataFrame(id=[1])


@benchmark filter(:name=>x->x=="Alice",df)

BenchmarkTools.Trial: 
  memory estimate:  1.23 KiB
  allocs estimate:  16
  --------------
  minimum time:     1.240 μs (0.00% GC)
  median time:      1.810 μs (0.00% GC)
  mean time:        2.130 μs (3.87% GC)
  maximum time:     417.110 μs (99.24% GC)
  --------------
  samples:          10000
  evals/sample:     10


@benchmark filter(:name=> ==("Alice"),df)

BenchmarkTools.Trial: 
  memory estimate:  1.31 KiB
  allocs estimate:  19
  --------------
  minimum time:     1.260 μs (0.00% GC)
  median time:      1.570 μs (0.00% GC)
  mean time:        1.960 μs (6.87% GC)
  maximum time:     487.250 μs (99.09% GC)
  --------------
  samples:          10000
  evals/sample:     10


filter(:name => !=("Alice"),df)

df


function complex_filter(name,grade)::Bool
    i_name = startswith(name,'A') || startswith(name,'B')
    i_grade = 6 < grade
    i_name && i_grade
end

complex_filter (generic function with 1 method)


filter([:name,:grade_2020]=>complex_filter,df)


subset(df,:name=>ByRow(x->x=="Alice"))


subset(df,:name=> ByRow(==("Alice")))


function salaries()
    names = ["John","Hank","Karen","Zed"]
    salary = [1_900,2_800,2_800,missing]
    DataFrame(;names,salary)
end
salaries()


filter(:salary => > (2_000), salaries())

syntax: space before "(" not allowed in "> (" at In[148]:1

Stacktrace:
 [1] top-level scope
   @ In[148]:1
 [2] eval
   @ .\boot.jl:360 [inlined]
 [3] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base .\loading.jl:1094


subset(salaries(), :salary=>ByRow(>(2_000)))

ArgumentError: missing was returned in condition number 1 but only true or false are allowed; pass skipmissing=true to skip missing values

Stacktrace:
  [1] _and(x::Missing)
    @ DataFrames C:\julia_env\turing\packages\DataFrames\vuMM8\src\abstractdataframe\subset.jl:11
  [2] _broadcast_getindex_evalf
    @ .\broadcast.jl:648 [inlined]
  [3] _broadcast_getindex
    @ .\broadcast.jl:621 [inlined]
  [4] getindex
    @ .\broadcast.jl:575 [inlined]
  [5] macro expansion
    @ .\broadcast.jl:984 [inlined]
  [6] macro expansion
    @ .\simdloop.jl:77 [inlined]
  [7] copyto!
    @ .\broadcast.jl:983 [inlined]
  [8] copyto!
    @ .\broadcast.jl:994 [inlined]
  [9] copyto!
    @ .\broadcast.jl:936 [inlined]
 [10] copy
    @ .\broadcast.jl:908 [inlined]
 [11] materialize(bc::Base.Broadcast.Broadcasted{Base.Broadcast.DefaultArrayStyle{1}, Nothing, typeof(DataFrames._and), Tuple{Vector{Union{Missing, Bool}}}})
    @ Base.Broadcast .\broadcast.jl:883
 [12] _get_subset_conditions(df::DataFrame, ::Base.RefValue{Any}, skipmissing::Bool)
    @ DataFrames C:\julia_env\turing\packages\DataFrames\vuMM8\src\abstractdataframe\subset.jl:74
 [13] subset(df::DataFrame, args::Any; skipmissing::Bool, view::Bool)
    @ DataFrames C:\julia_env\turing\packages\DataFrames\vuMM8\src\abstractdataframe\subset.jl:173
 [14] subset(df::DataFrame, args::Any)
    @ DataFrames C:\julia_env\turing\packages\DataFrames\vuMM8\src\abstractdataframe\subset.jl:173
 [15] top-level scope
    @ In[149]:1
 [16] eval
    @ .\boot.jl:360 [inlined]
 [17] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base .\loading.jl:1094


subset(salaries(),:salary=>ByRow(>(2_000));skipmissing=true)


function responses()
    id = [1,2]
    q1 = [28,61]
    q2 = [:us,:fr]
    q3 = ["F","B"]
    q4 = ["B","C"]
    q5 = ["A","E"]
    DataFrame(;id,q1,q2,q3,q4,q5)
end
responses()


select(responses(),:id,:q1)


select(responses(), Not(:q5))


select(responses(),Not([:q4,:q5]))


select(responses(),:q5, Not(:id))


select(responses(), Not(:id))


select(responses(),:)


select(responses(),:q5,:)


select(responses(),1,:q5,:)


select(responses(),1=>"participant",:q1=>"age",:q2=>"nationality")


renames = (1=>"participant", :q1=>"age", :q2=>"nationality")
select(responses(), renames...)


function wrong_types()
    id = 1:4
    date = ["28-01-2018","03-04-2019","01-08-2018","22-11-2020"]
    age = ["adolescent","adult","infant","adult"]
    DataFrame(;id,date,age)
end
wrong_types()


sort(wrong_types(),:date)


function fix_date_column(df::DataFrame)
    string2dates(dates::Vector) = Date.(dates,dateformat"dd-mm-yyy")
    dates = string2dates(df[!,:date])
    df[!,:date] = dates
    df
end
fix_date_column(wrong_types())


df = fix_date_column(wrong_types())
sort(df,:date)


sort(wrong_types(),:age)


using CategoricalArrays


function fix_age_column(df)
    levels = ["infant","adolescent","adult"]
    ages = categorical(df[!,:age];levels,ordered=true)
    df[!,:age] = ages
    df
end
fix_age_column(wrong_types())


df = fix_age_column(wrong_types())
sort(df,:age)


function correct_types()
    df = wrong_types()
    df = fix_date_column(df)
    df = fix_age_column(df)
end
correct_types()


df = correct_types()
a = df[1,:age]
b = df[2,:age]
a < b

true


function grades_2020()
    DataFrame(;name=["Sally","Bob","Alice","Hank"],
                grade_2020=[1.0,5.0,8.5,4.0])

end
grades_2020()


function grades_2021()
    DataFrame(;name=["Bob2","Sally","Hank"],
                grade_2021=[9.5,9.5,6.0])

end
grades_2021()


innerjoin(grades_2020(),grades_2021(); on=:name)


outerjoin(grades_2020(),grades_2021();on=:name)


crossjoin(grades_2020(),grades_2021();on=:id)

MethodError: no method matching crossjoin(::DataFrame, ::DataFrame; on=:id)
Closest candidates are:
  crossjoin(::AbstractDataFrame, ::AbstractDataFrame; makeunique) at C:\julia_env\turing\packages\DataFrames\vuMM8\src\join\composer.jl:1332 got unsupported keyword argument "on"
  crossjoin(::AbstractDataFrame, ::AbstractDataFrame, ::AbstractDataFrame...; makeunique) at C:\julia_env\turing\packages\DataFrames\vuMM8\src\join\composer.jl:1343 got unsupported keyword argument "on"

Stacktrace:
 [1] kwerr(::NamedTuple{(:on,), Tuple{Symbol}}, ::Function, ::DataFrame, ::DataFrame)
   @ Base .\error.jl:157
 [2] top-level scope
   @ In[191]:1
 [3] eval
   @ .\boot.jl:360 [inlined]
 [4] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
   @ Base .\loading.jl:1094


crossjoin(grades_2020(),grades_2021())

ArgumentError: Duplicate variable names: :name. Pass makeunique=true to make them unique using a suffix automatically.

Stacktrace:
 [1] add_names(ind::DataFrames.Index, add_ind::DataFrames.Index; makeunique::Bool)
   @ DataFrames C:\julia_env\turing\packages\DataFrames\vuMM8\src\other\index.jl:323


crossjoin(grades_2020(),grades_2021();makeunique=true)


leftjoin(grades_2020(),grades_2021();on=:name)


rightjoin(grades_2020(), grades_2021(); on=:name)


leftjoin(grades_2021(),grades_2020(); on=:name)


semijoin(grades_2020(),grades_2021();on=:name)


antijoin(grades_2020(),grades_2021();on=:name)


function grades_2020()
    name = ["Sally","Bob","Alice","Hank"]
    grade_2020 = [1,5,8.5,4]
    DataFrame(;name,grade_2020)
end
grades_2020()


plus_one(grades) = grades .+ 1
transform(grades_2020(),:grade_2020=>plus_one)


transform(grades_2020(),:grade_2020=>plus_one=> :grade_2020)


transform(grades_2020(),:grade_2020=>plus_one; renamecols=false)


select(grades_2020(),:,:grade_2020=>plus_one)


select(grades_2020(),:,:grade_2020=>plus_one=>:grade_2020)


df = grades_2020()
df.grade_2020 = plus_one.(df.grade_2020)
df


leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)


pass(A,B) = [5.5 < a || 5.5 < b for (a,b) in zip(A,B)]
transform(leftjoined,[:grade_2020,:grade_2021]=>pass; renamecols=false)


function only_pass()
    leftjoined = leftjoin(grades_2020(), grades_2021();on=:name)
    pass(A,B) = [5.5 < a || 5.5 < b for (a,b) in zip(A,B)]
    leftjoined = transform(leftjoined,[:grade_2020,:grade_2021]=>pass=>:pass)
    passed = subset(leftjoined,:pass; skipmissing=true)
    return passed.name
end
only_pass()

3-element Vector{String}:
 "Sally"
 "Hank"
 "Alice"


function all_grades()
    df1 = grades_2020()
    df1 = select(df1,:name,:grade_2020=>:grade)
    df2 = grades_2021()
    df2 = select(df2,:name,:grade_2021=>:grade)
    rename_bob2(data_col) = replace.(data_col,"Bob2" => "Bob")
    df2 = transform(df2, :name => rename_bob2 => :name)
    return vcat(df1,df2)
end
all_grades()


collect(groupby(all_grades(),:name))

4-element Vector{Any}:
 2×2 SubDataFrame
 Row │ name    grade   
     │ String  Float64 
─────┼─────────────────
   1 │ Sally       1.0
   2 │ Sally       9.5
 2×2 SubDataFrame
 Row │ name    grade   
     │ String  Float64 
─────┼─────────────────
   1 │ Bob         5.0
   2 │ Bob         9.5
 1×2 SubDataFrame
 Row │ name    grade   
     │ String  Float64 
─────┼─────────────────
   1 │ Alice       8.5
 2×2 SubDataFrame
 Row │ name    grade   
     │ String  Float64 
─────┼─────────────────
   1 │ Hank        4.0
   2 │ Hank        6.0


using Statistics


gdf = groupby(all_grades(),:name)
combine(gdf,:grade=>mean)


group = [:A,:A,:B,:B]
X = 1:4
Y = 5:8
df = DataFrame(;group,X,Y)


gdf = groupby(df,:group)
combine(gdf,[:X,:Y] .=> mean; renamecols=false)


gdf = groupby(df,:group)
rounded_mean(data_col) = round(Int, mean(data_col))
combine(gdf,[:X,:Y] .=> rounded_mean; renamecols=false)


df = responses()
@allocated select(df,:id,:q1)

7648


df = responses()
@allocated select!(df,:id,:q1)

7360


@allocated col = df[:,:id]

96


@allocated col = df[!,:id]

0

Julia Data Science – DataFrames.jl¶

Subset¶

Select¶

use "splat" operator¶

Types and Missing Data¶

innerjoin¶

Variable Transformations¶

Multiple Transformations¶

Groupbuy and Combine¶

Multiple Source Columns¶

Performance¶

	id	date	age
	Int64	String	String
1	1	28-01-2018	adolescent
2	2	03-04-2019	adult
3	3	01-08-2018	infant
4	4	22-11-2020	adult

	id	date	age
	Int64	Date	String
1	1	2018-01-28	adolescent
2	2	2019-04-03	adult
3	3	2018-08-01	infant
4	4	2020-11-22	adult

	id	date	age
	Int64	String	Cat…
1	1	28-01-2018	adolescent
2	2	03-04-2019	adult
3	3	01-08-2018	infant
4	4	22-11-2020	adult

	name	grade_2020	grade_2021
	String	Float64?	Float64?
1	Sally	1.0	9.5
2	Hank	4.0	6.0
3	Bob	5.0	missing
4	Alice	8.5	missing
5	Bob2	missing	9.5

	name	grade_2020	name_1	grade_2021
	String	Float64	String	Float64
1	Sally	1.0	Bob2	9.5
2	Sally	1.0	Sally	9.5
3	Sally	1.0	Hank	6.0
4	Bob	5.0	Bob2	9.5
5	Bob	5.0	Sally	9.5
6	Bob	5.0	Hank	6.0
7	Alice	8.5	Bob2	9.5
8	Alice	8.5	Sally	9.5
9	Alice	8.5	Hank	6.0
10	Hank	4.0	Bob2	9.5
11	Hank	4.0	Sally	9.5
12	Hank	4.0	Hank	6.0

	names	salary
	String	Int64?
1	John	1900
2	Hank	2800
3	Karen	2800
4	Zed	missing

	names	salary
	String	Int64?
1	Hank	2800
2	Karen	2800

	participant	age	nationality
	Int64	Int64	Symbol
1	1	28	us
2	2	61	fr

	participant	age	nationality
	Int64	Int64	Symbol
1	1	28	us
2	2	61	fr