class: center, middle, inverse, title-slide .title[ # Databases with R (part 2) ] .subtitle[ ##
https://pos.it/databases-24
] .author[ ### Kirill Müller ] .author[ ### cynkra GmbH ] .date[ ### August 12, 2024 ] --- <style type="text/css"> .pull-left { margin-top: -25px; } .pull-right { margin-top: -25px; } .r.hljs.remark-code { font-size: 20px; } .remark-code { font-size: 20px; } .fansi.fansi-error { color: #D70040; } pre { white-space: pre-wrap; /* css-3 */ white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ white-space: -pre-wrap; /* Opera 4-6 */ white-space: -o-pre-wrap; /* Opera 7 */ word-wrap: break-word; /* Internet Explorer 5.5+ */ margin-bottom: 0px; } .reg-margin-bottom { margin-bottom: 25px; } .remark-slide-content { padding-top: 0px; padding-bottom: 0px; } .remark-slide-scaler { overflow-y: auto; } .font17 { font-size: 17px; } .font14 { font-size: 14px; } .script-number-font { color: inherit !important; opacity: 0.9 !important; transform: scale(1.8); transform-origin: top left; } .script-number { color: inherit !important; opacity: 0.9 !important; position: absolute; top: 12px; right: 20px; transform: scale(1.8); transform-origin: top right; } .exercise-timer { position: fixed; bottom: 12px; left: 20px; transform: scale(1.8); transform-origin: bottom left; } .exercise-timer img { max-width: none; /* Prevent the image from being constrained by the container */ } </style> # Learning goals <img src="data:image/png;base64,#databases-2_files/figure-html/fig-learn-duckdb-1.png" style="display: block; margin: auto;" /> --- class: middle <img src="data:image/png;base64,#images/triangle.svg" width="70%" height="70%" style="display: block; margin: auto;" /> --- background-image: url("data:image/png;base64,#images/21.webp") background-size: contain background-position: 100% 100% # Working with database dumps .pull-left[ - Single file - Partitioned dataset - Parquet - CSV <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("duckdb") requireNamespace("duckplyr") ``` ] --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: contain background-position: 100% 100% # Working with database dumps .pull-left[ - Single file - Partitioned dataset - Parquet - CSV <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("duckdb") requireNamespace("duckplyr") ``` ] --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Prepare dump Imagine someone gave you a Parquet file. ``` r arrow::write_parquet(nycflights13::flights, "flights.parquet") fs::file_size("flights.parquet") ``` ``` 5.43M ``` ``` r lobstr::obj_size(nycflights13::flights) ``` ``` 40.65 MB ``` <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Processing the local data ``` r df <- arrow::read_parquet("flights.parquet") df ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 336,776 × 19</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_t…¹</span> <span style='font-weight: bold;'>sched…²</span> <span style='font-weight: bold;'>dep_d…³</span> <span style='font-weight: bold;'>arr_t…⁴</span> <span style='font-weight: bold;'>sched…⁵</span> <span style='font-weight: bold;'>arr_d…⁶</span> <span style='font-weight: bold;'>carrier</span> <span style='font-weight: bold;'>flight</span> <span style='font-weight: bold;'>tailnum</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 517 515 2 830 819 11 UA <span style='text-decoration: underline;'>1</span>545 N14228 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 533 529 4 850 830 20 UA <span style='text-decoration: underline;'>1</span>714 N24211 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 542 540 2 923 850 33 AA <span style='text-decoration: underline;'>1</span>141 N619AA <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='text-decoration: underline;'>1</span>004 <span style='text-decoration: underline;'>1</span>022 -<span style='color: #BB0000;'>18</span> B6 725 N804JB <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 600 -<span style='color: #BB0000;'>6</span> 812 837 -<span style='color: #BB0000;'>25</span> DL 461 N668DN <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 558 -<span style='color: #BB0000;'>4</span> 740 728 12 UA <span style='text-decoration: underline;'>1</span>696 N39463 <span style='color: #949494;'># ℹ 336,770 more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹dep_time, ²sched_dep_time, ³dep_delay, ⁴arr_time, …</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read as Arrow dataset .pull-left[ ``` r ds <- arrow::open_dataset("flights.parquet") ds ``` ``` FileSystemDataset with 1 Parquet file 19 columns year: int32 month: int32 day: int32 dep_time: int32 sched_dep_time: int32 dep_delay: double arr_time: int32 sched_arr_time: int32 arr_delay: double carrier: string flight: int32 tailnum: string origin: string dest: string air_time: double distance: double hour: double minute: double time_hour: timestamp[us] ``` ] .pull-right[ ``` r ds |> count(year, month, day) |> collect() ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 359 more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Register as duckdb lazy table .pull-left[ ``` r con_memory <- dbConnect( duckdb::duckdb(), dbdir = ":memory:") tbl <- duckdb::tbl_file( con_memory, "flights.parquet") tbl |> explain() ``` ``` <SQL> FROM 'flights.parquet' <PLAN> physical_plan ┌───────────────────────────┐ │ PARQUET_SCAN │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 336776 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r tbl |> count(year, month, day) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 4]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Register as duckplyr lazy data frame .reg-margin-bottom[ ``` r duckplyr_df <- duckplyr::duckplyr_df_from_parquet("flights.parquet") class(duckplyr_df) ``` ``` [1] "duckplyr_df" "tbl_df" "tbl" "data.frame" ``` ] .pull-left[ ``` r duckplyr_df |> explain() ``` ``` ┌───────────────────────────┐ │ READ_PARQUET │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 336776 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r duckplyr_df ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 336,776 × 19</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_time</span> <span style='font-weight: bold;'>sched_d…¹</span> <span style='font-weight: bold;'>dep_d…²</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 517 515 2 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 533 529 4 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 542 540 2 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 600 -<span style='color: #BB0000;'>6</span> <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 558 -<span style='color: #BB0000;'>4</span> <span style='color: #949494;'># ℹ 336,770 more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹sched_dep_time, …</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # The future: Register as duckplyr lazy data frame .pull-left[ ``` r duckplyr_df |> count(year, month, day) |> filter(month == 1) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 31 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 25 more rows</span> </CODE></PRE> ] .pull-right[ ``` r duckplyr_df |> count(year, month, day) |> filter(month == 1) |> explain() ``` ``` ┌───────────────────────────┐ │ ORDER_BY │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ ORDERS: │ │dataframe_105553178666960_7│ │ 62464429."year" ASC │ │dataframe_105553178666960_7│ │ 62464429."month" ASC │ │dataframe_105553178666960_7│ │ 62464429."day" ASC │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ PROJECTION │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ n │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ HASH_GROUP_BY │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ #0 │ │ #1 │ │ #2 │ │ count_star() │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ PROJECTION │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ FILTER │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ r_base::==(month, 1.0) │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 67355 │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ R_DATAFRAME_SCAN │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ data.frame │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 336776 │ └───────────────────────────┘ ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 450px # Prepare partitioned dump .pull-left[ Imagine someone gave you a bunch of Parquet files. ``` r arrow::write_dataset( nycflights13::flights, "flights-part/", partitioning = c("year", "month") ) fs::dir_tree("flights-part") ``` ] .pull-right[ <PRE class="fansi fansi-output"><CODE><span style='color: #00BBBB; font-weight: bold;'>flights-part</span> └── <span style='color: #00BBBB; font-weight: bold;'>year=2013</span> ├── <span style='color: #00BBBB; font-weight: bold;'>month=1</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=10</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=11</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=12</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=2</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=3</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=4</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=5</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=6</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=7</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=8</span> │ └── part-0.parquet └── <span style='color: #00BBBB; font-weight: bold;'>month=9</span> └── part-0.parquet </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read partitioned data ``` r tbl_part <- duckdb::tbl_query( con_memory, "read_parquet('flights-part/*/*/*.parquet', hive_partitioning = true)" ) tbl_part ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 19]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_time</span> <span style='font-weight: bold;'>sched_dep…¹</span> <span style='font-weight: bold;'>dep_d…²</span> <span style='font-weight: bold;'>arr_t…³</span> <span style='font-weight: bold;'>sched…⁴</span> <span style='font-weight: bold;'>arr_d…⁵</span> <span style='font-weight: bold;'>carrier</span> <span style='font-weight: bold;'>flight</span> <span style='font-weight: bold;'>tailnum</span> <span style='font-weight: bold;'>origin</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> 1 517 515 2 830 819 11 UA <span style='text-decoration: underline;'>1</span>545 N14228 EWR <span style='color: #BCBCBC;'>2</span> 1 533 529 4 850 830 20 UA <span style='text-decoration: underline;'>1</span>714 N24211 LGA <span style='color: #BCBCBC;'>3</span> 1 542 540 2 923 850 33 AA <span style='text-decoration: underline;'>1</span>141 N619AA JFK <span style='color: #BCBCBC;'>4</span> 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='text-decoration: underline;'>1</span>004 <span style='text-decoration: underline;'>1</span>022 -<span style='color: #BB0000;'>18</span> B6 725 N804JB JFK <span style='color: #BCBCBC;'>5</span> 1 554 600 -<span style='color: #BB0000;'>6</span> 812 837 -<span style='color: #BB0000;'>25</span> DL 461 N668DN LGA <span style='color: #BCBCBC;'>6</span> 1 554 558 -<span style='color: #BB0000;'>4</span> 740 728 12 UA <span style='text-decoration: underline;'>1</span>696 N39463 EWR <span style='color: #949494;'># ℹ more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, …</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Prepare CSV dump Imagine someone gave you a CSV file. ``` r readr::write_csv(nycflights13::flights, "flights.csv") fs::file_size("flights.csv") ``` ``` 29.6M ``` ``` r lobstr::obj_size(nycflights13::flights) ``` ``` 40.65 MB ``` <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read CSV data with duckdb .pull-left[ ``` r tbl_csv <- duckdb::tbl_file( con_memory, "flights.csv") tbl_csv |> explain() ``` ``` <SQL> FROM 'flights.csv' <PLAN> physical_plan ┌───────────────────────────┐ │ READ_CSV_AUTO │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 326882 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r tbl_csv |> count(year, month, day) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 4]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 7 11 <span style='text-decoration: underline;'>1</span>006 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 7 15 999 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 7 16 996 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 7 22 <span style='text-decoration: underline;'>1</span>000 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 7 26 999 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 7 31 <span style='text-decoration: underline;'>1</span>001 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read CSV data with duckplyr .pull-left[ ``` r duckplyr_df_csv <- duckplyr::duckplyr_df_from_csv("flights.csv") class(duckplyr_df_csv) ``` ``` [1] "duckplyr_df" "tbl_df" "tbl" [4] "data.frame" ``` ``` r duckplyr_df_csv |> explain() ``` ``` ┌───────────────────────────┐ │ READ_CSV_AUTO │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 326882 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r duckplyr_df_csv |> count(year, month, day) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 359 more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Create derived Parquet data with duckplyr .pull-left[ ``` r duckplyr_df_csv |> count(year, month, day) |> duckplyr::df_to_parquet( "flights-count.parquet") fs::file_size("flights-count.parquet") ``` ``` 2.13K ``` ] .pull-right[ ``` r duckplyr::duckplyr_df_from_parquet( "flights-count.parquet") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 359 more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: contain background-position: 100% 100% .pull-left[ # Working with database dumps: Exercises 1. From the Parquet file, compute two lazy dbplyr tables showing the mean and median delay for each month. 2. Compute the same data as duckplyr lazy data frames. 3. Store this data as a Parquet file. 4. Read the Parquet file and plot the data. ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> <div class="exercise-timer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="84" height="20" role="img" aria-label="Timer: 10:00"><title>Timer: 10:00</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="84" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="41" height="20" fill="#555"/><rect x="41" width="43" height="20" fill="#007ec6"/><rect width="84" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="215" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="310">Timer</text><text x="215" y="140" transform="scale(.1)" fill="#fff" textLength="310">Timer</text><text aria-hidden="true" x="615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">10:00</text><text x="615" y="140" transform="scale(.1)" fill="#fff" textLength="330">10:00</text></g></svg></div> --- # DuckDB showcase .middle[ <img src="data:image/png;base64,#images/hub.svg" width="130%" height="130%" style="display: block; margin: auto;" /> ] --- # Application: DBI + dbplyr and pivoting .pull-left[ ``` r con_memory <- dbConnect( duckdb::duckdb(), dbdir = ":memory:") tbl <- duckdb::tbl_file( con_memory, "flights.parquet") daily_flights_by_dest <- tbl |> count(year, month, day, dest) ``` ] .pull-right[ ``` r daily_flights_by_dest ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 5]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dest</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 6 30 BTV 9 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 6 30 ORD 49 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 6 30 LAS 18 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 6 30 CLE 16 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 6 30 SFO 39 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 6 30 HNL 2 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting .pull-left[ ``` r daily_flights_by_dest_sql <- daily_flights_by_dest |> dbplyr::sql_render() daily_flights_by_dest_sql ``` ``` <SQL> SELECT "year", "month", "day", dest, COUNT(*) AS n FROM (FROM 'flights.parquet') q01 GROUP BY "year", "month", "day", dest ``` ] .pull-right[ ``` r pivot_sql <- paste0( "PIVOT (", daily_flights_by_dest_sql, ") ON dest USING SUM(n)" ) SQL(pivot_sql) ``` ``` <SQL> PIVOT (SELECT "year", "month", "day", dest, COUNT(*) AS n FROM (FROM 'flights.parquet') q01 GROUP BY "year", "month", "day", dest) ON dest USING SUM(n) ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting ``` r as_tibble(dbGetQuery(con_memory, pivot_sql)) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 108</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>ABQ</span> <span style='font-weight: bold;'>ACK</span> <span style='font-weight: bold;'>ALB</span> <span style='font-weight: bold;'>ANC</span> <span style='font-weight: bold;'>ATL</span> <span style='font-weight: bold;'>AUS</span> <span style='font-weight: bold;'>AVL</span> <span style='font-weight: bold;'>BDL</span> <span style='font-weight: bold;'>BGR</span> <span style='font-weight: bold;'>BHM</span> <span style='font-weight: bold;'>BNA</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 3 <span style='color: #BB0000;'>NA</span> 40 6 1 2 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 10 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 3 <span style='color: #BB0000;'>NA</span> 51 7 1 2 <span style='color: #BB0000;'>NA</span> 1 11 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 49 6 <span style='color: #BB0000;'>NA</span> 1 <span style='color: #BB0000;'>NA</span> 1 11 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 48 6 <span style='color: #BB0000;'>NA</span> 1 <span style='color: #BB0000;'>NA</span> 1 11 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 35 4 <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 8 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 41 5 <span style='color: #BB0000;'>NA</span> 1 <span style='color: #BB0000;'>NA</span> 1 13 <span style='color: #949494;'># ℹ 359 more rows</span> <span style='color: #949494;'># ℹ 94 more variables: </span><span style='color: #949494; font-weight: bold;'>BOS</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BQN</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BTV</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BUF</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BUR</span><span style='color: #949494;'> <dbl>, …</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting .pull-left[ ``` r system.time( as_tibble(dbGetQuery( con_memory, pivot_sql)) ) ``` ``` user system elapsed 0.083 0.003 0.041 ``` ] .pull-right[ ``` r system.time( nycflights13::flights |> count(year, month, day, dest) |> tidyr::pivot_wider( names_from = dest, values_from = n) ) ``` ``` user system elapsed 0.149 0.001 0.150 ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting ``` r write_pivot_sql <- paste0( "COPY (", pivot_sql, ") TO 'pivot.parquet' (FORMAT PARQUET)" ) dbExecute(con_memory, write_pivot_sql) ``` ``` [1] 365 ``` ``` r q_unpivot_dyn <- "(SELECT * FROM (UNPIVOT 'pivot.parquet' ON COLUMNS(* EXCLUDE (year, month, day)) INTO NAME dest VALUE n))" SQL(q_unpivot_dyn) ``` ``` <SQL> (SELECT * FROM (UNPIVOT 'pivot.parquet' ON COLUMNS(* EXCLUDE (year, month, day)) INTO NAME dest VALUE n)) ``` <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting ``` r tbl(con_memory, from = q_unpivot_dyn) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 5]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dest</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 ALB 3 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 ATL 40 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 AUS 6 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 AVL 1 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 BDL 2 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 BNA 10 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: contain background-position: 100% 100% .pull-left[ # DuckDB showcase: Breakout Discuss in groups of 4-6 people: - How often is the data you work with updated? Realtime, daily, monthly, never? - Does the size of the data you are working with justify the overhead of working with local database dumps? - Could the proposed workflow be implemented in your current projects? What constraints would you face? ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> <div class="exercise-timer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="84" height="20" role="img" aria-label="Timer: 10:00"><title>Timer: 10:00</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="84" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="41" height="20" fill="#555"/><rect x="41" width="43" height="20" fill="#007ec6"/><rect width="84" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="215" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="310">Timer</text><text x="215" y="140" transform="scale(.1)" fill="#fff" textLength="310">Timer</text><text aria-hidden="true" x="615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">10:00</text><text x="615" y="140" transform="scale(.1)" fill="#fff" textLength="330">10:00</text></g></svg></div> --- class: middle <img src="data:image/png;base64,#images/triangle.svg" width="70%" height="70%" style="display: block; margin: auto;" /> --- background-image: url("data:image/png;base64,#images/22.webp") background-size: 40% background-position: 100% 100% # Full circle .pull-left[ - Dump as a single file - Dump as a partitioned dataset <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("arrow") ``` ] --- background-image: url("data:image/png;base64,#images/22-frame.webp") background-size: 40% background-position: 100% 100% # Full circle .pull-left[ - Dump as a single file - Dump as a partitioned dataset <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("arrow") ``` ] --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 450px # Prepare database .pull-left[ Imagine someone has set up a database for you. ``` r con_rw <- dbConnect( duckdb::duckdb(), dbdir = "flights.duckdb" ) copy_to( con_rw, nycflights13::flights, name = "flights", temporary = FALSE ) dbDisconnect(con_rw) ``` ] .pull-right[ ``` r con <- dbConnect( duckdb::duckdb(), dbdir = "flights.duckdb", read_only = TRUE ) flights_duckdb <- tbl(con, "flights") flights_duckdb ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: table<flights> [?? x 19]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_time</span> <span style='font-weight: bold;'>sched_d…¹</span> <span style='font-weight: bold;'>dep_d…²</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 517 515 2 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 533 529 4 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 542 540 2 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 600 -<span style='color: #BB0000;'>6</span> <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 558 -<span style='color: #BB0000;'>4</span> <span style='color: #949494;'># ℹ more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹sched_dep_time, …</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Method 1: via local data frame .pull-left[ ``` r flights_duckdb |> filter(month == 1) |> collect() |> duckplyr::df_to_parquet( "flights-jan.parquet") ``` ] .pull-right[ ``` r flights_duckdb |> collect() |> duckplyr::df_to_parquet("flights.parquet") ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Method 2: via DBI .pull-left[ ## Setup ``` r sql_jan <- flights_duckdb |> filter(month == 1) |> dbplyr::sql_render() sql_jan ``` ``` <SQL> SELECT flights.* FROM flights WHERE ("month" = 1.0) ``` ``` r fs::dir_create("flights-arrow") ``` ] .pull-right[ ``` r res <- dbSendQuery(con, sql_jan) i <- 0 repeat { df <- dbFetch(res, n = 10000) if (nrow(df) == 0) break path <- fs::path( "flights-arrow", sprintf("part-%05d.parquet", i)) duckplyr::df_to_parquet(df, path) i <- i + 1 } dbClearResult(res) ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Method 3: via parquetize ``` r parquetize::dbi_to_parquet( con, sql_jan, "flights-parquetized", max_rows = 10000 ) fs::dir_tree("flights-parquetized/") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BBBB; font-weight: bold;'>flights-parquetized/</span> ├── part-1-10000.parquet ├── part-10001-20000.parquet └── part-20001-27004.parquet </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Method 4: via DBI and arrow .pull-left[ ``` r con_adbi <- dbConnect( adbi::adbi(duckdb::duckdb_adbc()), path = "flights.duckdb" ) sql <- "SELECT * FROM flights" ``` ] .pull-right[ ``` r system.time({ res <- dbSendQueryArrow(con_adbi, sql) stream <- dbFetchArrow(res) arrow::write_dataset( arrow::as_record_batch_reader(stream), "flights-adbi/", max_rows_per_file = 10000 ) dbClearResult(res) }) ``` ``` user system elapsed 0.372 0.159 0.175 ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Adding partitions to a dataset .pull-left[ ``` r write_month <- function(month) { sql <- flights_duckdb |> filter(month == !!month) |> dbplyr::sql_render() dir <- fs::path( "flights-part-manual", sprintf("month=%d", month)) fs::dir_create(dir) df <- dbGetQuery(con, sql) duckplyr::df_to_parquet( df, fs::path(dir, "part-0.parquet")) } ``` ] .pull-right[ ``` r write_month(1) write_month(2) write_month(3) fs::dir_tree("flights-part-manual") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BBBB; font-weight: bold;'>flights-part-manual</span> ├── <span style='color: #00BBBB; font-weight: bold;'>month=1</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=2</span> │ └── part-0.parquet └── <span style='color: #00BBBB; font-weight: bold;'>month=3</span> └── part-0.parquet </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/22-frame.webp") background-size: 40% background-position: 100% 100% # Full circle: Exercises .pull-left[ Write code to create a partitioned dataset with the `flights` table, partitioned by `origin`. - Hint: The dataset only contains flights departing from New York City airports. <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> <div class="exercise-timer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="84" height="20" role="img" aria-label="Timer: 10:00"><title>Timer: 10:00</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="84" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="41" height="20" fill="#555"/><rect x="41" width="43" height="20" fill="#007ec6"/><rect width="84" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="215" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="310">Timer</text><text x="215" y="140" transform="scale(.1)" fill="#fff" textLength="310">Timer</text><text aria-hidden="true" x="615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">10:00</text><text x="615" y="140" transform="scale(.1)" fill="#fff" textLength="330">10:00</text></g></svg></div> ] --- class: inverse, middle, center # https://posit-conf-2024.github.io/databases/part-3.html