5  Databricks Connect

5.1 Connect to Databricks Connect cluster

Copy a simple table into the Spark session

library(sparklyr)
library(dplyr)
sc <- spark_connect(method = "databricks_connect")

5.2 Uploading data from R

  1. Load the nycflights13 library
library(nycflights13)
  1. Use copy_to() to upload planes to your Spark session. Assign to a variable called tbl_planes
tbl_planes <- copy_to(sc, planes)
  1. Use glimpse() to preview the data
tbl_planes |> 
  glimpse()
#> Rows: ??
#> Columns: 9
#> Database: spark_connection
#> $ tailnum      <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW…
#> $ year         <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 199…
#> $ type         <chr> "Fixed wing multi engine", "Fixed wing multi engine", "Fi…
#> $ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE", "AIRBU…
#> $ model        <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145…
#> $ engines      <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
#> $ seats        <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 5…
#> $ speed        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ engine       <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turb…
  1. Use show_query() to see how Spark refers to the data you just uploaded
tbl_planes |> 
  show_query()
#> <SQL>
#> SELECT *
#> FROM `planes`

5.3 Caching data

Create a table from elements of another table

  1. Using tbl(), create a reference to the “diamonds” table. Assign it to a variable called tbl_diamonds
tbl_diamonds <- tbl(sc, "diamonds")
  1. Select the “cut”, “color”, “clarity” and “price” fields from tbl_diamonds and assign to a variable called tbl_temp
tbl_temp <- tbl_diamonds |> 
  select(cut, color, clarity, price)
  1. Use show_query() to see the underlying query of tbl_temp
tbl_temp |> 
  show_query()
#> <SQL>
#> SELECT `cut`, `color`, `clarity`, `price`
#> FROM `diamonds`
  1. Pass tbl_temp to compute(), and reassign the operation to tbl_temp
tbl_temp <- tbl_temp |> 
  compute()
  1. Use show_query() again to see the new underlying query of tbl_temp
tbl_temp  |> 
  show_query()
#> <SQL>
#> SELECT *
#> FROM `table_2079291d_ef80_4ad4_bb2c_7f8d9dc942ef`
  1. Preview tbl_temp
tbl_temp 
#> # Source:   table<`table_2079291d_ef80_4ad4_bb2c_7f8d9dc942ef`> [?? x 4]
#> # Database: spark_connection
#>    cut       color clarity price
#>    <chr>     <chr> <chr>   <int>
#>  1 Ideal     E     SI2       326
#>  2 Premium   E     SI1       326
#>  3 Good      E     VS1       327
#>  4 Premium   I     VS2       334
#>  5 Good      J     SI2       335
#>  6 Very Good J     VVS2      336
#>  7 Very Good I     VVS1      336
#>  8 Very Good H     SI1       337
#>  9 Fair      E     VS2       337
#> 10 Very Good H     VS1       338
#> # ℹ more rows

5.4 Reading files

Upload a CSV file that is located inside Databricks, into your Spark session

  1. Use spark_read_csv() to upload the airports CSV file to Spark. The path should be /Volumes/workshops/nycflights/2013/airports.csv, and name should be airports_csv. Assign it to a variable called tbl_airports
tbl_airports <- spark_read_csv(
  sc = sc,
  name = "airports_csv",
  path = "/Volumes/workshops/nycflights/2013/airports.csv"
  )
  1. Preview tbl_airports
tbl_airports
#> # Source:   table<`airports_csv`> [?? x 8]
#> # Database: spark_connection
#>    faa   name                             lat    lon   alt    tz dst   tzone    
#>    <chr> <chr>                          <dbl>  <dbl> <int> <int> <chr> <chr>    
#>  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/…
#>  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/…
#>  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/…
#>  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/…
#>  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/…
#>  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/…
#>  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/…
#>  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/…
#>  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/…
#> 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/…
#> # ℹ more rows
  1. Pass tbl_airports to show_query() to see the underlying query
tbl_airports |> 
  show_query()
#> <SQL>
#> SELECT *
#> FROM `airports_csv`
  1. Use sdf_sql() to access the top 10 rows of “airports_csv”
sdf_sql(sc, "select * from airports_csv limit 10")
#> # Source:   table<`sparklyr_tmp_table_7edd58f9_2813_4a80_a191_0ddf254d4e05`> [10 x 8]
#> # Database: spark_connection
#>    faa   name                             lat    lon   alt    tz dst   tzone    
#>    <chr> <chr>                          <dbl>  <dbl> <int> <int> <chr> <chr>    
#>  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/…
#>  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/…
#>  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/…
#>  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/…
#>  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/…
#>  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/…
#>  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/…
#>  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/…
#>  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/…
#> 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/…

5.5 “Mapping” files

Map the flights data without caching the data

  1. Use spark_read_csv() to upload the airports CSV file to Spark. The path should be /Volumes/workshops/nycflights/2013/nycflights.csv, and name should be mapped. Important, make sure to set memory to false. Assign it to a variable called flights_mapped
flights_mapped <- spark_read_csv(
  sc = sc,
  name = "mapped",
  path = "/Volumes/workshops/nycflights/2013/nycflights.csv",
  memory = FALSE
  )
  1. Use glimpse() to preview the data from flights_mapped
flights_mapped |> 
  glimpse()
#> Rows: ??
#> Columns: 16
#> Database: spark_connection
#> $ year      <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
#> $ month     <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10…
#> $ day       <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, …
#> $ dep_time  <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940…
#> $ dep_delay <int> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, …
#> $ arr_time  <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, …
#> $ arr_delay <int> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, …
#> $ carrier   <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", …
#> $ tailnum   <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", …
#> $ flight    <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, …
#> $ origin    <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA…
#> $ dest      <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA…
#> $ air_time  <int> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,…
#> $ distance  <int> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,…
#> $ hour      <int> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6…
#> $ minute    <int> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24…
  1. Run a quick count by the carrier field against flights_mapped
flights_mapped |> 
  count(carrier)
#> # Source:   SQL [?? x 2]
#> # Database: spark_connection
#>    carrier     n
#>    <chr>   <dbl>
#>  1 UA       5770
#>  2 AA       3188
#>  3 EV       5142
#>  4 B6       5376
#>  5 DL       4751
#>  6 OO          3
#>  7 F9         69
#>  8 YV         53
#>  9 US       2015
#> 10 MQ       2507
#> # ℹ more rows

5.6 Partial cache

Cache a section of the flights data into Spark

  1. Filter flights_mapped to only show flights destined to Orlando (code “ORD”)
flights_mapped |> 
  filter(dest == "ORD")
#> # Source:   SQL [?? x 16]
#> # Database: spark_connection
#>     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
#>    <int> <int> <int>    <int>     <int>    <int>     <int> <chr>   <chr>  
#>  1  2013     1     1     1817        -3     2008         3 AA      N3AXAA 
#>  2  2013    10    21      859        -1     1036        11 UA      N57852 
#>  3  2013     9    18      933        -7     1037       -43 AA      N514AA 
#>  4  2013     7    25      752        -3      917         2 MQ      N6EAMQ 
#>  5  2013     7     1     2007        82     2123        53 AA      N547AA 
#>  6  2013     9    29     2003        -2     2108       -22 MQ      N509MQ 
#>  7  2013     4    15      625        -5      752       -13 AA      N589AA 
#>  8  2013     8    24     1017        12     1113        -7 MQ      N517MQ 
#>  9  2013     9    20     1948       -12     2112       -20 UA      N814UA 
#> 10  2013     1    25     1814        54     2020        65 AA      N3CXAA 
#> # ℹ more rows
#> # ℹ 7 more variables: flight <int>, origin <chr>, dest <chr>, air_time <int>,
#> #   distance <int>, hour <int>, minute <int>
  1. Assign the previous operation to a variable called flights_ord
flights_ord <- flights_mapped |> 
  filter(dest == "ORD")
  1. Pass flights_ord to compute(), and reassign to flights_ord
flights_ord <- flights_ord |> 
  compute()
  1. Preview flights_ord
flights_ord
#> # Source:   table<`table_6c908dd8_b420_4046_959b_b74d79da8e42`> [?? x 16]
#> # Database: spark_connection
#>     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
#>    <int> <int> <int>    <int>     <int>    <int>     <int> <chr>   <chr>  
#>  1  2013     1     1     1817        -3     2008         3 AA      N3AXAA 
#>  2  2013    10    21      859        -1     1036        11 UA      N57852 
#>  3  2013     9    18      933        -7     1037       -43 AA      N514AA 
#>  4  2013     7    25      752        -3      917         2 MQ      N6EAMQ 
#>  5  2013     7     1     2007        82     2123        53 AA      N547AA 
#>  6  2013     9    29     2003        -2     2108       -22 MQ      N509MQ 
#>  7  2013     4    15      625        -5      752       -13 AA      N589AA 
#>  8  2013     8    24     1017        12     1113        -7 MQ      N517MQ 
#>  9  2013     9    20     1948       -12     2112       -20 UA      N814UA 
#> 10  2013     1    25     1814        54     2020        65 AA      N3CXAA 
#> # ℹ more rows
#> # ℹ 7 more variables: flight <int>, origin <chr>, dest <chr>, air_time <int>,
#> #   distance <int>, hour <int>, minute <int>
  1. Pass flights_ord to show_query() to see the new query
flights_ord |> 
  show_query()
#> <SQL>
#> SELECT *
#> FROM `table_6c908dd8_b420_4046_959b_b74d79da8e42`

5.7 End game

Prepare a working data set using several of the techniques covered in this unit

  1. From tbl_airports, select “faa” and “name”. Rename the variables to “dest” and “dest_name” respectively. Assign it to a variable called dest_airports
dest_airports <- tbl_airports |> 
  select(dest = faa, dest_name = name)
  1. Preview dest_airports
dest_airports
#> # Source:   SQL [?? x 2]
#> # Database: spark_connection
#>    dest  dest_name                     
#>    <chr> <chr>                         
#>  1 04G   Lansdowne Airport             
#>  2 06A   Moton Field Municipal Airport 
#>  3 06C   Schaumburg Regional           
#>  4 06N   Randall Airport               
#>  5 09J   Jekyll Island Airport         
#>  6 0A9   Elizabethton Municipal Airport
#>  7 0G6   Williams County Airport       
#>  8 0G7   Finger Lakes Regional Airport 
#>  9 0P2   Shoestring Aviation Airfield  
#> 10 0S9   Jefferson County Intl         
#> # ℹ more rows
  1. From tbl_airports, select “faa” and “name”. Rename the variables to “origin” and “origin_name” respectively. Assign it to a variable called origin_airports
origin_airports <- tbl_airports |> 
  select(origin = faa, origin_name = name)
  1. Preview origin_airports
origin_airports
#> # Source:   SQL [?? x 2]
#> # Database: spark_connection
#>    origin origin_name                   
#>    <chr>  <chr>                         
#>  1 04G    Lansdowne Airport             
#>  2 06A    Moton Field Municipal Airport 
#>  3 06C    Schaumburg Regional           
#>  4 06N    Randall Airport               
#>  5 09J    Jekyll Island Airport         
#>  6 0A9    Elizabethton Municipal Airport
#>  7 0G6    Williams County Airport       
#>  8 0G7    Finger Lakes Regional Airport 
#>  9 0P2    Shoestring Aviation Airfield  
#> 10 0S9    Jefferson County Intl         
#> # ℹ more rows
  1. Select “tailnum”, “dest”, “origin”, and “distance” from flights_mapped. Assign it to a variable called flights_select
flights_select <- flights_mapped |> 
  select(tailnum, dest, origin, distance) 
  1. Preview flights_select
flights_select
#> # Source:   SQL [?? x 4]
#> # Database: spark_connection
#>    tailnum dest  origin distance
#>    <chr>   <chr> <chr>     <int>
#>  1 N626VA  LAX   JFK        2475
#>  2 N3760C  SJU   JFK        1598
#>  3 N712TW  LAX   JFK        2475
#>  4 N914DL  TPA   JFK        1005
#>  5 N823AY  ORF   LGA         296
#>  6 N3AXAA  ORD   LGA         733
#>  7 N218WN  HOU   EWR        1411
#>  8 N284JB  IAD   JFK         228
#>  9 N3FSAA  MIA   LGA        1096
#> 10 N12163  JAX   EWR         820
#> # ℹ more rows
  1. Pipe flights_select into head()
flights_select |> 
  head()
#> # Source:   SQL [6 x 4]
#> # Database: spark_connection
#>   tailnum dest  origin distance
#>   <chr>   <chr> <chr>     <int>
#> 1 N626VA  LAX   JFK        2475
#> 2 N3760C  SJU   JFK        1598
#> 3 N712TW  LAX   JFK        2475
#> 4 N914DL  TPA   JFK        1005
#> 5 N823AY  ORF   LGA         296
#> 6 N3AXAA  ORD   LGA         733
  1. Add a left_join() step, after head(). Join to the dest_airports variable, using “dest” as the joining field
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") 
#> # Source:   SQL [6 x 5]
#> # Database: spark_connection
#>   tailnum dest  origin distance dest_name         
#>   <chr>   <chr> <chr>     <int> <chr>             
#> 1 N626VA  LAX   JFK        2475 Los Angeles Intl  
#> 2 N3760C  SJU   JFK        1598 <NA>              
#> 3 N712TW  LAX   JFK        2475 Los Angeles Intl  
#> 4 N914DL  TPA   JFK        1005 Tampa Intl        
#> 5 N823AY  ORF   LGA         296 Norfolk Intl      
#> 6 N3AXAA  ORD   LGA         733 Chicago Ohare Intl
  1. Add another left_join() step. Join to the origin_airports variable, using “origin” as the joining field
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") 
#> # Source:   SQL [6 x 6]
#> # Database: spark_connection
#>   tailnum dest  origin distance dest_name          origin_name        
#>   <chr>   <chr> <chr>     <int> <chr>              <chr>              
#> 1 N626VA  LAX   JFK        2475 Los Angeles Intl   John F Kennedy Intl
#> 2 N3760C  SJU   JFK        1598 <NA>               John F Kennedy Intl
#> 3 N712TW  LAX   JFK        2475 Los Angeles Intl   John F Kennedy Intl
#> 4 N914DL  TPA   JFK        1005 Tampa Intl         John F Kennedy Intl
#> 5 N823AY  ORF   LGA         296 Norfolk Intl       La Guardia         
#> 6 N3AXAA  ORD   LGA         733 Chicago Ohare Intl La Guardia
  1. Add another left_join() step. Join to the tbl_planes variable, using “tailnum” as the joining field
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") |> 
  left_join(tbl_planes, by = "tailnum")
#> # Source:   SQL [6 x 14]
#> # Database: spark_connection
#>   tailnum dest  origin distance dest_name   origin_name  year type  manufacturer
#>   <chr>   <chr> <chr>     <int> <chr>       <chr>       <dbl> <chr> <chr>       
#> 1 N626VA  LAX   JFK        2475 Los Angele… John F Ken…  2006 Fixe… AIRBUS      
#> 2 N3760C  SJU   JFK        1598 <NA>        John F Ken…  2001 Fixe… BOEING      
#> 3 N712TW  LAX   JFK        2475 Los Angele… John F Ken…  1997 Fixe… BOEING      
#> 4 N914DL  TPA   JFK        1005 Tampa Intl  John F Ken…  1988 Fixe… MCDONNELL D…
#> 5 N823AY  ORF   LGA         296 Norfolk In… La Guardia   2005 Fixe… BOMBARDIER …
#> 6 N3AXAA  ORD   LGA         733 Chicago Oh… La Guardia    NaN <NA>  <NA>        
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> #   engine <chr>
  1. Add a mutate() step. Use ifelse() to modify “dest_name”. If “dest_name” is NA, then change its value to “Unknown”, if not NA, leave alone
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") |> 
  left_join(tbl_planes, by = "tailnum") |> 
  mutate(dest_name = ifelse(is.na(dest_name), "Unknown", dest_name))
#> # Source:   SQL [6 x 14]
#> # Database: spark_connection
#>   tailnum dest  origin distance dest_name   origin_name  year type  manufacturer
#>   <chr>   <chr> <chr>     <int> <chr>       <chr>       <dbl> <chr> <chr>       
#> 1 N626VA  LAX   JFK        2475 Los Angele… John F Ken…  2006 Fixe… AIRBUS      
#> 2 N3760C  SJU   JFK        1598 Unknown     John F Ken…  2001 Fixe… BOEING      
#> 3 N712TW  LAX   JFK        2475 Los Angele… John F Ken…  1997 Fixe… BOEING      
#> 4 N914DL  TPA   JFK        1005 Tampa Intl  John F Ken…  1988 Fixe… MCDONNELL D…
#> 5 N823AY  ORF   LGA         296 Norfolk In… La Guardia   2005 Fixe… BOMBARDIER …
#> 6 N3AXAA  ORD   LGA         733 Chicago Oh… La Guardia    NaN <NA>  <NA>        
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> #   engine <chr>
  1. Inside the mutate() step, add another ifelse() to modify “origin_name”. If “origin_name” is NA, then change its value to “Unknown”, if not NA, leave alone
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") |> 
  left_join(tbl_planes, by = "tailnum") |> 
  mutate(
    dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
    origin_name = ifelse(is.na(origin_name), "Unknown", origin_name)
    )
#> # Source:   SQL [6 x 14]
#> # Database: spark_connection
#>   tailnum dest  origin distance dest_name   origin_name  year type  manufacturer
#>   <chr>   <chr> <chr>     <int> <chr>       <chr>       <dbl> <chr> <chr>       
#> 1 N626VA  LAX   JFK        2475 Los Angele… John F Ken…  2006 Fixe… AIRBUS      
#> 2 N3760C  SJU   JFK        1598 Unknown     John F Ken…  2001 Fixe… BOEING      
#> 3 N712TW  LAX   JFK        2475 Los Angele… John F Ken…  1997 Fixe… BOEING      
#> 4 N914DL  TPA   JFK        1005 Tampa Intl  John F Ken…  1988 Fixe… MCDONNELL D…
#> 5 N823AY  ORF   LGA         296 Norfolk In… La Guardia   2005 Fixe… BOMBARDIER …
#> 6 N3AXAA  ORD   LGA         733 Chicago Oh… La Guardia    NaN <NA>  <NA>        
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> #   engine <chr>
  1. Inside the mutate() step, add another ifelse() to modify “tailnum”. If “tailnum” is NA, then change its value to “Unknown”, if not NA, leave alone
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") |> 
  left_join(tbl_planes, by = "tailnum") |> 
  mutate(
    dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
    origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
    tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
    )
#> # Source:   SQL [6 x 14]
#> # Database: spark_connection
#>   tailnum dest  origin distance dest_name   origin_name  year type  manufacturer
#>   <chr>   <chr> <chr>     <int> <chr>       <chr>       <dbl> <chr> <chr>       
#> 1 N626VA  LAX   JFK        2475 Los Angele… John F Ken…  2006 Fixe… AIRBUS      
#> 2 N3760C  SJU   JFK        1598 Unknown     John F Ken…  2001 Fixe… BOEING      
#> 3 N712TW  LAX   JFK        2475 Los Angele… John F Ken…  1997 Fixe… BOEING      
#> 4 N914DL  TPA   JFK        1005 Tampa Intl  John F Ken…  1988 Fixe… MCDONNELL D…
#> 5 N823AY  ORF   LGA         296 Norfolk In… La Guardia   2005 Fixe… BOMBARDIER …
#> 6 N3AXAA  ORD   LGA         733 Chicago Oh… La Guardia    NaN <NA>  <NA>        
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> #   engine <chr>
  1. Add a show_query() step to see the resulting query
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") |> 
  left_join(tbl_planes, by = "tailnum") |> 
  mutate(
    dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
    origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
    tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
    ) |> 
  show_query()
#> <SQL>
#> SELECT
#>   IF(ISNULL((`tailnum` IS NULL)), NULL, IF((`tailnum` IS NULL), "Unknown", `tailnum`)) AS `tailnum`,
#>   `dest`,
#>   `origin`,
#>   `distance`,
#>   IF(ISNULL((`dest_name` IS NULL)), NULL, IF((`dest_name` IS NULL), "Unknown", `dest_name`)) AS `dest_name`,
#>   IF(ISNULL((`origin_name` IS NULL)), NULL, IF((`origin_name` IS NULL), "Unknown", `origin_name`)) AS `origin_name`,
#>   `year`,
#>   `type`,
#>   `manufacturer`,
#>   `model`,
#>   `engines`,
#>   `seats`,
#>   `speed`,
#>   `engine`
#> FROM (
#>   SELECT
#>     `...1`.*,
#>     `airports_csv...2`.`name` AS `dest_name`,
#>     `airports_csv...3`.`name` AS `origin_name`,
#>     `year`,
#>     `type`,
#>     `manufacturer`,
#>     `model`,
#>     `engines`,
#>     `seats`,
#>     `speed`,
#>     `engine`
#>   FROM (
#>     SELECT `tailnum`, `dest`, `origin`, `distance`
#>     FROM `mapped`
#>     LIMIT 6
#>   ) `...1`
#>   LEFT JOIN `airports_csv` `airports_csv...2`
#>     ON (`...1`.`dest` = `airports_csv...2`.`faa`)
#>   LEFT JOIN `airports_csv` `airports_csv...3`
#>     ON (`...1`.`origin` = `airports_csv...3`.`faa`)
#>   LEFT JOIN `planes`
#>     ON (`...1`.`tailnum` = `planes`.`tailnum`)
#> ) `q01`
  1. Remove the head() step, and run again to see the new query
flights_select |> 
  head() |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") |> 
  left_join(tbl_planes, by = "tailnum") |> 
  mutate(
    dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
    origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
    tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
    ) |> 
  show_query()
#> <SQL>
#> SELECT
#>   IF(ISNULL((`tailnum` IS NULL)), NULL, IF((`tailnum` IS NULL), "Unknown", `tailnum`)) AS `tailnum`,
#>   `dest`,
#>   `origin`,
#>   `distance`,
#>   IF(ISNULL((`dest_name` IS NULL)), NULL, IF((`dest_name` IS NULL), "Unknown", `dest_name`)) AS `dest_name`,
#>   IF(ISNULL((`origin_name` IS NULL)), NULL, IF((`origin_name` IS NULL), "Unknown", `origin_name`)) AS `origin_name`,
#>   `year`,
#>   `type`,
#>   `manufacturer`,
#>   `model`,
#>   `engines`,
#>   `seats`,
#>   `speed`,
#>   `engine`
#> FROM (
#>   SELECT
#>     `...1`.*,
#>     `airports_csv...2`.`name` AS `dest_name`,
#>     `airports_csv...3`.`name` AS `origin_name`,
#>     `year`,
#>     `type`,
#>     `manufacturer`,
#>     `model`,
#>     `engines`,
#>     `seats`,
#>     `speed`,
#>     `engine`
#>   FROM (
#>     SELECT `tailnum`, `dest`, `origin`, `distance`
#>     FROM `mapped`
#>     LIMIT 6
#>   ) `...1`
#>   LEFT JOIN `airports_csv` `airports_csv...2`
#>     ON (`...1`.`dest` = `airports_csv...2`.`faa`)
#>   LEFT JOIN `airports_csv` `airports_csv...3`
#>     ON (`...1`.`origin` = `airports_csv...3`.`faa`)
#>   LEFT JOIN `planes`
#>     ON (`...1`.`tailnum` = `planes`.`tailnum`)
#> ) `q01`
  1. Assign to a variable called tbl_prep, swap the show_query() step with compute()
tbl_prep <- flights_select |> 
  left_join(dest_airports, by = "dest") |> 
  left_join(origin_airports, by = "origin") |> 
  left_join(tbl_planes, by = "tailnum") |> 
  mutate(
    dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
    origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
    tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
    ) |> 
  compute()
  1. Pipe tbl_prep to show_query() to see the new query
tbl_prep |> 
  show_query()
#> <SQL>
#> SELECT *
#> FROM `table_46ab746a_e8ea_4ab6_b409_2c3da7056e34`