library(sparklyr)
library(dplyr)
<- spark_connect(method = "databricks_connect") sc
5 Databricks Connect
5.1 Connect to Databricks Connect cluster
Copy a simple table into the Spark session
5.2 Uploading data from R
- Load the
nycflights13
library
library(nycflights13)
- Use
copy_to()
to uploadplanes
to your Spark session. Assign to a variable calledtbl_planes
<- copy_to(sc, planes) tbl_planes
- Use
glimpse()
to preview the data
|>
tbl_planes glimpse()
#> Rows: ??
#> Columns: 9
#> Database: spark_connection
#> $ tailnum <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW…
#> $ year <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 199…
#> $ type <chr> "Fixed wing multi engine", "Fixed wing multi engine", "Fi…
#> $ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE", "AIRBU…
#> $ model <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145…
#> $ engines <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
#> $ seats <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 5…
#> $ speed <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
#> $ engine <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turb…
- Use
show_query()
to see how Spark refers to the data you just uploaded
|>
tbl_planes show_query()
#> <SQL>
#> SELECT *
#> FROM `planes`
5.3 Caching data
Create a table from elements of another table
- Using
tbl()
, create a reference to the “diamonds” table. Assign it to a variable calledtbl_diamonds
<- tbl(sc, "diamonds") tbl_diamonds
- Select the “cut”, “color”, “clarity” and “price” fields from
tbl_diamonds
and assign to a variable calledtbl_temp
<- tbl_diamonds |>
tbl_temp select(cut, color, clarity, price)
- Use
show_query()
to see the underlying query oftbl_temp
|>
tbl_temp show_query()
#> <SQL>
#> SELECT `cut`, `color`, `clarity`, `price`
#> FROM `diamonds`
- Pass
tbl_temp
tocompute()
, and reassign the operation totbl_temp
<- tbl_temp |>
tbl_temp compute()
- Use
show_query()
again to see the new underlying query oftbl_temp
|>
tbl_temp show_query()
#> <SQL>
#> SELECT *
#> FROM `table_2079291d_ef80_4ad4_bb2c_7f8d9dc942ef`
- Preview
tbl_temp
tbl_temp #> # Source: table<`table_2079291d_ef80_4ad4_bb2c_7f8d9dc942ef`> [?? x 4]
#> # Database: spark_connection
#> cut color clarity price
#> <chr> <chr> <chr> <int>
#> 1 Ideal E SI2 326
#> 2 Premium E SI1 326
#> 3 Good E VS1 327
#> 4 Premium I VS2 334
#> 5 Good J SI2 335
#> 6 Very Good J VVS2 336
#> 7 Very Good I VVS1 336
#> 8 Very Good H SI1 337
#> 9 Fair E VS2 337
#> 10 Very Good H VS1 338
#> # ℹ more rows
5.4 Reading files
Upload a CSV file that is located inside Databricks, into your Spark session
- Use
spark_read_csv()
to upload the airports CSV file to Spark. Thepath
should be /Volumes/workshops/nycflights/2013/airports.csv, andname
should beairports_csv
. Assign it to a variable calledtbl_airports
<- spark_read_csv(
tbl_airports sc = sc,
name = "airports_csv",
path = "/Volumes/workshops/nycflights/2013/airports.csv"
)
- Preview
tbl_airports
tbl_airports#> # Source: table<`airports_csv`> [?? x 8]
#> # Database: spark_connection
#> faa name lat lon alt tz dst tzone
#> <chr> <chr> <dbl> <dbl> <int> <int> <chr> <chr>
#> 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
#> 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
#> 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
#> 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
#> 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
#> 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
#> 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
#> 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
#> 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
#> 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
#> # ℹ more rows
- Pass
tbl_airports
toshow_query()
to see the underlying query
|>
tbl_airports show_query()
#> <SQL>
#> SELECT *
#> FROM `airports_csv`
- Use
sdf_sql()
to access the top 10 rows of “airports_csv”
sdf_sql(sc, "select * from airports_csv limit 10")
#> # Source: table<`sparklyr_tmp_table_7edd58f9_2813_4a80_a191_0ddf254d4e05`> [10 x 8]
#> # Database: spark_connection
#> faa name lat lon alt tz dst tzone
#> <chr> <chr> <dbl> <dbl> <int> <int> <chr> <chr>
#> 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
#> 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
#> 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
#> 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
#> 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
#> 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
#> 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
#> 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
#> 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
#> 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
5.5 “Mapping” files
Map the flights data without caching the data
- Use
spark_read_csv()
to upload the airports CSV file to Spark. Thepath
should be /Volumes/workshops/nycflights/2013/nycflights.csv, andname
should bemapped
. Important, make sure to setmemory
to false. Assign it to a variable calledflights_mapped
<- spark_read_csv(
flights_mapped sc = sc,
name = "mapped",
path = "/Volumes/workshops/nycflights/2013/nycflights.csv",
memory = FALSE
)
- Use
glimpse()
to preview the data fromflights_mapped
|>
flights_mapped glimpse()
#> Rows: ??
#> Columns: 16
#> Database: spark_connection
#> $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
#> $ month <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10…
#> $ day <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, …
#> $ dep_time <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940…
#> $ dep_delay <int> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, …
#> $ arr_time <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, …
#> $ arr_delay <int> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, …
#> $ carrier <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", …
#> $ tailnum <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", …
#> $ flight <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, …
#> $ origin <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA…
#> $ dest <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA…
#> $ air_time <int> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,…
#> $ distance <int> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,…
#> $ hour <int> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6…
#> $ minute <int> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24…
- Run a quick count by the
carrier
field againstflights_mapped
|>
flights_mapped count(carrier)
#> # Source: SQL [?? x 2]
#> # Database: spark_connection
#> carrier n
#> <chr> <dbl>
#> 1 UA 5770
#> 2 AA 3188
#> 3 EV 5142
#> 4 B6 5376
#> 5 DL 4751
#> 6 OO 3
#> 7 F9 69
#> 8 YV 53
#> 9 US 2015
#> 10 MQ 2507
#> # ℹ more rows
5.6 Partial cache
Cache a section of the flights data into Spark
- Filter
flights_mapped
to only show flights destined to Orlando (code “ORD”)
|>
flights_mapped filter(dest == "ORD")
#> # Source: SQL [?? x 16]
#> # Database: spark_connection
#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum
#> <int> <int> <int> <int> <int> <int> <int> <chr> <chr>
#> 1 2013 1 1 1817 -3 2008 3 AA N3AXAA
#> 2 2013 10 21 859 -1 1036 11 UA N57852
#> 3 2013 9 18 933 -7 1037 -43 AA N514AA
#> 4 2013 7 25 752 -3 917 2 MQ N6EAMQ
#> 5 2013 7 1 2007 82 2123 53 AA N547AA
#> 6 2013 9 29 2003 -2 2108 -22 MQ N509MQ
#> 7 2013 4 15 625 -5 752 -13 AA N589AA
#> 8 2013 8 24 1017 12 1113 -7 MQ N517MQ
#> 9 2013 9 20 1948 -12 2112 -20 UA N814UA
#> 10 2013 1 25 1814 54 2020 65 AA N3CXAA
#> # ℹ more rows
#> # ℹ 7 more variables: flight <int>, origin <chr>, dest <chr>, air_time <int>,
#> # distance <int>, hour <int>, minute <int>
- Assign the previous operation to a variable called
flights_ord
<- flights_mapped |>
flights_ord filter(dest == "ORD")
- Pass
flights_ord
tocompute()
, and reassign toflights_ord
<- flights_ord |>
flights_ord compute()
- Preview
flights_ord
flights_ord#> # Source: table<`table_6c908dd8_b420_4046_959b_b74d79da8e42`> [?? x 16]
#> # Database: spark_connection
#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum
#> <int> <int> <int> <int> <int> <int> <int> <chr> <chr>
#> 1 2013 1 1 1817 -3 2008 3 AA N3AXAA
#> 2 2013 10 21 859 -1 1036 11 UA N57852
#> 3 2013 9 18 933 -7 1037 -43 AA N514AA
#> 4 2013 7 25 752 -3 917 2 MQ N6EAMQ
#> 5 2013 7 1 2007 82 2123 53 AA N547AA
#> 6 2013 9 29 2003 -2 2108 -22 MQ N509MQ
#> 7 2013 4 15 625 -5 752 -13 AA N589AA
#> 8 2013 8 24 1017 12 1113 -7 MQ N517MQ
#> 9 2013 9 20 1948 -12 2112 -20 UA N814UA
#> 10 2013 1 25 1814 54 2020 65 AA N3CXAA
#> # ℹ more rows
#> # ℹ 7 more variables: flight <int>, origin <chr>, dest <chr>, air_time <int>,
#> # distance <int>, hour <int>, minute <int>
- Pass
flights_ord
toshow_query()
to see the new query
|>
flights_ord show_query()
#> <SQL>
#> SELECT *
#> FROM `table_6c908dd8_b420_4046_959b_b74d79da8e42`
5.7 End game
Prepare a working data set using several of the techniques covered in this unit
- From
tbl_airports
, select “faa” and “name”. Rename the variables to “dest” and “dest_name” respectively. Assign it to a variable calleddest_airports
<- tbl_airports |>
dest_airports select(dest = faa, dest_name = name)
- Preview
dest_airports
dest_airports#> # Source: SQL [?? x 2]
#> # Database: spark_connection
#> dest dest_name
#> <chr> <chr>
#> 1 04G Lansdowne Airport
#> 2 06A Moton Field Municipal Airport
#> 3 06C Schaumburg Regional
#> 4 06N Randall Airport
#> 5 09J Jekyll Island Airport
#> 6 0A9 Elizabethton Municipal Airport
#> 7 0G6 Williams County Airport
#> 8 0G7 Finger Lakes Regional Airport
#> 9 0P2 Shoestring Aviation Airfield
#> 10 0S9 Jefferson County Intl
#> # ℹ more rows
- From
tbl_airports
, select “faa” and “name”. Rename the variables to “origin” and “origin_name” respectively. Assign it to a variable calledorigin_airports
<- tbl_airports |>
origin_airports select(origin = faa, origin_name = name)
- Preview
origin_airports
origin_airports#> # Source: SQL [?? x 2]
#> # Database: spark_connection
#> origin origin_name
#> <chr> <chr>
#> 1 04G Lansdowne Airport
#> 2 06A Moton Field Municipal Airport
#> 3 06C Schaumburg Regional
#> 4 06N Randall Airport
#> 5 09J Jekyll Island Airport
#> 6 0A9 Elizabethton Municipal Airport
#> 7 0G6 Williams County Airport
#> 8 0G7 Finger Lakes Regional Airport
#> 9 0P2 Shoestring Aviation Airfield
#> 10 0S9 Jefferson County Intl
#> # ℹ more rows
- Select “tailnum”, “dest”, “origin”, and “distance” from
flights_mapped
. Assign it to a variable calledflights_select
<- flights_mapped |>
flights_select select(tailnum, dest, origin, distance)
- Preview
flights_select
flights_select#> # Source: SQL [?? x 4]
#> # Database: spark_connection
#> tailnum dest origin distance
#> <chr> <chr> <chr> <int>
#> 1 N626VA LAX JFK 2475
#> 2 N3760C SJU JFK 1598
#> 3 N712TW LAX JFK 2475
#> 4 N914DL TPA JFK 1005
#> 5 N823AY ORF LGA 296
#> 6 N3AXAA ORD LGA 733
#> 7 N218WN HOU EWR 1411
#> 8 N284JB IAD JFK 228
#> 9 N3FSAA MIA LGA 1096
#> 10 N12163 JAX EWR 820
#> # ℹ more rows
- Pipe
flights_select
intohead()
|>
flights_select head()
#> # Source: SQL [6 x 4]
#> # Database: spark_connection
#> tailnum dest origin distance
#> <chr> <chr> <chr> <int>
#> 1 N626VA LAX JFK 2475
#> 2 N3760C SJU JFK 1598
#> 3 N712TW LAX JFK 2475
#> 4 N914DL TPA JFK 1005
#> 5 N823AY ORF LGA 296
#> 6 N3AXAA ORD LGA 733
- Add a
left_join()
step, afterhead()
. Join to thedest_airports
variable, using “dest” as the joining field
|>
flights_select head() |>
left_join(dest_airports, by = "dest")
#> # Source: SQL [6 x 5]
#> # Database: spark_connection
#> tailnum dest origin distance dest_name
#> <chr> <chr> <chr> <int> <chr>
#> 1 N626VA LAX JFK 2475 Los Angeles Intl
#> 2 N3760C SJU JFK 1598 <NA>
#> 3 N712TW LAX JFK 2475 Los Angeles Intl
#> 4 N914DL TPA JFK 1005 Tampa Intl
#> 5 N823AY ORF LGA 296 Norfolk Intl
#> 6 N3AXAA ORD LGA 733 Chicago Ohare Intl
- Add another
left_join()
step. Join to theorigin_airports
variable, using “origin” as the joining field
|>
flights_select head() |>
left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin")
#> # Source: SQL [6 x 6]
#> # Database: spark_connection
#> tailnum dest origin distance dest_name origin_name
#> <chr> <chr> <chr> <int> <chr> <chr>
#> 1 N626VA LAX JFK 2475 Los Angeles Intl John F Kennedy Intl
#> 2 N3760C SJU JFK 1598 <NA> John F Kennedy Intl
#> 3 N712TW LAX JFK 2475 Los Angeles Intl John F Kennedy Intl
#> 4 N914DL TPA JFK 1005 Tampa Intl John F Kennedy Intl
#> 5 N823AY ORF LGA 296 Norfolk Intl La Guardia
#> 6 N3AXAA ORD LGA 733 Chicago Ohare Intl La Guardia
- Add another
left_join()
step. Join to thetbl_planes
variable, using “tailnum” as the joining field
|>
flights_select head() |>
left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin") |>
left_join(tbl_planes, by = "tailnum")
#> # Source: SQL [6 x 14]
#> # Database: spark_connection
#> tailnum dest origin distance dest_name origin_name year type manufacturer
#> <chr> <chr> <chr> <int> <chr> <chr> <dbl> <chr> <chr>
#> 1 N626VA LAX JFK 2475 Los Angele… John F Ken… 2006 Fixe… AIRBUS
#> 2 N3760C SJU JFK 1598 <NA> John F Ken… 2001 Fixe… BOEING
#> 3 N712TW LAX JFK 2475 Los Angele… John F Ken… 1997 Fixe… BOEING
#> 4 N914DL TPA JFK 1005 Tampa Intl John F Ken… 1988 Fixe… MCDONNELL D…
#> 5 N823AY ORF LGA 296 Norfolk In… La Guardia 2005 Fixe… BOMBARDIER …
#> 6 N3AXAA ORD LGA 733 Chicago Oh… La Guardia NaN <NA> <NA>
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> # engine <chr>
- Add a
mutate()
step. Useifelse()
to modify “dest_name”. If “dest_name” is NA, then change its value to “Unknown”, if not NA, leave alone
|>
flights_select head() |>
left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin") |>
left_join(tbl_planes, by = "tailnum") |>
mutate(dest_name = ifelse(is.na(dest_name), "Unknown", dest_name))
#> # Source: SQL [6 x 14]
#> # Database: spark_connection
#> tailnum dest origin distance dest_name origin_name year type manufacturer
#> <chr> <chr> <chr> <int> <chr> <chr> <dbl> <chr> <chr>
#> 1 N626VA LAX JFK 2475 Los Angele… John F Ken… 2006 Fixe… AIRBUS
#> 2 N3760C SJU JFK 1598 Unknown John F Ken… 2001 Fixe… BOEING
#> 3 N712TW LAX JFK 2475 Los Angele… John F Ken… 1997 Fixe… BOEING
#> 4 N914DL TPA JFK 1005 Tampa Intl John F Ken… 1988 Fixe… MCDONNELL D…
#> 5 N823AY ORF LGA 296 Norfolk In… La Guardia 2005 Fixe… BOMBARDIER …
#> 6 N3AXAA ORD LGA 733 Chicago Oh… La Guardia NaN <NA> <NA>
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> # engine <chr>
- Inside the
mutate()
step, add anotherifelse()
to modify “origin_name”. If “origin_name” is NA, then change its value to “Unknown”, if not NA, leave alone
|>
flights_select head() |>
left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin") |>
left_join(tbl_planes, by = "tailnum") |>
mutate(
dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
origin_name = ifelse(is.na(origin_name), "Unknown", origin_name)
)#> # Source: SQL [6 x 14]
#> # Database: spark_connection
#> tailnum dest origin distance dest_name origin_name year type manufacturer
#> <chr> <chr> <chr> <int> <chr> <chr> <dbl> <chr> <chr>
#> 1 N626VA LAX JFK 2475 Los Angele… John F Ken… 2006 Fixe… AIRBUS
#> 2 N3760C SJU JFK 1598 Unknown John F Ken… 2001 Fixe… BOEING
#> 3 N712TW LAX JFK 2475 Los Angele… John F Ken… 1997 Fixe… BOEING
#> 4 N914DL TPA JFK 1005 Tampa Intl John F Ken… 1988 Fixe… MCDONNELL D…
#> 5 N823AY ORF LGA 296 Norfolk In… La Guardia 2005 Fixe… BOMBARDIER …
#> 6 N3AXAA ORD LGA 733 Chicago Oh… La Guardia NaN <NA> <NA>
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> # engine <chr>
- Inside the
mutate()
step, add anotherifelse()
to modify “tailnum”. If “tailnum” is NA, then change its value to “Unknown”, if not NA, leave alone
|>
flights_select head() |>
left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin") |>
left_join(tbl_planes, by = "tailnum") |>
mutate(
dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
)#> # Source: SQL [6 x 14]
#> # Database: spark_connection
#> tailnum dest origin distance dest_name origin_name year type manufacturer
#> <chr> <chr> <chr> <int> <chr> <chr> <dbl> <chr> <chr>
#> 1 N626VA LAX JFK 2475 Los Angele… John F Ken… 2006 Fixe… AIRBUS
#> 2 N3760C SJU JFK 1598 Unknown John F Ken… 2001 Fixe… BOEING
#> 3 N712TW LAX JFK 2475 Los Angele… John F Ken… 1997 Fixe… BOEING
#> 4 N914DL TPA JFK 1005 Tampa Intl John F Ken… 1988 Fixe… MCDONNELL D…
#> 5 N823AY ORF LGA 296 Norfolk In… La Guardia 2005 Fixe… BOMBARDIER …
#> 6 N3AXAA ORD LGA 733 Chicago Oh… La Guardia NaN <NA> <NA>
#> # ℹ 5 more variables: model <chr>, engines <dbl>, seats <dbl>, speed <dbl>,
#> # engine <chr>
- Add a
show_query()
step to see the resulting query
|>
flights_select head() |>
left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin") |>
left_join(tbl_planes, by = "tailnum") |>
mutate(
dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
|>
) show_query()
#> <SQL>
#> SELECT
#> IF(ISNULL((`tailnum` IS NULL)), NULL, IF((`tailnum` IS NULL), "Unknown", `tailnum`)) AS `tailnum`,
#> `dest`,
#> `origin`,
#> `distance`,
#> IF(ISNULL((`dest_name` IS NULL)), NULL, IF((`dest_name` IS NULL), "Unknown", `dest_name`)) AS `dest_name`,
#> IF(ISNULL((`origin_name` IS NULL)), NULL, IF((`origin_name` IS NULL), "Unknown", `origin_name`)) AS `origin_name`,
#> `year`,
#> `type`,
#> `manufacturer`,
#> `model`,
#> `engines`,
#> `seats`,
#> `speed`,
#> `engine`
#> FROM (
#> SELECT
#> `...1`.*,
#> `airports_csv...2`.`name` AS `dest_name`,
#> `airports_csv...3`.`name` AS `origin_name`,
#> `year`,
#> `type`,
#> `manufacturer`,
#> `model`,
#> `engines`,
#> `seats`,
#> `speed`,
#> `engine`
#> FROM (
#> SELECT `tailnum`, `dest`, `origin`, `distance`
#> FROM `mapped`
#> LIMIT 6
#> ) `...1`
#> LEFT JOIN `airports_csv` `airports_csv...2`
#> ON (`...1`.`dest` = `airports_csv...2`.`faa`)
#> LEFT JOIN `airports_csv` `airports_csv...3`
#> ON (`...1`.`origin` = `airports_csv...3`.`faa`)
#> LEFT JOIN `planes`
#> ON (`...1`.`tailnum` = `planes`.`tailnum`)
#> ) `q01`
- Remove the
head()
step, and run again to see the new query
|>
flights_select head() |>
left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin") |>
left_join(tbl_planes, by = "tailnum") |>
mutate(
dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
|>
) show_query()
#> <SQL>
#> SELECT
#> IF(ISNULL((`tailnum` IS NULL)), NULL, IF((`tailnum` IS NULL), "Unknown", `tailnum`)) AS `tailnum`,
#> `dest`,
#> `origin`,
#> `distance`,
#> IF(ISNULL((`dest_name` IS NULL)), NULL, IF((`dest_name` IS NULL), "Unknown", `dest_name`)) AS `dest_name`,
#> IF(ISNULL((`origin_name` IS NULL)), NULL, IF((`origin_name` IS NULL), "Unknown", `origin_name`)) AS `origin_name`,
#> `year`,
#> `type`,
#> `manufacturer`,
#> `model`,
#> `engines`,
#> `seats`,
#> `speed`,
#> `engine`
#> FROM (
#> SELECT
#> `...1`.*,
#> `airports_csv...2`.`name` AS `dest_name`,
#> `airports_csv...3`.`name` AS `origin_name`,
#> `year`,
#> `type`,
#> `manufacturer`,
#> `model`,
#> `engines`,
#> `seats`,
#> `speed`,
#> `engine`
#> FROM (
#> SELECT `tailnum`, `dest`, `origin`, `distance`
#> FROM `mapped`
#> LIMIT 6
#> ) `...1`
#> LEFT JOIN `airports_csv` `airports_csv...2`
#> ON (`...1`.`dest` = `airports_csv...2`.`faa`)
#> LEFT JOIN `airports_csv` `airports_csv...3`
#> ON (`...1`.`origin` = `airports_csv...3`.`faa`)
#> LEFT JOIN `planes`
#> ON (`...1`.`tailnum` = `planes`.`tailnum`)
#> ) `q01`
- Assign to a variable called
tbl_prep
, swap theshow_query()
step withcompute()
<- flights_select |>
tbl_prep left_join(dest_airports, by = "dest") |>
left_join(origin_airports, by = "origin") |>
left_join(tbl_planes, by = "tailnum") |>
mutate(
dest_name = ifelse(is.na(dest_name), "Unknown", dest_name),
origin_name = ifelse(is.na(origin_name), "Unknown", origin_name),
tailnum = ifelse(is.na(tailnum), "Unknown", tailnum)
|>
) compute()
- Pipe
tbl_prep
toshow_query()
to see the new query
|>
tbl_prep show_query()
#> <SQL>
#> SELECT *
#> FROM `table_46ab746a_e8ea_4ab6_b409_2c3da7056e34`