library(dplyr)
library(dbplyr)
library(DBI)
<- dbConnect(
con ::databricks(),
odbcHTTPPath = "/sql/1.0/warehouses/300bd24ba12adf8e"
)
3 Preparing and exploring Data
Catch up
3.1 Selecting variables
Simple strategies to order, and reduce, data to work with
- Load the
customer
table to a variable calledcustomer
<- tbl(con, I("workshops.tpch.customer")) customer
- Select all columns that end with “key”
|>
customer select(ends_with("key"))
#> # Source: SQL [?? x 2]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> c_custkey c_nationkey
#> <dbl> <dbl>
#> 1 1 15
#> 2 2 13
#> 3 3 1
#> 4 4 4
#> 5 5 3
#> 6 6 20
#> 7 7 18
#> 8 8 17
#> 9 9 8
#> 10 10 5
#> # ℹ more rows
- Move all columns that end with “key” to the front
|>
customer select(ends_with("key"), everything())
#> # Source: SQL [?? x 8]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> c_custkey c_nationkey c_name c_address c_phone c_acctbal c_mktsegment
#> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
#> 1 1 15 Customer#0000… IVhzIApe… 25-989… 712. BUILDING
#> 2 2 13 Customer#0000… XSTf4,NC… 23-768… 122. AUTOMOBILE
#> 3 3 1 Customer#0000… MG9kdTD2… 11-719… 7498. AUTOMOBILE
#> 4 4 4 Customer#0000… XxVSJsLA… 14-128… 2867. MACHINERY
#> 5 5 3 Customer#0000… KvpyuHCp… 13-750… 794. HOUSEHOLD
#> 6 6 20 Customer#0000… sKZz0Csn… 30-114… 7639. AUTOMOBILE
#> 7 7 18 Customer#0000… TcGe5gaZ… 28-190… 9562. AUTOMOBILE
#> 8 8 17 Customer#0000… I0B10bB0… 27-147… 6820. BUILDING
#> 9 9 8 Customer#0000… xKiAFTjU… 18-338… 8324. FURNITURE
#> 10 10 5 Customer#0000… 6LrEaV6K… 15-741… 2754. HOUSEHOLD
#> # ℹ more rows
#> # ℹ 1 more variable: c_comment <chr>
- Select all columns that do not end with “key”
|>
customer select(-ends_with("key"))
#> # Source: SQL [?? x 6]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> c_name c_address c_phone c_acctbal c_mktsegment c_comment
#> <chr> <chr> <chr> <dbl> <chr> <chr>
#> 1 Customer#000000001 IVhzIApeRb ot,c,E 25-989… 712. BUILDING "to the …
#> 2 Customer#000000002 XSTf4,NCwDVaWNe6… 23-768… 122. AUTOMOBILE "l accou…
#> 3 Customer#000000003 MG9kdTD2WBHm 11-719… 7498. AUTOMOBILE " deposi…
#> 4 Customer#000000004 XxVSJsLAGtn 14-128… 2867. MACHINERY " reques…
#> 5 Customer#000000005 KvpyuHCplrB84WgA… 13-750… 794. HOUSEHOLD "n accou…
#> 6 Customer#000000006 sKZz0CsnMD7mp4Xd… 30-114… 7639. AUTOMOBILE "tions. …
#> 7 Customer#000000007 TcGe5gaZNgVePxU5… 28-190… 9562. AUTOMOBILE "ainst t…
#> 8 Customer#000000008 I0B10bB0AymmC, 0… 27-147… 6820. BUILDING "among t…
#> 9 Customer#000000009 xKiAFTjUsCuxfele… 18-338… 8324. FURNITURE "r theod…
#> 10 Customer#000000010 6LrEaV6KR6PLVcgl… 15-741… 2754. HOUSEHOLD "es regu…
#> # ℹ more rows
3.2 Join to tables
Using left_join() to relate two tables
- Load the
nation
table into a variable called the same
<- tbl(con, I("workshops.tpch.nation")) nation
- Use
left_join
to relatecustomer
withnation
using the nation key
|>
customer left_join(nation, by = c("c_nationkey" = "n_nationkey"))
#> # Source: SQL [?? x 11]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> c_custkey c_name c_address c_nationkey c_phone c_acctbal c_mktsegment
#> <dbl> <chr> <chr> <dbl> <chr> <dbl> <chr>
#> 1 1 Customer#0000… IVhzIApe… 15 25-989… 712. BUILDING
#> 2 2 Customer#0000… XSTf4,NC… 13 23-768… 122. AUTOMOBILE
#> 3 3 Customer#0000… MG9kdTD2… 1 11-719… 7498. AUTOMOBILE
#> 4 4 Customer#0000… XxVSJsLA… 4 14-128… 2867. MACHINERY
#> 5 5 Customer#0000… KvpyuHCp… 3 13-750… 794. HOUSEHOLD
#> 6 6 Customer#0000… sKZz0Csn… 20 30-114… 7639. AUTOMOBILE
#> 7 7 Customer#0000… TcGe5gaZ… 18 28-190… 9562. AUTOMOBILE
#> 8 8 Customer#0000… I0B10bB0… 17 27-147… 6820. BUILDING
#> 9 9 Customer#0000… xKiAFTjU… 8 18-338… 8324. FURNITURE
#> 10 10 Customer#0000… 6LrEaV6K… 5 15-741… 2754. HOUSEHOLD
#> # ℹ more rows
#> # ℹ 4 more variables: c_comment <chr>, n_name <chr>, n_regionkey <dbl>,
#> # n_comment <chr>
- What are the 5 countries with the most customers?
|>
customer left_join(nation, by = c("c_nationkey" = "n_nationkey")) |>
count(n_name, sort = TRUE) |>
head(5)
#> # Source: SQL [5 x 2]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(n)
#> n_name n
#> <chr> <int64>
#> 1 UNITED STATES 1260
#> 2 IRAQ 1249
#> 3 IRAN 1248
#> 4 CHINA 1244
#> 5 EGYPT 1225
3.3 Prepare base
Building the base variable/query
- Load the
orders
table in a variable calledorders
<- tbl(con, I("workshops.tpch.orders")) orders
- Join
orders
to thecustomer
variable (table). Relate them on theo_custkey
andc_custkey
fields.
|>
orders left_join(customer, by = c("o_custkey" = "c_custkey"))
#> # Source: SQL [?? x 16]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> o_orderkey o_custkey o_orderstatus o_totalprice o_orderdate order_priority
#> <dbl> <dbl> <chr> <dbl> <date> <chr>
#> 1 1 7381 O 181585. 1996-01-02 5-LOW
#> 2 2 15601 O 46094. 1996-12-01 1-URGENT
#> 3 3 24664 F 271423. 1993-10-14 5-LOW
#> 4 4 27356 O 47915. 1995-10-11 5-LOW
#> 5 5 8897 F 136702. 1994-07-30 5-LOW
#> 6 6 11125 F 65071. 1992-02-21 4-NOT SPECIFIED
#> 7 7 7828 O 263865. 1996-01-10 2-HIGH
#> 8 32 26012 O 153480. 1995-07-16 2-HIGH
#> 9 33 13393 F 138123. 1993-10-27 3-MEDIUM
#> 10 34 12202 O 72249. 1998-07-21 3-MEDIUM
#> # ℹ more rows
#> # ℹ 10 more variables: o_clerk <chr>, o_shippriority <dbl>, o_comment <chr>,
#> # c_name <chr>, c_address <chr>, c_nationkey <dbl>, c_phone <chr>,
#> # c_acctbal <dbl>, c_mktsegment <chr>, c_comment <chr>
- Join the
nation
variable/table to theorders
andcustomer
variables. Use thec_nationkey
and then_nationkey
to relate them.
|>
orders left_join(customer, by = c("o_custkey" = "c_custkey")) |>
left_join(nation, by = c("c_nationkey" = "n_nationkey"))
#> # Source: SQL [?? x 19]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> o_orderkey o_custkey o_orderstatus o_totalprice o_orderdate order_priority
#> <dbl> <dbl> <chr> <dbl> <date> <chr>
#> 1 1 7381 O 181585. 1996-01-02 5-LOW
#> 2 2 15601 O 46094. 1996-12-01 1-URGENT
#> 3 3 24664 F 271423. 1993-10-14 5-LOW
#> 4 4 27356 O 47915. 1995-10-11 5-LOW
#> 5 5 8897 F 136702. 1994-07-30 5-LOW
#> 6 6 11125 F 65071. 1992-02-21 4-NOT SPECIFIED
#> 7 7 7828 O 263865. 1996-01-10 2-HIGH
#> 8 32 26012 O 153480. 1995-07-16 2-HIGH
#> 9 33 13393 F 138123. 1993-10-27 3-MEDIUM
#> 10 34 12202 O 72249. 1998-07-21 3-MEDIUM
#> # ℹ more rows
#> # ℹ 13 more variables: o_clerk <chr>, o_shippriority <dbl>, o_comment <chr>,
#> # c_name <chr>, c_address <chr>, c_nationkey <dbl>, c_phone <chr>,
#> # c_acctbal <dbl>, c_mktsegment <chr>, c_comment <chr>, n_name <chr>,
#> # n_regionkey <dbl>, n_comment <chr>
- Load the resulting code into a variable called
rel_orders
. We do this so to get autocomplete working
<- orders |>
rel_orders left_join(customer, by = c("o_custkey" = "c_custkey")) |>
left_join(nation, by = c("c_nationkey" = "n_nationkey"))
- Create new columns for the year of the order date, and another for the month of the order date. Name them
order_year
andorder_month
respectively.
|>
rel_orders mutate(order_year = year(o_orderdate), order_month = month(o_orderdate))
#> # Source: SQL [?? x 21]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> o_orderkey o_custkey o_orderstatus o_totalprice o_orderdate order_priority
#> <dbl> <dbl> <chr> <dbl> <date> <chr>
#> 1 1 7381 O 181585. 1996-01-02 5-LOW
#> 2 2 15601 O 46094. 1996-12-01 1-URGENT
#> 3 3 24664 F 271423. 1993-10-14 5-LOW
#> 4 4 27356 O 47915. 1995-10-11 5-LOW
#> 5 5 8897 F 136702. 1994-07-30 5-LOW
#> 6 6 11125 F 65071. 1992-02-21 4-NOT SPECIFIED
#> 7 7 7828 O 263865. 1996-01-10 2-HIGH
#> 8 32 26012 O 153480. 1995-07-16 2-HIGH
#> 9 33 13393 F 138123. 1993-10-27 3-MEDIUM
#> 10 34 12202 O 72249. 1998-07-21 3-MEDIUM
#> # ℹ more rows
#> # ℹ 15 more variables: o_clerk <chr>, o_shippriority <dbl>, o_comment <chr>,
#> # c_name <chr>, c_address <chr>, c_nationkey <dbl>, c_phone <chr>,
#> # c_acctbal <dbl>, c_mktsegment <chr>, c_comment <chr>, n_name <chr>,
#> # n_regionkey <dbl>, n_comment <chr>, order_year <int>, order_month <int>
- Remove any columns that end in “comment”, and end in “key”
|>
rel_orders mutate(order_year = year(o_orderdate), order_month = month(o_orderdate)) |>
select(-ends_with("comment"), -ends_with("key"))
#> # Source: SQL [?? x 14]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> o_orderstatus o_totalprice o_orderdate order_priority o_clerk o_shippriority
#> <chr> <dbl> <date> <chr> <chr> <dbl>
#> 1 O 181585. 1996-01-02 5-LOW Clerk#… 0
#> 2 O 46094. 1996-12-01 1-URGENT Clerk#… 0
#> 3 F 271423. 1993-10-14 5-LOW Clerk#… 0
#> 4 O 47915. 1995-10-11 5-LOW Clerk#… 0
#> 5 F 136702. 1994-07-30 5-LOW Clerk#… 0
#> 6 F 65071. 1992-02-21 4-NOT SPECIFIED Clerk#… 0
#> 7 O 263865. 1996-01-10 2-HIGH Clerk#… 0
#> 8 O 153480. 1995-07-16 2-HIGH Clerk#… 0
#> 9 F 138123. 1993-10-27 3-MEDIUM Clerk#… 0
#> 10 O 72249. 1998-07-21 3-MEDIUM Clerk#… 0
#> # ℹ more rows
#> # ℹ 8 more variables: c_name <chr>, c_address <chr>, c_phone <chr>,
#> # c_acctbal <dbl>, c_mktsegment <chr>, n_name <chr>, order_year <int>,
#> # order_month <int>
- Rename
o_custkey
tocustomer
, insert code before the selection
|>
rel_orders mutate(order_year = year(o_orderdate), order_month = month(o_orderdate)) |>
rename(customer = o_custkey) |>
select(-ends_with("comment"), -ends_with("key"))
#> # Source: SQL [?? x 15]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> customer o_orderstatus o_totalprice o_orderdate order_priority o_clerk
#> <dbl> <chr> <dbl> <date> <chr> <chr>
#> 1 7381 O 181585. 1996-01-02 5-LOW Clerk#000000…
#> 2 15601 O 46094. 1996-12-01 1-URGENT Clerk#000000…
#> 3 24664 F 271423. 1993-10-14 5-LOW Clerk#000000…
#> 4 27356 O 47915. 1995-10-11 5-LOW Clerk#000000…
#> 5 8897 F 136702. 1994-07-30 5-LOW Clerk#000000…
#> 6 11125 F 65071. 1992-02-21 4-NOT SPECIFIED Clerk#000000…
#> 7 7828 O 263865. 1996-01-10 2-HIGH Clerk#000000…
#> 8 26012 O 153480. 1995-07-16 2-HIGH Clerk#000000…
#> 9 13393 F 138123. 1993-10-27 3-MEDIUM Clerk#000000…
#> 10 12202 O 72249. 1998-07-21 3-MEDIUM Clerk#000000…
#> # ℹ more rows
#> # ℹ 9 more variables: o_shippriority <dbl>, c_name <chr>, c_address <chr>,
#> # c_phone <chr>, c_acctbal <dbl>, c_mktsegment <chr>, n_name <chr>,
#> # order_year <int>, order_month <int>
- Load resulting code to a variable called
prep_orders
<- rel_orders |>
prep_orders mutate(order_year = year(o_orderdate), order_month = month(o_orderdate)) |>
rename(customer = o_custkey) |>
select(-ends_with("comment"), -ends_with("key"))
- Preview
prep_orders
usingglimpse()
|>
prep_orders glimpse()
#> Rows: ??
#> Columns: 15
#> Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> $ customer <dbl> 7381, 15601, 24664, 27356, 8897, 11125, 7828, 26012, 13…
#> $ o_orderstatus <chr> "O", "O", "F", "O", "F", "F", "O", "O", "F", "O", "O", …
#> $ o_totalprice <dbl> 181585.13, 46093.67, 271422.96, 47915.12, 136701.72, 65…
#> $ o_orderdate <date> 1996-01-02, 1996-12-01, 1993-10-14, 1995-10-11, 1994-0…
#> $ order_priority <chr> "5-LOW", "1-URGENT", "5-LOW", "5-LOW", "5-LOW", "4-NOT …
#> $ o_clerk <chr> "Clerk#000000951", "Clerk#000000880", "Clerk#000000955"…
#> $ o_shippriority <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ c_name <chr> "Customer#000007381", "Customer#000015601", "Customer#0…
#> $ c_address <chr> "uAT93aEOFo7IUvh BMFgRm48hsr7LtPfQJ", "T2ALSWGjgRFWLxP4…
#> $ c_phone <chr> "30-666-139-1602", "32-397-926-3405", "14-383-701-6221"…
#> $ c_acctbal <dbl> 73.39, 7589.86, 570.97, -254.76, 2725.15, 2998.55, 3706…
#> $ c_mktsegment <chr> "BUILDING", "HOUSEHOLD", "FURNITURE", "MACHINERY", "BUI…
#> $ n_name <chr> "SAUDI ARABIA", "RUSSIA", "EGYPT", "ETHIOPIA", "ARGENTI…
#> $ order_year <int> 1996, 1996, 1993, 1995, 1994, 1992, 1996, 1995, 1993, 1…
#> $ order_month <int> 1, 12, 10, 10, 7, 2, 1, 7, 10, 7, 10, 11, 6, 8, 9, 7, 3…
3.4 Answering questions
Using the base query to answer more complex questions
- What are the top 5 countries for total amount ordered?
|>
prep_orders group_by(n_name) |>
summarise(
total_price = sum(o_totalprice, na.rm = TRUE)
|>
) arrange(desc(total_price)) |>
head(5)
#> # Source: SQL [5 x 2]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#> n_name total_price
#> <chr> <dbl>
#> 1 CHINA 1801806199.
#> 2 UNITED STATES 1774171034.
#> 3 EGYPT 1771552951.
#> 4 IRAN 1771537297.
#> 5 VIETNAM 1771445652.
- What are the top 5 countries for total amount ordered for 1998?
|>
prep_orders filter(order_year == 1998) |>
group_by(n_name) |>
summarise(
total_price = sum(o_totalprice, na.rm = TRUE)
|>
) arrange(desc(total_price)) |>
head(5)
#> # Source: SQL [5 x 2]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#> n_name total_price
#> <chr> <dbl>
#> 1 CHINA 171553459.
#> 2 UNITED STATES 171380901.
#> 3 UNITED KINGDOM 162354578.
#> 4 IRAN 160671344.
#> 5 ARGENTINA 160164705.
- What has been the top (1) country, in orders, by year?
|>
prep_orders group_by(n_name, order_year) |>
summarise(
total_price = sum(o_totalprice, na.rm = TRUE)
|>
) group_by(order_year) |>
filter(total_price == max(total_price))
#> # Source: SQL [7 x 3]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Groups: order_year
#> n_name order_year total_price
#> <chr> <int> <dbl>
#> 1 UNITED KINGDOM 1992 274006542.
#> 2 VIETNAM 1993 272855530.
#> 3 INDONESIA 1994 286622681.
#> 4 EGYPT 1995 281068319.
#> 5 VIETNAM 1996 281019062.
#> 6 UNITED STATES 1997 276025595.
#> 7 CHINA 1998 171553459.
- Who are the top 5 customers by amount ordered?
|>
prep_orders group_by(customer) |>
summarise(
total_price = sum(o_totalprice, na.rm = TRUE)
|>
) arrange(desc(total_price)) |>
head(5)
#> # Source: SQL [5 x 2]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#> customer total_price
#> <dbl> <dbl>
#> 1 691 6311210.
#> 2 25936 5917968.
#> 3 20959 5884257.
#> 4 4351 5830651.
#> 5 979 5820310.
- What is the country, and market segment, of the top 5 customers by amount ordered?
|>
prep_orders group_by(customer) |>
summarise(
country = first(n_name),
segment = first(c_mktsegment),
total_price = sum(o_totalprice, na.rm = TRUE)
|>
) arrange(desc(total_price)) |>
head(5)
#> # Source: SQL [5 x 4]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#> customer country segment total_price
#> <dbl> <chr> <chr> <dbl>
#> 1 691 MOZAMBIQUE MACHINERY 6311210.
#> 2 25936 UNITED STATES HOUSEHOLD 5917968.
#> 3 20959 GERMANY BUILDING 5884257.
#> 4 4351 IRAN AUTOMOBILE 5830651.
#> 5 979 CHINA MACHINERY 5820310.