3 Preparing and exploring data

Catch up

library(dplyr)
library(dbplyr)
library(DBI)

con <- dbConnect(
  odbc::databricks(),
  HTTPPath = "/sql/1.0/warehouses/300bd24ba12adf8e"
)
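odbc::databricks() typically picks up the workspace host and token from the DATABRICKS_HOST and DATABRICKS_TOKEN environment variables (other credential setups work too); the HTTPPath above is specific to the workshop warehouse. If you want a quick sanity check that the connection works before continuing, something like this should do:

# Run a trivial query against the warehouse; an error here points at
# credentials or the HTTPPath rather than at the exercises that follow
DBI::dbGetQuery(con, "SELECT 1 AS ok")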

3.1 Selecting variables

Simple strategies to reorder and reduce the data you work with

  1. Load the customer table into a variable called customer
customer <- tbl(con, I("workshops.tpch.customer"))
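The I() wrapper tells dbplyr to treat the string as a single, already-qualified identifier (catalog.schema.table). If you prefer, dbplyr's in_catalog() helper spells the three parts out explicitly and should point at the same table:

customer <- tbl(con, in_catalog("workshops", "tpch", "customer"))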
  2. Select all columns that end with “key”
customer |> 
  select(ends_with("key"))
#> # Source:   SQL [?? x 2]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    c_custkey c_nationkey
#>        <dbl>       <dbl>
#>  1         1          15
#>  2         2          13
#>  3         3           1
#>  4         4           4
#>  5         5           3
#>  6         6          20
#>  7         7          18
#>  8         8          17
#>  9         9           8
#> 10        10           5
#> # ℹ more rows
  3. Move all columns that end with “key” to the front
customer |> 
  select(ends_with("key"), everything())
#> # Source:   SQL [?? x 8]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    c_custkey c_nationkey c_name         c_address c_phone c_acctbal c_mktsegment
#>        <dbl>       <dbl> <chr>          <chr>     <chr>       <dbl> <chr>       
#>  1         1          15 Customer#0000… IVhzIApe… 25-989…      712. BUILDING    
#>  2         2          13 Customer#0000… XSTf4,NC… 23-768…      122. AUTOMOBILE  
#>  3         3           1 Customer#0000… MG9kdTD2… 11-719…     7498. AUTOMOBILE  
#>  4         4           4 Customer#0000… XxVSJsLA… 14-128…     2867. MACHINERY   
#>  5         5           3 Customer#0000… KvpyuHCp… 13-750…      794. HOUSEHOLD   
#>  6         6          20 Customer#0000… sKZz0Csn… 30-114…     7639. AUTOMOBILE  
#>  7         7          18 Customer#0000… TcGe5gaZ… 28-190…     9562. AUTOMOBILE  
#>  8         8          17 Customer#0000… I0B10bB0… 27-147…     6820. BUILDING    
#>  9         9           8 Customer#0000… xKiAFTjU… 18-338…     8324. FURNITURE   
#> 10        10           5 Customer#0000… 6LrEaV6K… 15-741…     2754. HOUSEHOLD   
#> # ℹ more rows
#> # ℹ 1 more variable: c_comment <chr>
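dplyr also has relocate(), which is designed for moving columns without dropping any. Recent dbplyr versions translate it, so this should give the same result as the select() above:

customer |> 
  relocate(ends_with("key"))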
  4. Select all columns that do not end with “key”
customer |> 
  select(-ends_with("key"))
#> # Source:   SQL [?? x 6]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    c_name             c_address         c_phone c_acctbal c_mktsegment c_comment
#>    <chr>              <chr>             <chr>       <dbl> <chr>        <chr>    
#>  1 Customer#000000001 IVhzIApeRb ot,c,E 25-989…      712. BUILDING     "to the …
#>  2 Customer#000000002 XSTf4,NCwDVaWNe6… 23-768…      122. AUTOMOBILE   "l accou…
#>  3 Customer#000000003 MG9kdTD2WBHm      11-719…     7498. AUTOMOBILE   " deposi…
#>  4 Customer#000000004 XxVSJsLAGtn       14-128…     2867. MACHINERY    " reques…
#>  5 Customer#000000005 KvpyuHCplrB84WgA… 13-750…      794. HOUSEHOLD    "n accou…
#>  6 Customer#000000006 sKZz0CsnMD7mp4Xd… 30-114…     7639. AUTOMOBILE   "tions. …
#>  7 Customer#000000007 TcGe5gaZNgVePxU5… 28-190…     9562. AUTOMOBILE   "ainst t…
#>  8 Customer#000000008 I0B10bB0AymmC, 0… 27-147…     6820. BUILDING     "among t…
#>  9 Customer#000000009 xKiAFTjUsCuxfele… 18-338…     8324. FURNITURE    "r theod…
#> 10 Customer#000000010 6LrEaV6KR6PLVcgl… 15-741…     2754. HOUSEHOLD    "es regu…
#> # ℹ more rows
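None of these select() calls pull data into R; dbplyr builds a SQL query and only previews the result. To see the SQL it generates for any of these pipelines, append show_query() (the exact text depends on the backend):

customer |> 
  select(ends_with("key")) |> 
  show_query()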

3.2 Join two tables

Using left_join() to relate two tables

  1. Load the nation table into a variable called nation
nation <- tbl(con, I("workshops.tpch.nation"))
  2. Use left_join() to relate customer with nation using the nation key
customer |> 
  left_join(nation, by = c("c_nationkey" = "n_nationkey"))
#> # Source:   SQL [?? x 11]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    c_custkey c_name         c_address c_nationkey c_phone c_acctbal c_mktsegment
#>        <dbl> <chr>          <chr>           <dbl> <chr>       <dbl> <chr>       
#>  1         1 Customer#0000… IVhzIApe…          15 25-989…      712. BUILDING    
#>  2         2 Customer#0000… XSTf4,NC…          13 23-768…      122. AUTOMOBILE  
#>  3         3 Customer#0000… MG9kdTD2…           1 11-719…     7498. AUTOMOBILE  
#>  4         4 Customer#0000… XxVSJsLA…           4 14-128…     2867. MACHINERY   
#>  5         5 Customer#0000… KvpyuHCp…           3 13-750…      794. HOUSEHOLD   
#>  6         6 Customer#0000… sKZz0Csn…          20 30-114…     7639. AUTOMOBILE  
#>  7         7 Customer#0000… TcGe5gaZ…          18 28-190…     9562. AUTOMOBILE  
#>  8         8 Customer#0000… I0B10bB0…          17 27-147…     6820. BUILDING    
#>  9         9 Customer#0000… xKiAFTjU…           8 18-338…     8324. FURNITURE   
#> 10        10 Customer#0000… 6LrEaV6K…           5 15-741…     2754. HOUSEHOLD   
#> # ℹ more rows
#> # ℹ 4 more variables: c_comment <chr>, n_name <chr>, n_regionkey <dbl>,
#> #   n_comment <chr>
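The character-vector form of by works on every dplyr version; if you are on dplyr 1.1 or later, you can also write the same join with join_by(), which reads more like the condition it expresses:

customer |> 
  left_join(nation, by = join_by(c_nationkey == n_nationkey))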
  3. What are the 5 countries with the most customers?
customer |> 
  left_join(nation, by = c("c_nationkey" = "n_nationkey")) |> 
  count(n_name, sort = TRUE) |> 
  head(5)
#> # Source:     SQL [5 x 2]
#> # Database:   Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(n)
#>   n_name              n
#>   <chr>         <int64>
#> 1 UNITED STATES    1260
#> 2 IRAQ             1249
#> 3 IRAN             1248
#> 4 CHINA            1244
#> 5 EGYPT            1225
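count(n_name, sort = TRUE) is shorthand for grouping, summarising with n(), and arranging. The longer spelling below should return the same five rows and is easier to extend when you need more than a count:

customer |> 
  left_join(nation, by = c("c_nationkey" = "n_nationkey")) |> 
  group_by(n_name) |> 
  summarise(n = n()) |> 
  arrange(desc(n)) |> 
  head(5)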

3.3 Prepare the base query

Building the base query that the rest of the exercises will use

  1. Load the orders table into a variable called orders
orders <- tbl(con, I("workshops.tpch.orders"))
  2. Join orders to the customer table. Relate them on the o_custkey and c_custkey fields.
orders |> 
  left_join(customer, by = c("o_custkey" = "c_custkey"))
#> # Source:   SQL [?? x 16]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    o_orderkey o_custkey o_orderstatus o_totalprice o_orderdate order_priority 
#>         <dbl>     <dbl> <chr>                <dbl> <date>      <chr>          
#>  1          1      7381 O                  181585. 1996-01-02  5-LOW          
#>  2          2     15601 O                   46094. 1996-12-01  1-URGENT       
#>  3          3     24664 F                  271423. 1993-10-14  5-LOW          
#>  4          4     27356 O                   47915. 1995-10-11  5-LOW          
#>  5          5      8897 F                  136702. 1994-07-30  5-LOW          
#>  6          6     11125 F                   65071. 1992-02-21  4-NOT SPECIFIED
#>  7          7      7828 O                  263865. 1996-01-10  2-HIGH         
#>  8         32     26012 O                  153480. 1995-07-16  2-HIGH         
#>  9         33     13393 F                  138123. 1993-10-27  3-MEDIUM       
#> 10         34     12202 O                   72249. 1998-07-21  3-MEDIUM       
#> # ℹ more rows
#> # ℹ 10 more variables: o_clerk <chr>, o_shippriority <dbl>, o_comment <chr>,
#> #   c_name <chr>, c_address <chr>, c_nationkey <dbl>, c_phone <chr>,
#> #   c_acctbal <dbl>, c_mktsegment <chr>, c_comment <chr>
  3. Join the nation table to the result of the previous join. Use the c_nationkey and n_nationkey fields to relate them.
orders |> 
  left_join(customer, by = c("o_custkey" = "c_custkey")) |> 
  left_join(nation, by = c("c_nationkey" = "n_nationkey"))
#> # Source:   SQL [?? x 19]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    o_orderkey o_custkey o_orderstatus o_totalprice o_orderdate order_priority 
#>         <dbl>     <dbl> <chr>                <dbl> <date>      <chr>          
#>  1          1      7381 O                  181585. 1996-01-02  5-LOW          
#>  2          2     15601 O                   46094. 1996-12-01  1-URGENT       
#>  3          3     24664 F                  271423. 1993-10-14  5-LOW          
#>  4          4     27356 O                   47915. 1995-10-11  5-LOW          
#>  5          5      8897 F                  136702. 1994-07-30  5-LOW          
#>  6          6     11125 F                   65071. 1992-02-21  4-NOT SPECIFIED
#>  7          7      7828 O                  263865. 1996-01-10  2-HIGH         
#>  8         32     26012 O                  153480. 1995-07-16  2-HIGH         
#>  9         33     13393 F                  138123. 1993-10-27  3-MEDIUM       
#> 10         34     12202 O                   72249. 1998-07-21  3-MEDIUM       
#> # ℹ more rows
#> # ℹ 13 more variables: o_clerk <chr>, o_shippriority <dbl>, o_comment <chr>,
#> #   c_name <chr>, c_address <chr>, c_nationkey <dbl>, c_phone <chr>,
#> #   c_acctbal <dbl>, c_mktsegment <chr>, c_comment <chr>, n_name <chr>,
#> #   n_regionkey <dbl>, n_comment <chr>
  4. Save the resulting query in a variable called rel_orders. We do this so that autocomplete works in the following steps.
rel_orders <- orders |> 
  left_join(customer, by = c("o_custkey" = "c_custkey")) |> 
  left_join(nation, by = c("c_nationkey" = "n_nationkey"))
  5. Create new columns for the year and the month of the order date. Name them order_year and order_month, respectively.
rel_orders |> 
  mutate(order_year = year(o_orderdate), order_month = month(o_orderdate))
#> # Source:   SQL [?? x 21]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    o_orderkey o_custkey o_orderstatus o_totalprice o_orderdate order_priority 
#>         <dbl>     <dbl> <chr>                <dbl> <date>      <chr>          
#>  1          1      7381 O                  181585. 1996-01-02  5-LOW          
#>  2          2     15601 O                   46094. 1996-12-01  1-URGENT       
#>  3          3     24664 F                  271423. 1993-10-14  5-LOW          
#>  4          4     27356 O                   47915. 1995-10-11  5-LOW          
#>  5          5      8897 F                  136702. 1994-07-30  5-LOW          
#>  6          6     11125 F                   65071. 1992-02-21  4-NOT SPECIFIED
#>  7          7      7828 O                  263865. 1996-01-10  2-HIGH         
#>  8         32     26012 O                  153480. 1995-07-16  2-HIGH         
#>  9         33     13393 F                  138123. 1993-10-27  3-MEDIUM       
#> 10         34     12202 O                   72249. 1998-07-21  3-MEDIUM       
#> # ℹ more rows
#> # ℹ 15 more variables: o_clerk <chr>, o_shippriority <dbl>, o_comment <chr>,
#> #   c_name <chr>, c_address <chr>, c_nationkey <dbl>, c_phone <chr>,
#> #   c_acctbal <dbl>, c_mktsegment <chr>, c_comment <chr>, n_name <chr>,
#> #   n_regionkey <dbl>, n_comment <chr>, order_year <int>, order_month <int>
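year() and month() are not run in R here: dbplyr translates them (or passes them through) as the warehouse's own date functions, so lubridate is not needed. As before, show_query() will reveal the translated SQL if you are curious:

rel_orders |> 
  mutate(order_year = year(o_orderdate), order_month = month(o_orderdate)) |> 
  show_query()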
  6. Remove any columns that end in “comment” or in “key”
rel_orders |> 
  mutate(order_year = year(o_orderdate), order_month = month(o_orderdate)) |> 
  select(-ends_with("comment"), -ends_with("key")) 
#> # Source:   SQL [?? x 14]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    o_orderstatus o_totalprice o_orderdate order_priority  o_clerk o_shippriority
#>    <chr>                <dbl> <date>      <chr>           <chr>            <dbl>
#>  1 O                  181585. 1996-01-02  5-LOW           Clerk#…              0
#>  2 O                   46094. 1996-12-01  1-URGENT        Clerk#…              0
#>  3 F                  271423. 1993-10-14  5-LOW           Clerk#…              0
#>  4 O                   47915. 1995-10-11  5-LOW           Clerk#…              0
#>  5 F                  136702. 1994-07-30  5-LOW           Clerk#…              0
#>  6 F                   65071. 1992-02-21  4-NOT SPECIFIED Clerk#…              0
#>  7 O                  263865. 1996-01-10  2-HIGH          Clerk#…              0
#>  8 O                  153480. 1995-07-16  2-HIGH          Clerk#…              0
#>  9 F                  138123. 1993-10-27  3-MEDIUM        Clerk#…              0
#> 10 O                   72249. 1998-07-21  3-MEDIUM        Clerk#…              0
#> # ℹ more rows
#> # ℹ 8 more variables: c_name <chr>, c_address <chr>, c_phone <chr>,
#> #   c_acctbal <dbl>, c_mktsegment <chr>, n_name <chr>, order_year <int>,
#> #   order_month <int>
  7. Rename o_custkey to customer, inserting the rename() step before the select()
rel_orders |> 
  mutate(order_year = year(o_orderdate), order_month = month(o_orderdate)) |> 
  rename(customer = o_custkey) |> 
  select(-ends_with("comment"), -ends_with("key")) 
#> # Source:   SQL [?? x 15]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#>    customer o_orderstatus o_totalprice o_orderdate order_priority  o_clerk      
#>       <dbl> <chr>                <dbl> <date>      <chr>           <chr>        
#>  1     7381 O                  181585. 1996-01-02  5-LOW           Clerk#000000…
#>  2    15601 O                   46094. 1996-12-01  1-URGENT        Clerk#000000…
#>  3    24664 F                  271423. 1993-10-14  5-LOW           Clerk#000000…
#>  4    27356 O                   47915. 1995-10-11  5-LOW           Clerk#000000…
#>  5     8897 F                  136702. 1994-07-30  5-LOW           Clerk#000000…
#>  6    11125 F                   65071. 1992-02-21  4-NOT SPECIFIED Clerk#000000…
#>  7     7828 O                  263865. 1996-01-10  2-HIGH          Clerk#000000…
#>  8    26012 O                  153480. 1995-07-16  2-HIGH          Clerk#000000…
#>  9    13393 F                  138123. 1993-10-27  3-MEDIUM        Clerk#000000…
#> 10    12202 O                   72249. 1998-07-21  3-MEDIUM        Clerk#000000…
#> # ℹ more rows
#> # ℹ 9 more variables: o_shippriority <dbl>, c_name <chr>, c_address <chr>,
#> #   c_phone <chr>, c_acctbal <dbl>, c_mktsegment <chr>, n_name <chr>,
#> #   order_year <int>, order_month <int>
  8. Save the resulting query in a variable called prep_orders
prep_orders <- rel_orders |> 
  mutate(order_year = year(o_orderdate), order_month = month(o_orderdate)) |> 
  rename(customer = o_custkey) |> 
  select(-ends_with("comment"), -ends_with("key")) 
  9. Preview prep_orders using glimpse()
prep_orders |> 
  glimpse()
#> Rows: ??
#> Columns: 15
#> Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> $ customer       <dbl> 7381, 15601, 24664, 27356, 8897, 11125, 7828, 26012, 13…
#> $ o_orderstatus  <chr> "O", "O", "F", "O", "F", "F", "O", "O", "F", "O", "O", …
#> $ o_totalprice   <dbl> 181585.13, 46093.67, 271422.96, 47915.12, 136701.72, 65…
#> $ o_orderdate    <date> 1996-01-02, 1996-12-01, 1993-10-14, 1995-10-11, 1994-0…
#> $ order_priority <chr> "5-LOW", "1-URGENT", "5-LOW", "5-LOW", "5-LOW", "4-NOT …
#> $ o_clerk        <chr> "Clerk#000000951", "Clerk#000000880", "Clerk#000000955"…
#> $ o_shippriority <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ c_name         <chr> "Customer#000007381", "Customer#000015601", "Customer#0…
#> $ c_address      <chr> "uAT93aEOFo7IUvh BMFgRm48hsr7LtPfQJ", "T2ALSWGjgRFWLxP4…
#> $ c_phone        <chr> "30-666-139-1602", "32-397-926-3405", "14-383-701-6221"…
#> $ c_acctbal      <dbl> 73.39, 7589.86, 570.97, -254.76, 2725.15, 2998.55, 3706…
#> $ c_mktsegment   <chr> "BUILDING", "HOUSEHOLD", "FURNITURE", "MACHINERY", "BUI…
#> $ n_name         <chr> "SAUDI ARABIA", "RUSSIA", "EGYPT", "ETHIOPIA", "ARGENTI…
#> $ order_year     <int> 1996, 1996, 1993, 1995, 1994, 1992, 1996, 1995, 1993, 1…
#> $ order_month    <int> 1, 12, 10, 10, 7, 2, 1, 7, 10, 7, 10, 11, 6, 8, 9, 7, 3…
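prep_orders is still a lazy query; nothing has been computed or downloaded yet. If you want a small local copy to experiment with, collecting after a head() keeps the transfer modest:

# Bring only the first 100 rows into an R data frame
local_orders <- prep_orders |> 
  head(100) |> 
  collect()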

3.4 Answering questions

Using the base query to answer more complex questions

  1. What are the top 5 countries for total amount ordered?
prep_orders |> 
  group_by(n_name) |> 
  summarise(
    total_price = sum(o_totalprice, na.rm = TRUE)
  ) |> 
  arrange(desc(total_price)) |> 
  head(5)
#> # Source:     SQL [5 x 2]
#> # Database:   Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#>   n_name        total_price
#>   <chr>               <dbl>
#> 1 CHINA         1801806199.
#> 2 UNITED STATES 1774171034.
#> 3 EGYPT         1771552951.
#> 4 IRAN          1771537297.
#> 5 VIETNAM       1771445652.
  2. What are the top 5 countries for total amount ordered in 1998?
prep_orders |> 
  filter(order_year == 1998) |> 
  group_by(n_name) |> 
  summarise(
    total_price = sum(o_totalprice, na.rm = TRUE)
  ) |> 
  arrange(desc(total_price)) |> 
  head(5)
#> # Source:     SQL [5 x 2]
#> # Database:   Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#>   n_name         total_price
#>   <chr>                <dbl>
#> 1 CHINA           171553459.
#> 2 UNITED STATES   171380901.
#> 3 UNITED KINGDOM  162354578.
#> 4 IRAN            160671344.
#> 5 ARGENTINA       160164705.
  3. What has been the top country, by amount ordered, for each year?
prep_orders |> 
  group_by(n_name, order_year) |> 
  summarise(
    total_price = sum(o_totalprice, na.rm = TRUE)
  ) |> 
  group_by(order_year) |> 
  filter(total_price == max(total_price))
#> # Source:   SQL [7 x 3]
#> # Database: Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Groups:   order_year
#>   n_name         order_year total_price
#>   <chr>               <int>       <dbl>
#> 1 UNITED KINGDOM       1992  274006542.
#> 2 VIETNAM              1993  272855530.
#> 3 INDONESIA            1994  286622681.
#> 4 EGYPT                1995  281068319.
#> 5 VIETNAM              1996  281019062.
#> 6 UNITED STATES        1997  276025595.
#> 7 CHINA                1998  171553459.
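The grouped filter() above becomes a window function in the generated SQL. dplyr's slice_max() expresses the same idea more directly, and recent dbplyr versions can translate it as well, so this should be an equivalent way to write the query:

prep_orders |> 
  group_by(n_name, order_year) |> 
  summarise(total_price = sum(o_totalprice, na.rm = TRUE)) |> 
  group_by(order_year) |> 
  slice_max(total_price, n = 1)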
  4. Who are the top 5 customers by amount ordered?
prep_orders |> 
  group_by(customer) |> 
  summarise(
    total_price = sum(o_totalprice, na.rm = TRUE)
    ) |> 
  arrange(desc(total_price)) |> 
  head(5)
#> # Source:     SQL [5 x 2]
#> # Database:   Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#>   customer total_price
#>      <dbl>       <dbl>
#> 1      691    6311210.
#> 2    25936    5917968.
#> 3    20959    5884257.
#> 4     4351    5830651.
#> 5      979    5820310.
  5. What are the country and market segment of the top 5 customers by amount ordered?
prep_orders |> 
  group_by(customer) |> 
  summarise(
    country = first(n_name), 
    segment = first(c_mktsegment),
    total_price = sum(o_totalprice, na.rm = TRUE)
    ) |> 
  arrange(desc(total_price)) |> 
  head(5)
#> # Source:     SQL [5 x 4]
#> # Database:   Spark SQL 3.1.1[token@Spark SQL/hive_metastore]
#> # Ordered by: desc(total_price)
#>   customer country       segment    total_price
#>      <dbl> <chr>         <chr>            <dbl>
#> 1      691 MOZAMBIQUE    MACHINERY     6311210.
#> 2    25936 UNITED STATES HOUSEHOLD     5917968.
#> 3    20959 GERMANY       BUILDING      5884257.
#> 4     4351 IRAN          AUTOMOBILE    5830651.
#> 5      979 CHINA         MACHINERY     5820310.
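The “total price by group, sorted, top 5” pattern has now appeared several times. If you find yourself repeating it, a small helper using the embrace operator ({{ }}) works the same way against a database backend. This is only a sketch, not part of the original exercises:

# Hypothetical helper: top 5 of any grouping column by total order amount
top_5_by <- function(data, by) {
  data |> 
    group_by({{ by }}) |> 
    summarise(total_price = sum(o_totalprice, na.rm = TRUE)) |> 
    arrange(desc(total_price)) |> 
    head(5)
}

prep_orders |> top_5_by(n_name)
prep_orders |> top_5_by(customer)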