Arrow In-Memory Exercise

library(arrow)
library(dplyr)
Arrow Table
  1. Read in a single NYC Taxi parquet file using read_parquet() as an Arrow Table

  2. Convert your Arrow Table object to a data.frame or a tibble

parquet_file <- "data/nyc-taxi/year=2019/month=9/part-0.parquet"

taxi_table <- read_parquet(parquet_file, as_data_frame = FALSE)
taxi_table
Table
6567396 rows x 22 columns
$vendor_name <string>
$pickup_datetime <timestamp[ms]>
$dropoff_datetime <timestamp[ms]>
$passenger_count <int64>
$trip_distance <double>
$pickup_longitude <double>
$pickup_latitude <double>
$rate_code <string>
$store_and_fwd <string>
$dropoff_longitude <double>
$dropoff_latitude <double>
$payment_type <string>
$fare_amount <double>
$extra <double>
$mta_tax <double>
$tip_amount <double>
$tolls_amount <double>
$total_amount <double>
$improvement_surcharge <double>
$congestion_surcharge <double>
...
2 more columns
Use `schema()` to see entire schema
taxi_table |> collect()
# A tibble: 6,567,396 × 22
   vendor_name pickup_datetime     dropoff_datetime    passenger_count
   <chr>       <dttm>              <dttm>                        <int>
 1 CMT         2019-08-31 18:09:30 2019-08-31 18:15:42               1
 2 CMT         2019-08-31 18:26:30 2019-08-31 18:44:31               1
 3 CMT         2019-08-31 18:39:35 2019-08-31 19:15:55               2
 4 VTS         2019-08-31 18:12:26 2019-08-31 18:15:17               4
 5 VTS         2019-08-31 18:43:16 2019-08-31 18:53:50               1
 6 VTS         2019-08-31 18:26:13 2019-08-31 18:45:35               1
 7 CMT         2019-08-31 18:34:52 2019-08-31 18:42:03               1
 8 CMT         2019-08-31 18:50:02 2019-08-31 18:58:16               1
 9 CMT         2019-08-31 18:08:02 2019-08-31 18:14:44               0
10 VTS         2019-08-31 18:11:38 2019-08-31 18:26:47               1
# ℹ 6,567,386 more rows
# ℹ 18 more variables: trip_distance <dbl>, pickup_longitude <dbl>,
#   pickup_latitude <dbl>, rate_code <chr>, store_and_fwd <chr>,
#   dropoff_longitude <dbl>, dropoff_latitude <dbl>, payment_type <chr>,
#   fare_amount <dbl>, extra <dbl>, mta_tax <dbl>, tip_amount <dbl>,
#   tolls_amount <dbl>, total_amount <dbl>, improvement_surcharge <dbl>,
#   congestion_surcharge <dbl>, pickup_location_id <int>, …

or

as_tibble(taxi_table)
# A tibble: 6,567,396 × 22
   vendor_name pickup_datetime     dropoff_datetime    passenger_count
   <chr>       <dttm>              <dttm>                        <int>
 1 CMT         2019-08-31 18:09:30 2019-08-31 18:15:42               1
 2 CMT         2019-08-31 18:26:30 2019-08-31 18:44:31               1
 3 CMT         2019-08-31 18:39:35 2019-08-31 19:15:55               2
 4 VTS         2019-08-31 18:12:26 2019-08-31 18:15:17               4
 5 VTS         2019-08-31 18:43:16 2019-08-31 18:53:50               1
 6 VTS         2019-08-31 18:26:13 2019-08-31 18:45:35               1
 7 CMT         2019-08-31 18:34:52 2019-08-31 18:42:03               1
 8 CMT         2019-08-31 18:50:02 2019-08-31 18:58:16               1
 9 CMT         2019-08-31 18:08:02 2019-08-31 18:14:44               0
10 VTS         2019-08-31 18:11:38 2019-08-31 18:26:47               1
# ℹ 6,567,386 more rows
# ℹ 18 more variables: trip_distance <dbl>, pickup_longitude <dbl>,
#   pickup_latitude <dbl>, rate_code <chr>, store_and_fwd <chr>,
#   dropoff_longitude <dbl>, dropoff_latitude <dbl>, payment_type <chr>,
#   fare_amount <dbl>, extra <dbl>, mta_tax <dbl>, tip_amount <dbl>,
#   tolls_amount <dbl>, total_amount <dbl>, improvement_surcharge <dbl>,
#   congestion_surcharge <dbl>, pickup_location_id <int>, …

or

as.data.frame(taxi_table)
  vendor_name     pickup_datetime    dropoff_datetime passenger_count
1         CMT 2019-08-31 18:09:30 2019-08-31 18:15:42               1
2         CMT 2019-08-31 18:26:30 2019-08-31 18:44:31               1
3         CMT 2019-08-31 18:39:35 2019-08-31 19:15:55               2
4         VTS 2019-08-31 18:12:26 2019-08-31 18:15:17               4
5         VTS 2019-08-31 18:43:16 2019-08-31 18:53:50               1
  trip_distance pickup_longitude pickup_latitude     rate_code store_and_fwd
1          0.80               NA              NA Standard rate            No
2          3.70               NA              NA Standard rate            No
3          8.10               NA              NA Standard rate            No
4          0.58               NA              NA Standard rate            No
5          3.32               NA              NA Standard rate            No
  dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax
1                NA               NA         Cash         6.0   3.0     0.5
2                NA               NA  Credit card        15.0   3.0     0.5
3                NA               NA  Credit card        29.5   3.0     0.5
4                NA               NA  Credit card         4.0   0.5     0.5
5                NA               NA  Credit card        12.5   0.5     0.5
  tip_amount tolls_amount total_amount improvement_surcharge
1       0.00            0         9.80                   0.3
2       3.00            0        21.80                   0.3
3       0.00            0        33.30                   0.3
4       1.06            0         6.36                   0.3
5       3.26            0        19.56                   0.3
  congestion_surcharge pickup_location_id dropoff_location_id
1                  2.5                148                  79
2                  2.5                148                 230
3                  2.5                 79                 188
4                  0.0                 80                  80
5                  2.5                148                  87
 [ reached 'max' / getOption("max.print") -- omitted 6567391 rows ]