library(arrow)
library(dplyr)
Arrow In-Memory Exercise
Arrow Table
Read in a single NYC Taxi parquet file using
read_parquet()
as an Arrow TableConvert your Arrow Table object to a
data.frame
or atibble
<- "data/nyc-taxi/year=2019/month=9/part-0.parquet"
parquet_file
<- read_parquet(parquet_file, as_data_frame = FALSE)
taxi_table taxi_table
Table
6567396 rows x 22 columns
$vendor_name <string>
$pickup_datetime <timestamp[ms]>
$dropoff_datetime <timestamp[ms]>
$passenger_count <int64>
$trip_distance <double>
$pickup_longitude <double>
$pickup_latitude <double>
$rate_code <string>
$store_and_fwd <string>
$dropoff_longitude <double>
$dropoff_latitude <double>
$payment_type <string>
$fare_amount <double>
$extra <double>
$mta_tax <double>
$tip_amount <double>
$tolls_amount <double>
$total_amount <double>
$improvement_surcharge <double>
$congestion_surcharge <double>
...
2 more columns
Use `schema()` to see entire schema
|> collect() taxi_table
# A tibble: 6,567,396 × 22
vendor_name pickup_datetime dropoff_datetime passenger_count
<chr> <dttm> <dttm> <int>
1 CMT 2019-08-31 18:09:30 2019-08-31 18:15:42 1
2 CMT 2019-08-31 18:26:30 2019-08-31 18:44:31 1
3 CMT 2019-08-31 18:39:35 2019-08-31 19:15:55 2
4 VTS 2019-08-31 18:12:26 2019-08-31 18:15:17 4
5 VTS 2019-08-31 18:43:16 2019-08-31 18:53:50 1
6 VTS 2019-08-31 18:26:13 2019-08-31 18:45:35 1
7 CMT 2019-08-31 18:34:52 2019-08-31 18:42:03 1
8 CMT 2019-08-31 18:50:02 2019-08-31 18:58:16 1
9 CMT 2019-08-31 18:08:02 2019-08-31 18:14:44 0
10 VTS 2019-08-31 18:11:38 2019-08-31 18:26:47 1
# ℹ 6,567,386 more rows
# ℹ 18 more variables: trip_distance <dbl>, pickup_longitude <dbl>,
# pickup_latitude <dbl>, rate_code <chr>, store_and_fwd <chr>,
# dropoff_longitude <dbl>, dropoff_latitude <dbl>, payment_type <chr>,
# fare_amount <dbl>, extra <dbl>, mta_tax <dbl>, tip_amount <dbl>,
# tolls_amount <dbl>, total_amount <dbl>, improvement_surcharge <dbl>,
# congestion_surcharge <dbl>, pickup_location_id <int>, …
or
as_tibble(taxi_table)
# A tibble: 6,567,396 × 22
vendor_name pickup_datetime dropoff_datetime passenger_count
<chr> <dttm> <dttm> <int>
1 CMT 2019-08-31 18:09:30 2019-08-31 18:15:42 1
2 CMT 2019-08-31 18:26:30 2019-08-31 18:44:31 1
3 CMT 2019-08-31 18:39:35 2019-08-31 19:15:55 2
4 VTS 2019-08-31 18:12:26 2019-08-31 18:15:17 4
5 VTS 2019-08-31 18:43:16 2019-08-31 18:53:50 1
6 VTS 2019-08-31 18:26:13 2019-08-31 18:45:35 1
7 CMT 2019-08-31 18:34:52 2019-08-31 18:42:03 1
8 CMT 2019-08-31 18:50:02 2019-08-31 18:58:16 1
9 CMT 2019-08-31 18:08:02 2019-08-31 18:14:44 0
10 VTS 2019-08-31 18:11:38 2019-08-31 18:26:47 1
# ℹ 6,567,386 more rows
# ℹ 18 more variables: trip_distance <dbl>, pickup_longitude <dbl>,
# pickup_latitude <dbl>, rate_code <chr>, store_and_fwd <chr>,
# dropoff_longitude <dbl>, dropoff_latitude <dbl>, payment_type <chr>,
# fare_amount <dbl>, extra <dbl>, mta_tax <dbl>, tip_amount <dbl>,
# tolls_amount <dbl>, total_amount <dbl>, improvement_surcharge <dbl>,
# congestion_surcharge <dbl>, pickup_location_id <int>, …
or
as.data.frame(taxi_table)
vendor_name pickup_datetime dropoff_datetime passenger_count
1 CMT 2019-08-31 18:09:30 2019-08-31 18:15:42 1
2 CMT 2019-08-31 18:26:30 2019-08-31 18:44:31 1
3 CMT 2019-08-31 18:39:35 2019-08-31 19:15:55 2
4 VTS 2019-08-31 18:12:26 2019-08-31 18:15:17 4
5 VTS 2019-08-31 18:43:16 2019-08-31 18:53:50 1
trip_distance pickup_longitude pickup_latitude rate_code store_and_fwd
1 0.80 NA NA Standard rate No
2 3.70 NA NA Standard rate No
3 8.10 NA NA Standard rate No
4 0.58 NA NA Standard rate No
5 3.32 NA NA Standard rate No
dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax
1 NA NA Cash 6.0 3.0 0.5
2 NA NA Credit card 15.0 3.0 0.5
3 NA NA Credit card 29.5 3.0 0.5
4 NA NA Credit card 4.0 0.5 0.5
5 NA NA Credit card 12.5 0.5 0.5
tip_amount tolls_amount total_amount improvement_surcharge
1 0.00 0 9.80 0.3
2 3.00 0 21.80 0.3
3 0.00 0 33.30 0.3
4 1.06 0 6.36 0.3
5 3.26 0 19.56 0.3
congestion_surcharge pickup_location_id dropoff_location_id
1 2.5 148 79
2 2.5 148 230
3 2.5 79 188
4 0.0 80 80
5 2.5 148 87
[ reached 'max' / getOption("max.print") -- omitted 6567391 rows ]