Triamus / play

play repo for experiments (mainly with git)
1 stars 0 forks source link

apache drill #15

Open Triamus opened 6 years ago

Triamus commented 6 years ago

Apache Drill & Parquet

-- Join the CSV copy of the claims data to its Parquet copy on row_id and
-- return a handful of rows for one blind_make value.
SELECT
    claims_csv.row_id,
    claims_csv.name,
    claims_parquet.vehicle,
    claims_parquet.model_year
FROM dfs.tmp.`D:/data/other/claims.csv` AS claims_csv
LEFT JOIN dfs.tmp.`D:/data/other/claims.parquet` AS claims_parquet
    ON claims_csv.row_id = claims_parquet.row_id
-- The filter is on the left-hand (CSV) side, so unmatched CSV rows are kept
-- with NULL Parquet columns.
WHERE claims_csv.blind_make = 'AR'
-- No ORDER BY: which five rows come back is not fixed.
LIMIT 5;
-- Compact single-line form of the CSV-to-Parquet left join on row_id.
-- String literals take single quotes in Drill; double quotes are not a string delimiter.
select csv.row_id, csv.name, parquet.vehicle, parquet.model_year from dfs.tmp.`D:/data/other/claims.csv` csv left join dfs.tmp.`D:/data/other/claims.parquet` parquet on csv.row_id = parquet.row_id where csv.blind_make = 'AR' limit 5;

---
title: "R Notebook"
output: html_notebook
editor_options:
  chunk_output_type: console
---

# Attach sergeant, which supplies the src_drill()/drill_*() functions used below.
library(sergeant)

# "localhost" is right when Drill runs standalone on this machine; otherwise
# pass the hostname or IP address of the Drill server.
ds <- src_drill("localhost")
ds

# Second connection object, built with drill_connection()'s default arguments.
dc <- drill_connection()
dc

# Server-level information, one call per topic.
drill_status(dc)
drill_version(dc)
drill_metrics(dc)
drill_options(dc)
drill_stats(dc)
drill_storage(dc)
drill_threads(dc)

# Contents of the dfs.tmp workspace, then the list of schemas.
drill_show_files(dc, "dfs.tmp")
drill_show_schemas(dc)

library(tidyverse)
# ds %>% drill_version()

See the available SQL translation methods:

# Print the SQL translation environment for a src_drill() connection created
# with default arguments.
sql_translate_env(src_drill()$con)

# First five rows of the flight-delay sample, read from the CSV file ...
drill_query(dc, "SELECT * FROM dfs.`C:/data/flight_delay/Flight_Delays_Sample.csv` limit 5")

# ... and the same from the Parquet file.
drill_query(dc, "SELECT * FROM dfs.`C:/data/flight_delay/Flight_Delays_Sample.parquet` limit 5")

# Table handle for cp.`employee.json`, obtained through the ds source.
db <- tbl(ds, "cp.`employee.json`")

# Package installation and attach, kept in the order they appear in the notes.
install.packages("sergeant")
library(sergeant)
# Download the Bike-Sharing-Dataset archive from the UCI repository.
# curl -o does not create missing parent directories, so make sure the
# target directory exists first.
mkdir -p ~/data/misc
# --fail: exit non-zero on an HTTP error instead of saving the error page as the .zip.
# --location: follow redirects to the final download URL.
curl --fail --location -o ~/data/misc/Bike-Sharing-Dataset.zip https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip

# Extract the archive; with no -d option the files land in the current directory.
unzip ~/data/misc/Bike-Sharing-Dataset.zip