this to narrow down the complexity of interacting with databases to those who have zero skills on SQL but some skills in R programming!
knitr::include_graphics(here::here("images/database.png"))
CREATE DATABASE IF NOT EXISTS Warehouse;
-- Create table called Penguins
CREATE TABLE IF NOT EXISTS penguins(
id INT NOT NULL AUTO_INCREMENT,
VARCHAR(32),
species VARCHAR(32),
island DOUBLE,
bill_length_mm DOUBLE,
bill_depth_mm INTEGER,
flipper_length_mm INTEGER,
body_mass_g VARCHAR(32),
sex year INTEGER,
PRIMARY KEY(id)
) ;
- Load the csv in created table
DATA LOCAL INFILE "/home/ubuntu/project/Markdown_Training/dplyr_and_DBI/penguins.csv"
LOAD INTO TABLE penguins
BY ','
FIELDS TERMINATED BY '"'
ENCLOSED BY '\n'
LINES TERMINATED 1 ROWS; IGNORE
library(dbplyr)
library(DBI)
library(dplyr)
library(tidyverse)
## Connecting to the DB with DB connection string hidden in .renviron
conn <- dbConnect(
RMySQL::MySQL(),
host = Sys.getenv("mysql_hostname"),
port = as.numeric(Sys.getenv("mysql_port")),
user = Sys.getenv("mysql_user"),
password = Sys.getenv("mysql_passwd"),
dbname = "Warehouse",
timeout = 10
)
DBI::dbListTables(conn)#Checking the tables in TB
pengins_tb <- dplyr::tbl(conn, "penguins") #Reading data from DB
pengins_tb %>% collect()
#Get the average body mass of penguins by sex and species
avgbodymass <- pengins_tb %>%
group_by(sex,species) %>%
summarise(avgbodymass = round(mean(body_mass_g,na.rm = T), digits = 2)
) #%>%
# collect()
#show querry of above
avgbodymass %>%
show_query()
#Writting low SQL
avgbodymass <- DBI::dbGetQuery(conn, "
SELECT sex, species, ROUND(AVG(body_mass_g), 2) AS avgbodymass
FROM penguins
GROUP BY sex, species
")
#Which species observed most on every island
speciesObserved_most <- pengins_tb %>%
group_by(island,species) %>%
summarise(num_species = n()) %>%
filter(num_species==max(num_species))
speciesObserved_most %>% show_query()
speciesObserved_most <- DBI::dbGetQuery(conn, "
SELECT island, species, num_species
FROM (
SELECT *, MAX(num_species) OVER (PARTITION BY island) AS q03
FROM (
SELECT island, species, COUNT(*) AS num_species
FROM penguins
GROUP BY island, species
) q01
) q02
WHERE (num_species = q03)
")
#Disconnect from DB
DBI::dbDisconnect(conn)
For attribution, please cite this work as
Gisa (2023, Feb. 28). : Interacting with the Database from RStudio. Retrieved from https://github.com/mgisa/myblog/DataBase/2023-02-16-db-connection/
BibTeX citation
@misc{gisa2023interacting, author = {Gisa, Murera}, title = {: Interacting with the Database from RStudio}, url = {https://github.com/mgisa/myblog/DataBase/2023-02-16-db-connection/}, year = {2023} }