Interacting with the Database from RStudio

database DBI dbplyr tidyverse

this to narrow down the complexity of interacting with databases to those who have zero skills on SQL but some skills in R programming!

Author

Affiliation

Murera Gisa

 

Published

Feb. 27, 2023

Citation

Gisa, 2023

hide
knitr::include_graphics(here::here("images/database.png"))
gganimate: worth to try it and learn it

Figure 1: gganimate: worth to try it and learn it

Creating DB to be Used

hide
CREATE DATABASE IF NOT EXISTS Warehouse;

Creating empty table on DB

hide
-- Create table called Penguins 

CREATE TABLE IF NOT EXISTS penguins(
  id INT NOT NULL AUTO_INCREMENT,
  species  VARCHAR(32),
  island  VARCHAR(32),
  bill_length_mm DOUBLE,
  bill_depth_mm DOUBLE,
  flipper_length_mm INTEGER,
  body_mass_g INTEGER,
  sex VARCHAR(32),
  year INTEGER,
  PRIMARY KEY(id)
)
;

Load the penguins csv table

hide
- Load the csv in created table

LOAD DATA LOCAL INFILE "/home/ubuntu/project/Markdown_Training/dplyr_and_DBI/penguins.csv"
INTO TABLE penguins 
FIELDS TERMINATED BY ',' 
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;

Connecting to the DB through Rstudio

hide
library(dbplyr)
library(DBI)
library(dplyr)
library(tidyverse)

## Connecting to the DB with DB connection string hidden in .renviron

conn <- dbConnect(
  RMySQL::MySQL(),
  host = Sys.getenv("mysql_hostname"),
  port = as.numeric(Sys.getenv("mysql_port")),
  user = Sys.getenv("mysql_user"),
  password = Sys.getenv("mysql_passwd"),
  dbname = "Warehouse",
  timeout = 10
)

Assignment Questions

hide
DBI::dbListTables(conn)#Checking the tables in TB

pengins_tb <- dplyr::tbl(conn, "penguins") #Reading data from DB

pengins_tb %>% collect()

#Get the average body mass of penguins by sex and species
avgbodymass <- pengins_tb %>%   
              group_by(sex,species) %>% 
              summarise(avgbodymass = round(mean(body_mass_g,na.rm = T), digits = 2)
                        ) #%>%
             # collect() 
#show querry of above 
avgbodymass %>%
  show_query()
#Writting low SQL

avgbodymass <- DBI::dbGetQuery(conn, "
                               SELECT sex, species, ROUND(AVG(body_mass_g), 2) AS avgbodymass
                               FROM penguins
                               GROUP BY sex, species
                               ")
#Which species observed most on every island

speciesObserved_most <- pengins_tb %>% 
                     group_by(island,species) %>% 
                    summarise(num_species = n()) %>%
                    filter(num_species==max(num_species)) 

speciesObserved_most %>% show_query()

speciesObserved_most <- DBI::dbGetQuery(conn, "
SELECT island, species, num_species
FROM (
  SELECT *, MAX(num_species) OVER (PARTITION BY island) AS q03
  FROM (
    SELECT island, species, COUNT(*) AS num_species
    FROM penguins
    GROUP BY island, species
  ) q01
) q02
WHERE (num_species = q03)
                         ")

#Disconnect from DB
DBI::dbDisconnect(conn)

Footnotes

    Citation

    For attribution, please cite this work as

    Gisa (2023, Feb. 28). : Interacting with the Database from RStudio. Retrieved from https://github.com/mgisa/myblog/DataBase/2023-02-16-db-connection/

    BibTeX citation

    @misc{gisa2023interacting,
      author = {Gisa, Murera},
      title = {: Interacting with the Database from RStudio},
      url = {https://github.com/mgisa/myblog/DataBase/2023-02-16-db-connection/},
      year = {2023}
    }