Create Temporary Tables Using SQL Files¶
This tutorial demonstrates how to create temporary tables in athena using pydbtools.read_sql_queries
. This is an amended version of create_temporary_version.ipynb.
Setup¶
Just run this script to create the source database so we can use it for our example.
import os
import pandas as pd
import awswrangler as wr
import pydbtools as pydb
# setup your own testing area (set foldername = GH username)
foldername = "mratford" # GH username
foldername = foldername.lower().replace("-", "_")
bucketname = "alpha-everyone"
s3_base_path = f"s3://{bucketname}/{foldername}/"
db_name = f"aws_example_{foldername}"
source_db_base_path = f"s3://{bucketname}/{foldername}/source_db/"
pydb.delete_database_and_data(db_name)
# Setup source database
# Create the database
pydb.create_database(db_name)
# Iterate through the tables in data/ and write them to our db using awswrangler
for table_name in ["department", "employees", "sales"]:
table_path = pydb.s3_path_join(source_db_base_path, f"{table_name}/")
pydb.file_to_table(
path=f"data/{table_name}.csv",
database=db_name,
table=table_name,
location=table_path,
)
Task¶
We are going to create a table that shows total sales per employee using all 3 tables.
pydb.read_sql_query(
f"SELECT * FROM {db_name}.employees LIMIT 5", ctas_approach=False
)
employee_id | sex | forename | surname | department_id | manager_id | |
---|---|---|---|---|---|---|
0 | 1 | M | Dexter | Mitchell | 1 | 17 |
1 | 2 | F | Summer | Bennett | 1 | 17 |
2 | 3 | M | Pip | Carter | 1 | 17 |
3 | 4 | F | Bella | Long | 1 | 17 |
4 | 5 | F | Lexie | Perry | <NA> | 17 |
pydb.read_sql_query(
f"SELECT * FROM {db_name}.department LIMIT 5", ctas_approach=False
)
department_id | department_name | |
---|---|---|
0 | 1 | Sales |
1 | 2 | Admin |
2 | 3 | Management |
3 | 4 | Technical |
4 | 5 | Maintenance |
pydb.read_sql_query(
f"SELECT * FROM {db_name}.sales LIMIT 5", ctas_approach=False
)
employee_id | qtr | sales | |
---|---|---|---|
0 | 1 | 1 | 768.17 |
1 | 2 | 1 | 391.98 |
2 | 3 | 1 | 406.36 |
3 | 4 | 1 | 816.25 |
4 | 5 | 1 | 437.05 |
pydbtools has read_sql_queries
and read_sql_queries
functions that allow you to create temporary tables within SQL which you can refer to in a __temp__
database.
First create a total_sales table:
sql = f"""
CREATE TEMP TABLE total_sales AS
SELECT employee_id, sum(sales) as total_sales
FROM {db_name}.sales
GROUP BY employee_id;
"""
print(sql)
CREATE TEMP TABLE total_sales AS SELECT employee_id, sum(sales) as total_sales FROM aws_example_mratford.sales GROUP BY employee_id;
Then create a table of employee names from the sales department:
sql += f"""
CREATE TEMP TABLE sales_employees AS
SELECT e.employee_id, e.forename, e.surname, d.department_name
FROM {db_name}.employees AS e
LEFT JOIN {db_name}.department AS d
ON e.department_id = d.department_id
WHERE e.department_id = 1;
"""
print(sql)
CREATE TEMP TABLE total_sales AS SELECT employee_id, sum(sales) as total_sales FROM aws_example_mratford.sales GROUP BY employee_id; CREATE TEMP TABLE sales_employees AS SELECT e.employee_id, e.forename, e.surname, d.department_name FROM aws_example_mratford.employees AS e LEFT JOIN aws_example_mratford.department AS d ON e.department_id = d.department_id WHERE e.department_id = 1;
Finally return our final tables
Note that more than one select statement can be used so the function returns an iterator yielding the results of each select.
sql += f"""
SELECT se.*, ts.total_sales
FROM __temp__.sales_employees AS se
INNER JOIN __temp__.total_sales AS ts
ON se.employee_id = ts.employee_id;
"""
print(sql)
CREATE TEMP TABLE total_sales AS SELECT employee_id, sum(sales) as total_sales FROM aws_example_mratford.sales GROUP BY employee_id; CREATE TEMP TABLE sales_employees AS SELECT e.employee_id, e.forename, e.surname, d.department_name FROM aws_example_mratford.employees AS e LEFT JOIN aws_example_mratford.department AS d ON e.department_id = d.department_id WHERE e.department_id = 1; SELECT se.*, ts.total_sales FROM __temp__.sales_employees AS se INNER JOIN __temp__.total_sales AS ts ON se.employee_id = ts.employee_id;
total_sales = pydb.read_sql_queries(sql)
total_sales
employee_id | forename | surname | department_name | total_sales | |
---|---|---|---|---|---|
0 | 1 | Dexter | Mitchell | Sales | 2911.65 |
1 | 2 | Summer | Bennett | Sales | 1785.73 |
2 | 3 | Pip | Carter | Sales | 2590.60 |
3 | 4 | Bella | Long | Sales | 2996.54 |
4 | 6 | Robert | Roberts | Sales | 2207.77 |
5 | 7 | Iris | Alexander | Sales | 2465.13 |
6 | 9 | Evan | Carter | Sales | 2279.84 |
7 | 10 | Lauren | Powell | Sales | 1935.67 |
8 | 11 | Alice | James | Sales | 3092.89 |
9 | 12 | Owen | Scott | Sales | 2286.28 |
10 | 13 | Sarah | Patterson | Sales | 2711.01 |
11 | 15 | Evie | Morgan | Sales | 2613.67 |
12 | 16 | Zachary | Rodriguez | Sales | 2373.06 |
13 | 17 | Madison | Kelly | Sales | 2302.02 |
14 | 18 | Jake | Harris | Sales | 1759.39 |
15 | 19 | Emma | Cooper | Sales | 2442.86 |
16 | 20 | Eva | Peterson | Sales | 2851.36 |
17 | 21 | Isaac | White | Sales | 1643.14 |
18 | 23 | Sienna | James | Sales | 3036.47 |
19 | 24 | Seren | Diaz | Sales | 2248.35 |
20 | 26 | Angel | Hayes | Sales | 1994.65 |
21 | 27 | Evan | Carter | Sales | 3050.02 |
22 | 28 | Olivia | Rogers | Sales | 2071.77 |
23 | 29 | Molly | Gray | Sales | 2478.25 |
24 | 30 | Lucas | Moore | Sales | 2251.47 |
25 | 31 | Kitty | Russell | Sales | 2461.53 |
26 | 32 | Leo | Thomas | Sales | 2693.30 |
27 | 33 | Evie | Morgan | Sales | 2685.07 |
28 | 34 | Eli | Mitchell | Sales | 1387.50 |
29 | 36 | joseph | Martin | Sales | 1580.06 |
30 | 37 | Alexandra | Russell | Sales | 1984.24 |
31 | 38 | Keira | Hughes | Sales | 2158.55 |
32 | 39 | Sophie | Morris | Sales | 1931.27 |
33 | 40 | Tobias | Collins | Sales | 2595.53 |
34 | 41 | Lola | Watson | Sales | 1326.88 |
35 | 42 | Maisie | Torres | Sales | 1688.76 |
36 | 43 | Phoebe | Peterson | Sales | 2073.25 |
37 | 44 | Mia | Murphy | Sales | 2184.14 |
38 | 45 | Freya | Bailey | Sales | 2778.84 |
39 | 46 | Sebastian | Hall | Sales | 2547.63 |
40 | 47 | Emily | Stewart | Sales | 1489.52 |
The read_sql_queries_gen
function allows you to use more than SELECT
statement, returning an iterator of dataframes.
sql += f"""
SELECT forename, surname, sum(s.sales) as q1_sales
FROM __temp__.sales_employees AS se
LEFT JOIN {db_name}.sales AS s
ON se.employee_id = s.employee_id
GROUP BY forename, surname;
"""
print(sql)
CREATE TEMP TABLE total_sales AS SELECT employee_id, sum(sales) as total_sales FROM aws_example_mratford.sales GROUP BY employee_id; CREATE TEMP TABLE sales_employees AS SELECT e.employee_id, e.forename, e.surname, d.department_name FROM aws_example_mratford.employees AS e LEFT JOIN aws_example_mratford.department AS d ON e.department_id = d.department_id WHERE e.department_id = 1; SELECT se.*, ts.total_sales FROM __temp__.sales_employees AS se INNER JOIN __temp__.total_sales AS ts ON se.employee_id = ts.employee_id; SELECT forename, surname, sum(s.sales) as q1_sales FROM __temp__.sales_employees AS se LEFT JOIN aws_example_mratford.sales AS s ON se.employee_id = s.employee_id GROUP BY forename, surname;
total_sales, q1_sales = tuple(pydb.read_sql_queries_gen(sql))
q1_sales
forename | surname | q1_sales | |
---|---|---|---|
0 | Dexter | Mitchell | 2911.65 |
1 | Elliot | King | NaN |
2 | Madison | Kelly | 2302.02 |
3 | Eva | Peterson | 2851.36 |
4 | Olivia | Rogers | 2071.77 |
5 | Angel | Hayes | 1994.65 |
6 | Bella | Long | 2996.54 |
7 | Rosie | Bennett | NaN |
8 | Lauren | Powell | 1935.67 |
9 | Tobias | Collins | 2595.53 |
10 | Evan | Carter | 5329.86 |
11 | Sarah | Patterson | 2711.01 |
12 | Robert | Roberts | 2207.77 |
13 | Emily | Stewart | 1489.52 |
14 | Owen | Scott | 2286.28 |
15 | joseph | Martin | 1580.06 |
16 | Keira | Hughes | 2158.55 |
17 | Iris | Alexander | 2465.13 |
18 | Eli | Mitchell | 1387.50 |
19 | Pip | Carter | 2590.60 |
20 | Kitty | Russell | 2461.53 |
21 | Lola | Watson | 1326.88 |
22 | Lucas | Moore | 2251.47 |
23 | Sebastian | Hall | 2547.63 |
24 | Phoebe | Peterson | 2073.25 |
25 | Zachary | Rodriguez | 2373.06 |
26 | Emma | Cooper | 2442.86 |
27 | Seren | Diaz | 2248.35 |
28 | Sophie | Morris | 1931.27 |
29 | Alexandra | Russell | 1984.24 |
30 | Summer | Bennett | 1785.73 |
31 | Jake | Harris | 1759.39 |
32 | Alice | James | 3092.89 |
33 | Mia | Murphy | 2184.14 |
34 | Freya | Bailey | 2778.84 |
35 | Maisie | Torres | 1688.76 |
36 | Leo | Thomas | 2693.30 |
37 | Evie | Morgan | 5298.74 |
38 | Isaac | White | 1643.14 |
39 | Molly | Gray | 2478.25 |
40 | Sienna | James | 3036.47 |
### Clean up
pydb.delete_database_and_data(db_name)
True