In [1]:
Copied!
import os
import pandas as pd
import awswrangler as wr
import pydbtools as pydb
import os
import pandas as pd
import awswrangler as wr
import pydbtools as pydb
In [2]:
Copied!
# setup your own testing area (set foldername = GH username)
foldername = "mratford" # GH username
foldername = foldername.lower().replace("-", "_")
# setup your own testing area (set foldername = GH username)
foldername = "mratford" # GH username
foldername = foldername.lower().replace("-", "_")
In [3]:
Copied!
bucketname = "alpha-everyone"
s3_base_path = f"s3://{bucketname}/{foldername}/"
db_name = f"aws_example_{foldername}"
source_db_base_path = f"s3://{bucketname}/{foldername}/source_db/"
# Delete all the s3 files in a given path
if wr.s3.list_objects(s3_base_path):
print("deleting objs")
wr.s3.delete_objects(s3_base_path)
# Delete the database if it exists
df_dbs = wr.catalog.databases(None)
if db_name in df_dbs["Database"].to_list():
print(f"{db_name} found deleting")
wr.catalog.delete_database(name=db_name)
# Setup source database
# Create the database
wr.catalog.create_database(db_name)
bucketname = "alpha-everyone"
s3_base_path = f"s3://{bucketname}/{foldername}/"
db_name = f"aws_example_{foldername}"
source_db_base_path = f"s3://{bucketname}/{foldername}/source_db/"
# Delete all the s3 files in a given path
if wr.s3.list_objects(s3_base_path):
print("deleting objs")
wr.s3.delete_objects(s3_base_path)
# Delete the database if it exists
df_dbs = wr.catalog.databases(None)
if db_name in df_dbs["Database"].to_list():
print(f"{db_name} found deleting")
wr.catalog.delete_database(name=db_name)
# Setup source database
# Create the database
wr.catalog.create_database(db_name)
deleting objs aws_example_mratford found deleting
Create a dataframe with difficult dates¶
In [4]:
Copied!
df = pd.DataFrame(
{
"event": [
"Foundation of Mega-City One",
"Newspeak eclipses oldspeak",
"The year 3000",
],
"future_date": ["2031-03-17", "2050-10-05", "3000-01-01"],
}
)
df = pd.DataFrame(
{
"event": [
"Foundation of Mega-City One",
"Newspeak eclipses oldspeak",
"The year 3000",
],
"future_date": ["2031-03-17", "2050-10-05", "3000-01-01"],
}
)
Write to a table. We need to use strings as types to get the data into the table.
In [5]:
Copied!
table_name = "future"
wr.s3.to_parquet(
df=df,
path=os.path.join(s3_base_path, table_name),
index=False,
dataset=True,
database=db_name,
table=table_name,
mode="overwrite",
)
table_name = "future"
wr.s3.to_parquet(
df=df,
path=os.path.join(s3_base_path, table_name),
index=False,
dataset=True,
database=db_name,
table=table_name,
mode="overwrite",
)
Out[5]:
{'paths': ['s3://alpha-everyone/mratford/future/28e04de56d6240c0b03185d11dce343f.snappy.parquet'], 'partitions_values': {}}
Now use the existing table to create a new table with the date strings converted to dates.
In [6]:
Copied!
sql = f"""
CREATE TABLE {db_name}.new_future
WITH (
external_location = '{source_db_base_path}new_future'
)
AS SELECT event, date(future_date) as futuredate
FROM {db_name}.future;
"""
print(sql)
_ = pydb.start_query_execution_and_wait(sql)
sql = f"""
CREATE TABLE {db_name}.new_future
WITH (
external_location = '{source_db_base_path}new_future'
)
AS SELECT event, date(future_date) as futuredate
FROM {db_name}.future;
"""
print(sql)
_ = pydb.start_query_execution_and_wait(sql)
CREATE TABLE aws_example_mratford.new_future WITH ( external_location = 's3://alpha-everyone/mratford/source_db/new_future' ) AS SELECT event, date(future_date) as futuredate FROM aws_example_mratford.future;
If we query the table using a standard awswrangler query it will fail.
In [7]:
Copied!
sql = f"""
select event, futuredate
from {db_name}.new_future
"""
try:
wr.athena.read_sql_query(sql, database=db_name, ctas_approach=False)
except AttributeError:
print("Failed as expected.")
sql = f"""
select event, futuredate
from {db_name}.new_future
"""
try:
wr.athena.read_sql_query(sql, database=db_name, ctas_approach=False)
except AttributeError:
print("Failed as expected.")
Failed as expected.
A pydbtools query passes the additional parameters to awswrangler to handle the dates.
In [8]:
Copied!
pydb.read_sql_query(sql)
pydb.read_sql_query(sql)
Out[8]:
event | futuredate | |
---|---|---|
0 | Foundation of Mega-City One | 2031-03-17 |
1 | Newspeak eclipses oldspeak | 2050-10-05 |
2 | The year 3000 | 3000-01-01 |
Clean up the database and table.
In [9]:
Copied!
pydb.delete_database_and_data(db_name)
pydb.delete_database_and_data(db_name)
Out[9]:
True
In [ ]:
Copied!