raw to metadata
Note: This notebook is used in automation
This notebook pre-processes the raw mbox files produced by gz_to_raw.ipynb
and converts them to CSV files that contain only the metadata fields for each email.
When run directly, it will convert all mbox files in raw/fedora-devel-list/
and save them as CSVs in /interim/metadata/.
When run as part of the automation workflow, it will only pre-process the last full month's worth of data and upload it to remote storage for later use.
[28]
import mailbox
import os
import datetime
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
load_dotenv("../../.env")
import sys
sys.path.append("../..")
from src import utils
[5]
# Collect the mbox file names to process: the entire dataset by default, or
# only the most recent full month when running in automation.
BASE_PATH = os.getenv("LOCAL_DATA_PATH", "../../data")
path = Path(BASE_PATH).joinpath("raw/fedora-devel-list")

if os.getenv("RUN_IN_AUTOMATION"):
    # Stepping back one day from the first of the current month lands on the
    # last day of the previous month, which gives us its year and month.
    LAST_MONTH_DATE = datetime.datetime.now().replace(
        day=1
    ) - datetime.timedelta(days=1)
    y = LAST_MONTH_DATE.year
    m = LAST_MONTH_DATE.month
    # The month is intentionally not zero-padded in the pattern.
    mbox_pattern = f"*{y}-{m}.mbox"
else:
    mbox_pattern = "*.mbox"

# Glob once with the selected pattern instead of globbing the whole
# directory first and discarding the result in automation mode.
mboxes = [x.name for x in path.glob(mbox_pattern)]
[17]
# Define a function to convert mbox data into row,column format for analysis using pandas
def mbox_to_meta(mbox, fields=("Message-ID", "Date", "Subject")):
    """Extract selected header fields from every message into a DataFrame.

    Args:
        mbox: Iterable of message objects that support header lookup via
            ``msg[name]`` (for example a ``mailbox.mbox``).
        fields: Header names to extract, in column order. Defaults to
            Message-ID, Date and Subject.

    Returns:
        A ``pandas.DataFrame`` with one row per message and one column per
        entry in ``fields``.
    """
    # Materialise once so any iterable of names can be reused for both the
    # row tuples and the column labels.
    fields = list(fields)
    rows = [tuple(msg[name] for name in fields) for msg in mbox]
    return pd.DataFrame(rows, columns=fields)
[19]
# Ensure dataset location exists
dataset_base_path = Path(f"{BASE_PATH}/interim/metadata")
dataset_base_path.mkdir(parents=True, exist_ok=True)

# Register all created dataset slices for later upload
new_files = []

# Save each dataset into its own monthly csv
for mbox in mboxes:
    output_path = dataset_base_path.joinpath(f"{mbox}.csv")
    monthly_mbox = mailbox.mbox(path.joinpath(mbox))
    try:
        df = mbox_to_meta(monthly_mbox)
    finally:
        # Close the mailbox once its headers are extracted so file handles
        # do not pile up across the monthly files.
        monthly_mbox.close()
    df.to_csv(output_path)
    new_files.append(output_path)
    print(f"{output_path} saved")
../../data/interim/metadata/fedora-devel-2018-3.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-5.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-6.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-7.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-8.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-9.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-7.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-8.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-9.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-10.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-11.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-12.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-1.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-2.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-3.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-4.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-5.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-6.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-7.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-8.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-9.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-10.mbox.csv saved
../../data/interim/metadata/fedora-devel-2020-11.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-12.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-1.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-2.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-4.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-10.mbox.csv saved
../../data/interim/metadata/fedora-devel-2018-11.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-1.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-2.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-3.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-4.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-5.mbox.csv saved
../../data/interim/metadata/fedora-devel-2019-6.mbox.csv saved
[52]
# Push to Ceph
if os.getenv("RUN_IN_AUTOMATION"):
    # Lazily pair each local csv path (POSIX form) with its key under
    # interim/metadata/, then hand the pairs to the upload helper.
    upload_pairs = (
        (local_csv.as_posix(), f"interim/metadata/{Path(local_csv).name}")
        for local_csv in new_files
    )
    utils.upload_files(upload_pairs)