ODH Logo

raw to metadata

Note: This notebook is used in automation

This notebook pre-processes raw mbox files produced by gz_to_raw.ipynb and converts them to CSV files that only contain the metadata fields for each email.

When run directly, it will convert all mbox files in raw/fedora-devel-list/ and save them as CSV files in /interim/metadata/.

When run as part of the automation workflow, it will only pre-process the last full month's worth of data and upload it to remote storage for later use.

[28]
import mailbox
import os
import datetime
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

load_dotenv("../../.env")

import sys

sys.path.append("../..")
from src import utils
[5]
# Build the list of mbox file names to process: the entire dataset by default,
# or only the most recent full month when running in automation.

BASE_PATH = os.getenv("LOCAL_DATA_PATH", "../../data")
path = Path(BASE_PATH) / "raw/fedora-devel-list"

if os.getenv("RUN_IN_AUTOMATION"):
    # Stepping back one day from the first of the current month lands on the
    # last day of the previous month, which gives that month's year and number.
    first_of_month = datetime.datetime.now().replace(day=1)
    LAST_MONTH_DATE = first_of_month - datetime.timedelta(days=1)
    y, m = LAST_MONTH_DATE.year, LAST_MONTH_DATE.month
    pattern = f"*{y}-{m}.mbox"
else:
    pattern = "*.mbox"

# Only the file names are kept; they are joined back onto `path` when read.
mboxes = [mbox_file.name for mbox_file in path.glob(pattern)]
[17]
# Define a function to convert mbox data into row,column format for analysis using pandas


def mbox_to_meta(mbox):
    """Collect the metadata headers of every message into a DataFrame.

    Parameters
    ----------
    mbox : iterable of messages
        Each message must support header lookup by name, e.g. ``msg["Date"]``.

    Returns
    -------
    pandas.DataFrame
        One row per message, with the columns "Message-ID", "Date" and
        "Subject" holding whatever the header lookup returned.
    """
    fields = ["Message-ID", "Date", "Subject"]
    rows = [tuple(msg[field] for field in fields) for msg in mbox]
    return pd.DataFrame(rows, columns=fields)
[19]
# Ensure dataset location exists
dataset_base_path = Path(f"{BASE_PATH}/interim/metadata")
dataset_base_path.mkdir(parents=True, exist_ok=True)

# Register all created dataset slices for later upload
new_files = []

# Save each dataset into its own monthly csv
for mbox in mboxes:
    output_path = dataset_base_path.joinpath(f"{mbox}.csv")
    monthly_mbox = mailbox.mbox(path.joinpath(mbox))
    try:
        df = mbox_to_meta(monthly_mbox)
    finally:
        # The mailbox keeps its file open until close() is called; release it
        # per month so handles do not pile up across the whole dataset.
        monthly_mbox.close()
    # The csv keeps the DataFrame's default integer index as its first column.
    df.to_csv(output_path)
    new_files.append(output_path)
    print(f"{output_path} saved")
../../data/interim/metadata/fedora-devel-2018-3.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-5.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-6.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-7.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-8.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-9.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-7.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-8.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-9.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-10.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-11.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-12.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-1.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-2.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-3.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-4.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-5.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-6.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-7.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-8.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-9.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-10.mbox.csv saved ../../data/interim/metadata/fedora-devel-2020-11.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-12.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-1.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-2.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-4.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-10.mbox.csv saved ../../data/interim/metadata/fedora-devel-2018-11.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-1.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-2.mbox.csv saved 
../../data/interim/metadata/fedora-devel-2019-3.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-4.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-5.mbox.csv saved ../../data/interim/metadata/fedora-devel-2019-6.mbox.csv saved
[52]
# Push to Ceph (only when running in automation)
if os.getenv("RUN_IN_AUTOMATION"):
    # Lazily pair each local csv path with its remote key under interim/metadata/.
    upload_pairs = (
        (local_csv.as_posix(), f"interim/metadata/{Path(local_csv).name}")
        for local_csv in new_files
    )
    utils.upload_files(upload_pairs)