raw to text data

Note: This notebook is used in automation

This notebook pre-process raw mbox files produced by gz_to_raw.ipynb and converts them to csv files that only contain the text, data, and message id fields for each email.

When run directly, it will convert all mbox files in raw/fedora-devel-list/ and save them as csv's in /interim/text/.

When run as part of the automation workflow, it will only pre-process the last full months worth of data and upload it to remote storage for later use.

[1]

import mailbox
import os
import re
import datetime
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
import sys

load_dotenv("../../.env")
sys.path.append("../..")
from src import utils  # noqa

[2]

# collect paths of either the entire dataset, or only the most recent month if running in automation

BASE_PATH = os.getenv("LOCAL_DATA_PATH", "../../data")
path = Path(BASE_PATH).joinpath("raw/fedora-devel-list")
mboxes = [x.name for x in list(path.glob("*.mbox"))]

if os.getenv("RUN_IN_AUTOMATION"):
    LAST_MONTH_DATE = datetime.datetime.now().replace(
        day=1
    ) - datetime.timedelta(days=1)
    y = LAST_MONTH_DATE.year
    m = LAST_MONTH_DATE.month
    mboxes = [x.name for x in list(path.glob(f"*{y}-{m}.mbox"))]

[3]

# Define a function to convert mbox data into row,column format for analysis
# using pandas


def mbox_to_text(mbox):

    csv = []
    for msg in mbox:
        msg_id = msg["Message-ID"]
        date = msg["Date"]
        body = []
        for m in msg.get_payload():
            body.append(m.get_payload())

        csv.append((msg_id, date, body))
    df = pd.DataFrame(csv, columns=["Message-ID", "Date", "Body"])
    return df

[4]

# Clean the data before storing it
def strip_thread(text):
    text = "".join(text)
    text = text.replace("\r", "")
    lines = text.split("\n")
    lines = [line for line in lines if len(line) > 0]
    lines = [line for line in lines if line[0] != ">"]
    lines = [line for line in lines if line[:3] != "Re:"]
    lines = [line for line in lines if line[:7] != "Subject"]
    lines = [line for line in lines if line[:5] != "From:"]
    lines = [line for line in lines if line[:5] != "Date:"]
    lines = [line for line in lines if "BEGIN PGP SIGNED MESSAGE" not in line]
    lines = [line for line in lines if line[:5] != "Hash:"]
    lines = [line for line in lines if line[:10] != "Version: G"]
    lines = [line for line in lines if "wrote:" not in line]
    lines = [line for line in lines if "wrote :" not in line]
    lines = [line for line in lines if "writes:" not in line]
    lines = [line for line in lines if line[:7] != "Am Mit,"]
    lines = [line for line in lines if line[:7] != "Am Don,"]
    lines = [line for line in lines if line[:7] != "Am Mon,"]
    lines = [line for line in lines if line[:7] != "Quoting"]
    lines = [line for line in lines if line[:10] != "Em Quinta,"]
    lines = [line for line in lines if "said:" not in line]
    lines = [
        line
        for line in lines
        # matching lines that are "On Weekday, Day Month, Year" or "On Weekday, Month Day, Year"
        # ie, Monday, 6 March, 2020 or Monday, March 6, 2020
        if re.match(
            ".*n ((Sun|Mon|Tue(s)?|Wed(nes)?|Thu(rs)?|Fri|Sat(ur)?)(day)?)(,)? (.*)?"
            "(Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep((t)?ember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)( .*)? 20..*"  # noqa
            "(at [\d]{2}:[\d]{2} AM|PM)?",  # noqa
            line,
        )
        is None
    ]
    lines = [
        line
        for line in lines
        # matching lines that ar "On Day Month Year"
        # ie, 6 March 2020
        if re.match(
            (
                ".*n .(.)? (Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep((t)?ember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?) 20..*"  # noqa
            ),
            line,
        )
        is None
    ]
    lines = [
        line
        for line in lines
        # matching lines that are "On Weekday, YYYY-MM-DD at HH:MM"
        # ie, Monday, 2020-03-06 at 12:01
        if re.match(
            ".*n ((Sun|Mon|Tue(s)?|Wed(nes)?|Thu(rs)?|Fri|Sat(ur)?)(day)?), 20[\d]{2}-[\d]{2}-[\d]{2} at.*",  # noqa
            line,
        )
        is None
    ]
    lines = [line for line in lines if line[-6:] != "said: "]
    lines = [line for line in lines if line[-8:] != "babbled:"]
    lines = [line for line in lines if line[-7:] != "wrot=e:"]
    lines = [line for line in lines if line[-8:] != "A9crit :"]
    lines = [line for line in lines if line[0] != "|"]
    return "\n".join(lines)


# format for CSV, clean special characters, and remove extranous emails
def pandas_clean(emails):
    emails["Body"].replace(
        to_replace=[
            r"\n",
            "\n",
        ],
        value=" ",
        regex=True,
        inplace=True,
    )
    emails["Body"].replace(
        to_replace=[r"\'", "'", ">", "<", "= ", "-", r"http\S+"],
        value="",
        regex=True,
        inplace=True,
    )
    emails["Body"].replace(
        to_replace=[r"\\\s+", r"\\s+", "="], value="", regex=True, inplace=True
    )
    emails["Body"].replace(
        to_replace=["   ", "  "], value=" ", regex=True, inplace=True
    )
    emails["Body"].replace(
        to_replace=["_", "3D"], value="", regex=True, inplace=True
    )
    emails["Body"].replace(
        to_replace=["   ", "  "], value=" ", regex=True, inplace=True
    )
    emails["Body"].replace(
        to_replace=["   ", "  "], value=" ", regex=True, inplace=True
    )
    emails["Body"] = emails["Body"].apply(
        lambda x: x.strip().replace(r"\n", "")
    )

    emails.drop(emails.index[emails["Body"] == ""], inplace=True)
    emails.drop(emails.index[emails["Body"] == " "], inplace=True)
    emails.dropna(subset=["Body"], inplace=True)

    emails = emails.reset_index()
    emails.drop("index", axis=1, inplace=True)
    return emails

[5]

# Ensure datset location exists
dataset_base_path = Path(f"{BASE_PATH}/interim/text")
dataset_base_path.mkdir(parents=True, exist_ok=True)

# [WIP]avoid edge cases
subset_mboxes = mboxes[:5]

# Register all created dataset slices for later upload
new_files = []

# Save each dataset into its own monthly csv
for mbox in subset_mboxes:
    output_path = dataset_base_path.joinpath(f"{mbox}.csv")
    monthly_mbox = mailbox.mbox(path.joinpath(mbox))
    df = mbox_to_text(monthly_mbox)
    df["Body"] = df["Body"].apply(strip_thread)
    df = pandas_clean(df)
    df.to_csv(output_path)
    new_files.append(output_path)
    print(f"{output_path} saved")

../../data/interim/text/fedora-devel-2018-11.mbox.csv saved
../../data/interim/text/fedora-devel-2018-5.mbox.csv saved
../../data/interim/text/fedora-devel-2018-6.mbox.csv saved
../../data/interim/text/fedora-devel-2019-12.mbox.csv saved
../../data/interim/text/fedora-devel-2019-11.mbox.csv saved

To sanity check our work, we'll look at the uncleaned data. Many of these emails will have time stamps and other extra information within the message.

[6]

sample_mbox = mailbox.mbox(path.joinpath("fedora-devel-2018-6.mbox"))
sample_mbox_df = mbox_to_text(sample_mbox)
sample_mbox_df.head()

	Message-ID	Date	Body
0	<1527812803.30401.3@posteo.de>	Thu, 31 May 2018 19:26:43 -0500	[On Thu, May 31, 2018 at 5:39 PM, Kevin Kofler...
1	<20180601034232.24781.60408@mailman01.phx2.fed...	Fri, 01 Jun 2018 03:42:32 +0000	[100% agreed. how ever enable an option to aut...
2	<9b15996762dcd9a92ccd484d7545ce35e9da8093.came...	Fri, 01 Jun 2018 07:14:09 +0300	[On Thu, 2018-05-31 at 22:34 +0100, Tomasz K=C...
3	<8e923b97-9ebf-678f-3bca-73b92d638929@gmail.com>	Fri, 01 Jun 2018 00:52:38 -0400	[With respect, I am opposed to the proposal.\n...
4	<CAOCN9rxBuGK=gXgQzHFX=eKm_x4zttQSfAMpiZwOz5wS...	Fri, 01 Jun 2018 02:29:06 -0400	[On Thu, May 31, 2018 at 6:53 AM Sam Varshavch...

We can see the cleaned messages contain more relevant information to analyze since we removed text such as date sent, sender, and special characters, among others.

[7]

sample_cleaned_mbox = pd.read_csv(
    "../../data/interim/text/fedora-devel-2018-6.mbox.csv"
)
sample_cleaned_mbox.head()

	Unnamed: 0	Message-ID	Date	Body
0	0	<1527812803.30401.3@posteo.de>	Thu, 31 May 2018 19:26:43 -0500	Ugh yes, thanks for pointing this out, Kevin. ...
1	1	<20180601034232.24781.60408@mailman01.phx2.fed...	Fri, 01 Jun 2018 03:42:32 +0000	100% agreed. how ever enable an option to auto...
2	2	<9b15996762dcd9a92ccd484d7545ce35e9da8093.came...	Fri, 01 Jun 2018 07:14:09 +0300	(CCing the Fedora infrastructure list) This is...
3	3	<8e923b97-9ebf-678f-3bca-73b92d638929@gmail.com>	Fri, 01 Jun 2018 00:52:38 -0400	With respect, I am opposed to the proposal. In...
4	4	<CAOCN9rxBuGK=gXgQzHFX=eKm_x4zttQSfAMpiZwOz5wS...	Fri, 01 Jun 2018 02:29:06 -0400	rote: cy. Its also useful to see a report of w...

[8]

# Push all the files to ceph

if os.getenv("RUN_IN_AUTOMATION"):
    utils.upload_files(
        (f.as_posix(), f"interim/text/{Path(f).name}") for f in new_files
    )

Contribute to this page