ODH Logo

Collect Data

This notebook is used to scrape data from the Fedora developers mailing list in monthly chunks and then store copies both locally and in a remote Ceph bucket.

Steps:

  • Download the Fedora devel mailing-list data set as monthly *.mbox.gz files into "{LOCAL_DATA_PATH}/raw/fedora-devel-list/" (LOCAL_DATA_PATH defaults to "../../data").
  • Upload dataset as monthly "{BUCKET}/{PREFIX}/raw/*.gz" files in remote Ceph storage.

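When run in automation (the RUN_IN_AUTOMATION environment variable is set), this notebook only acts upon the most recent full month of data, and the upload step is only performed in that mode.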

[10]
import wget
import os
import datetime
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables (e.g. LOCAL_DATA_PATH, RUN_IN_AUTOMATION) from the
# .env file two directories above this notebook.
load_dotenv("../../.env")

import sys

# Add the directory two levels up to the import path so that the project's
# `src` package can be imported from this notebook.
sys.path.append("../..")
from src import utils

Download Data

[2]
# Root of the local data directory; overridable through LOCAL_DATA_PATH.
BASE_PATH = os.environ.get("LOCAL_DATA_PATH", "../../data")

# Raw mailing-list archives are kept in their own sub-directory, which is
# created (together with any missing parents) on first use.
dataset_base_path = Path(BASE_PATH + "/raw/fedora-devel-list/")
dataset_base_path.mkdir(exist_ok=True, parents=True)
[3]
# Bounds of the scrape window.
# START_DATE is a plain date; CURRENT_DATE is the current local timestamp with
# its day-of-month reset to 1 (the time of day is kept as-is).
START_DATE = datetime.date(2018, 1, 1)
CURRENT_DATE = datetime.datetime.today().replace(day=1)
[4]
# One timestamp per month between START_DATE and CURRENT_DATE, each falling on
# the last calendar day of its month. The MonthEnd offset object is passed
# instead of the "m" string alias because the single-letter month-end aliases
# are deprecated in recent pandas releases, while the offset object is
# accepted by old and new versions alike.
dates = pd.date_range(START_DATE, CURRENT_DATE, freq=pd.offsets.MonthEnd())

# In automation only the most recent month in the range is processed.
if os.getenv("RUN_IN_AUTOMATION"):
    dates = [dates[-1]]
[43]
# Iterate over the monthly dates and download the corresponding *.gz export
# of the mailing list for each month.
base_url = (
    "https://lists.fedoraproject.org/archives/list/devel@lists."
    "fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz"
)

for date in dates:
    # The query runs from day 01 of the month up to `date.day`.
    # NOTE(review): confirm whether the archive treats `end` as inclusive; if
    # it is exclusive, messages from the final day of each month are missed.
    url = f"{base_url}?start={date.year}-{date.month}-01&end={date.year}-{date.month}-{date.day}"
    print(url)

    target = f"{dataset_base_path}/fedora-devel-{date.year}-{date.month}.mbox.gz"
    saved = wget.download(url, out=target)

    # The wget package does not overwrite an existing file: it stores the new
    # download under a suffixed name instead. Move the fresh download onto
    # `target` so that a re-run refreshes the file that is uploaded later
    # rather than leaving a stale copy in place.
    if saved and os.path.abspath(saved) != os.path.abspath(target):
        os.replace(saved, target)
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-1-01&end=2018-1-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-2-01&end=2018-2-28 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-3-01&end=2018-3-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-4-01&end=2018-4-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-5-01&end=2018-5-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-6-01&end=2018-6-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-7-01&end=2018-7-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-8-01&end=2018-8-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-9-01&end=2018-9-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-10-01&end=2018-10-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-11-01&end=2018-11-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-12-01&end=2018-12-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-1-01&end=2019-1-31 
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-2-01&end=2019-2-28 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-3-01&end=2019-3-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-4-01&end=2019-4-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-5-01&end=2019-5-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-6-01&end=2019-6-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-7-01&end=2019-7-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-8-01&end=2019-8-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-9-01&end=2019-9-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-10-01&end=2019-10-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-11-01&end=2019-11-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-12-01&end=2019-12-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-1-01&end=2020-1-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-2-01&end=2020-2-29 
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-3-01&end=2020-3-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-4-01&end=2020-4-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-5-01&end=2020-5-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-6-01&end=2020-6-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-7-01&end=2020-7-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-8-01&end=2020-8-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-9-01&end=2020-9-30 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-10-01&end=2020-10-31 https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-11-01&end=2020-11-30

Upload data to Ceph

[5]
# Local paths of the monthly archives, one per entry in `dates`, built with the
# same naming scheme that the download step writes to.
new_files = []
for month in dates:
    new_files.append(
        f"{dataset_base_path}/fedora-devel-{month.year}-{month.month}.mbox.gz"
    )
[12]
# Only push to remote storage when running in automation.
if os.getenv("RUN_IN_AUTOMATION"):
    # Lazily pair each local file with its remote key "raw/<file name>".
    upload_pairs = ((path, f"raw/{Path(path).name}") for path in new_files)
    utils.upload_files(upload_pairs)
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-1.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-1.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-2.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-2.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-3.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-3.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-4.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-4.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-5.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-5.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-6.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-6.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-7.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-7.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-8.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-8.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-9.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-9.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-10.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-10.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-11.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-11.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-12.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-12.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-1.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-1.mbox.gz... 
Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-2.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-2.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-3.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-3.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-4.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-4.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-5.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-5.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-6.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-6.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-7.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-7.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-8.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-8.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-9.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-9.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-10.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-10.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-11.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-11.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-12.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-12.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-1.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-1.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-2.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-2.mbox.gz... 
Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-3.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-3.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-4.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-4.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-5.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-5.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-6.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-6.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-7.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-7.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-8.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-8.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-9.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-9.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-10.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-10.mbox.gz... Done Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-11.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-11.mbox.gz... Done