Collect Data
This notebook is used to scrape data from the Fedora developers mailing list in monthly chunks and then store copies both locally and in a remote Ceph bucket.
Steps:
- Download Fedora data set as monthly *.gz files to the "raw/fedora-devel-list/" directory under LOCAL_DATA_PATH (default: "../../data").
- Upload dataset as monthly "{BUCKET}/{PREFIX}/raw/*.gz" files in remote Ceph storage.
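When run in automation (i.e. when the RUN_IN_AUTOMATION environment variable is set), this should only act upon the most recent full month of data; the upload step runs only in this mode.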
[10]
import wget
import os
import datetime
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
load_dotenv("../../.env")
import sys
sys.path.append("../..")
from src import utils
Download Data
[2]
# Root directory for local data; the LOCAL_DATA_PATH environment variable
# overrides the in-repo default.
BASE_PATH = os.environ.get("LOCAL_DATA_PATH", "../../data")

# Raw Fedora devel-list archives get their own sub-directory, created together
# with any missing parents (no error if it already exists).
dataset_base_path = Path(BASE_PATH + "/raw/fedora-devel-list/")
dataset_base_path.mkdir(exist_ok=True, parents=True)
[3]
# Archive collection begins with January 2018.
START_DATE = datetime.date(year=2018, month=1, day=1)

# "Now" moved to the first day of the current month; the time of day is kept
# and the value is timezone-naive.
CURRENT_DATE = datetime.datetime.today().replace(day=1)
[4]
# One timestamp per completed month: the month-end dates between START_DATE
# and CURRENT_DATE. An explicit MonthEnd offset is used instead of the legacy
# lowercase "m" alias, which recent pandas releases deprecate, so the same
# code works on both old and new pandas versions.
dates = pd.date_range(START_DATE, CURRENT_DATE, freq=pd.offsets.MonthEnd())

# In automation, restrict the run to the last entry only, i.e. the most
# recent full month.
if os.getenv("RUN_IN_AUTOMATION"):
    dates = [dates[-1]]
[43]
# Iterate over dates and download the corresponding monthly *.gz mbox export.
base_url = (
    "https://lists.fedoraproject.org/archives/list/devel@lists."
    "fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz"
)
for date in dates:
    # NOTE(review): `end` is the last calendar day of the month. If the
    # archive server treats `end` as an exclusive bound, mail sent on that day
    # is missing from the export -- confirm against the server's behaviour.
    url = f"{base_url}?start={date.year}-{date.month}-01&end={date.year}-{date.month}-{date.day}"
    print(url)

    target = f"{dataset_base_path}/fedora-devel-{date.year}-{date.month}.mbox.gz"
    saved_as = wget.download(url, out=target)

    # wget does not overwrite an existing file: it saves the download under a
    # " (N)"-suffixed name instead. Move the fresh download over the target so
    # that a re-run refreshes the file the upload step reads rather than
    # leaving a stale copy in place.
    if saved_as != target:
        os.replace(saved_as, target)
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-1-01&end=2018-1-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-2-01&end=2018-2-28
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-3-01&end=2018-3-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-4-01&end=2018-4-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-5-01&end=2018-5-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-6-01&end=2018-6-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-7-01&end=2018-7-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-8-01&end=2018-8-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-9-01&end=2018-9-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-10-01&end=2018-10-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-11-01&end=2018-11-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2018-12-01&end=2018-12-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-1-01&end=2019-1-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-2-01&end=2019-2-28
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-3-01&end=2019-3-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-4-01&end=2019-4-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-5-01&end=2019-5-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-6-01&end=2019-6-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-7-01&end=2019-7-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-8-01&end=2019-8-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-9-01&end=2019-9-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-10-01&end=2019-10-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-11-01&end=2019-11-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2019-12-01&end=2019-12-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-1-01&end=2020-1-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-2-01&end=2020-2-29
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-3-01&end=2020-3-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-4-01&end=2020-4-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-5-01&end=2020-5-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-6-01&end=2020-6-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-7-01&end=2020-7-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-8-01&end=2020-8-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-9-01&end=2020-9-30
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-10-01&end=2020-10-31
https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start=2020-11-01&end=2020-11-30
Upload data to Ceph
[5]
# Local paths of the monthly archives, one per entry in `dates`, using the
# "fedora-devel-<year>-<month>.mbox.gz" naming scheme.
new_files = [
    "/".join((str(dataset_base_path), f"fedora-devel-{month.year}-{month.month}.mbox.gz"))
    for month in dates
]
[12]
# Upload happens only when RUN_IN_AUTOMATION is set. Each pair handed to the
# uploader is (local archive path, "raw/<file name>").
# NOTE(review): utils.upload_files is defined outside this notebook and is
# given a lazy generator of pairs -- confirm it accepts any iterable.
if os.environ.get("RUN_IN_AUTOMATION"):
    utils.upload_files(
        (local_path, "raw/" + Path(local_path).name) for local_path in new_files
    )
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-1.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-1.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-2.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-2.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-3.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-3.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-4.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-4.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-5.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-5.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-6.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-6.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-7.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-7.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-8.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-8.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-9.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-9.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-10.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-10.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-11.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-11.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2018-12.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2018-12.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-1.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-1.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-2.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-2.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-3.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-3.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-4.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-4.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-5.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-5.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-6.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-6.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-7.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-7.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-8.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-8.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-9.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-9.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-10.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-10.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-11.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-11.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2019-12.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2019-12.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-1.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-1.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-2.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-2.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-3.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-3.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-4.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-4.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-5.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-5.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-6.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-6.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-7.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-7.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-8.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-8.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-9.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-9.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-10.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-10.mbox.gz... Done
Uploading file: ../../data/raw/fedora-devel-list/fedora-devel-2020-11.mbox.gz to mcliffor/fedora_devel_mail/raw/fedora-devel-2020-11.mbox.gz... Done