Seldon deployment for build log clustering
In this notebook, we deploy a Seldon service for clustering build logs. First, we take the experiments from the build log clustering notebook and train a Sklearn pipeline that combines all of the components. Then, we save the model to S3 storage and deploy a Seldon service that uses the saved model. Finally, we test the service for inference on an example request.
[1]
import os
import pandas as pd
import requests
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
import boto3
import json
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
True
Load Dataset
[2]
# Note: periodic jobs only (see FIXME in class Builds)
job_name = "periodic-ci-openshift-release-master-ci-4.8-e2e-gcp"

logs_path = "../../../../data/raw/gcs/build-logs/"  # local cache of build log files
metadata_path = "../../../../data/raw/gcs/build-metadata/"  # path to saved metadata
metadata_file_name = os.path.join(metadata_path, f"{job_name}_build-logs.csv")


def log_path_for(build_id):
    return os.path.join(logs_path, f"{build_id}.txt")


def prow_url_for(build_id):
    project = "origin-ci-test"
    # FIXME: this prefix is only for periodic jobs
    job_prefix = f"logs/{job_name}/"
    return f"https://prow.ci.openshift.org/view/gcs/{project}/{job_prefix}{build_id}"


def clean_df(df):
    """Polishes the metadata DataFrame"""
    build_errors = df[df["result"] == "error"].index
    df.drop(build_errors, inplace=True)  # Remove builds that errored (Prow error)
    df["duration"] = df["end"] - df["start"]  # From timestamps to job duration
    df["success"] = df["result"] == "SUCCESS"  # A boolean version of the result
    return df


print("Reading metadata from", metadata_file_name)
df = pd.read_csv(metadata_file_name, index_col=0)
df = clean_df(df)
df
Reading metadata from ../../../../data/raw/gcs/build-metadata/periodic-ci-openshift-release-master-ci-4.8-e2e-gcp_build-logs.csv
| | result | size | start | end | duration | success |
|---|---|---|---|---|---|---|
| 1429152444788510720 | SUCCESS | 4135 | 1629571472 | 1629576480 | 5008 | True |
| 1455624937803878400 | SUCCESS | 4579 | 1635883006 | 1635887416 | 4410 | True |
| 1445593776872493056 | SUCCESS | 4397 | 1633491392 | 1633496097 | 4705 | True |
| 1417019048973045760 | SUCCESS | 4134 | 1626678644 | 1626683464 | 4820 | True |
| 1427589558375026688 | SUCCESS | 4133 | 1629198851 | 1629203489 | 4638 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 1464437854917627904 | SUCCESS | 4579 | 1637984169 | 1637988828 | 4659 | True |
| 1420899046205165568 | SUCCESS | 4131 | 1627603731 | 1627608772 | 5041 | True |
| 1410375749352820736 | FAILURE | 8880668 | 1625094759 | 1625100569 | 5810 | False |
| 1422945097544110080 | SUCCESS | 4133 | 1628091552 | 1628096732 | 5180 | True |
| 1462490101803126784 | SUCCESS | 4581 | 1637519789 | 1637524977 | 5188 | True |
1080 rows × 6 columns
[3]
# Read the local copy of each build log into a list
build_logs = []
for build_id in df.index:
    with open(log_path_for(build_id), "r") as f:
        build_logs.append(f.read())
Train Sklearn Pipeline
[4]
token_pattern = r"\b[a-z][a-z0-9_/\.-]+\b"
vectorizer = TfidfVectorizer(
    min_df=0.03,
    token_pattern=token_pattern,
)
k = 3
kmeans = KMeans(n_clusters=k, random_state=123)
pipeline = Pipeline([("tfidf", vectorizer), ("kmeans", kmeans)])
[5]
pipeline.fit(build_logs)
Pipeline(steps=[('tfidf',
TfidfVectorizer(min_df=0.03,
token_pattern='\\b[a-z][a-z0-9_/\\.-]+\\b')),
('kmeans', KMeans(n_clusters=3, random_state=123))])
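Before saving the pipeline, it can be useful to glance at what the clusters picked up. The snippet below is a minimal sketch (assuming the fitted `pipeline` above and a scikit-learn version that provides `get_feature_names_out`) that prints the highest-weighted TF-IDF terms for each cluster center:
import numpy as np

terms = pipeline.named_steps["tfidf"].get_feature_names_out()
centers = pipeline.named_steps["kmeans"].cluster_centers_
# For each cluster, print the ten terms with the largest weight in its center
for cluster_id, center in enumerate(centers):
    top_terms = [terms[i] for i in np.argsort(center)[::-1][:10]]
    print(cluster_id, top_terms)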
Save Pipeline
[6]
joblib.dump(pipeline, "model.joblib")
['model.joblib']
[7]
# Test set
test_set = [i for i in build_logs if len(i) < 5000][:25]
[8]
# Sanity check to see if the saved model works locally
pipeline_loaded = joblib.load("model.joblib")
pipeline_loaded
pipeline_loaded.predict(test_set)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
0, 0, 0], dtype=int32)
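Optionally, one can also verify that the reloaded pipeline reproduces the in-memory pipeline's predictions on the same inputs; a quick sketch:
import numpy as np

# The reloaded pipeline should produce exactly the same cluster labels
np.array_equal(pipeline.predict(test_set), pipeline_loaded.predict(test_set))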
[10]
# Set credentials for your s3 storage
s3_endpoint_url = os.getenv("S3_ENDPOINT")
aws_access_key_id = os.getenv("S3_ACCESS_KEY")
aws_secret_access_key = os.getenv("S3_SECRET_KEY")
s3_bucket = os.getenv("S3_BUCKET")
[13]
s3_resource = boto3.resource(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
bucket = s3_resource.Bucket(name=s3_bucket)
[14]
# Upload your model
bucket.upload_file(
    "model.joblib", "ai4ci/build-log-clustering/tfidf-kmeans/model.joblib"
)
# Check if your model exists on s3
objects = [
    obj.key for obj in bucket.objects.filter(Prefix="") if "model.joblib" in obj.key
]
objects
['ai4ci/build-log-clustering/tfidf-kmeans/model.joblib',
'ai4ci/failure-classifier/model.joblib',
'ai4ci/github-pr-ttm/model/model.joblib',
'github/ttm-model-raw-data/pipeline/model.joblib',
'github/ttm-model/model.joblib',
'github/ttm-model/pipeline/model.joblib']
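As an additional check, the uploaded artifact can be round-tripped: download it back from the bucket and load it locally. A small sketch using the same `bucket` handle (the local filename here is arbitrary):
bucket.download_file(
    "ai4ci/build-log-clustering/tfidf-kmeans/model.joblib", "model_from_s3.joblib"
)
pipeline_from_s3 = joblib.load("model_from_s3.joblib")
pipeline_from_s3.predict(test_set[:5])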
Test Seldon deployment service
We use the deployment config to deploy a Seldon service that serves the saved model (an illustrative sketch of the deployment object is shown below), and then query it with an example inference request.
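The deployment config itself lives outside this notebook. Purely for illustration, a SeldonDeployment for this model might look roughly like the structure below, written here as a Python dict (in practice it is a YAML manifest applied with `oc apply -f`). The bucket placeholder and secret name are assumptions, not the values actually used:
seldon_deployment = {
    "apiVersion": "machinelearning.seldon.io/v1",
    "kind": "SeldonDeployment",
    "metadata": {"name": "build-log-clustering"},
    "spec": {
        "predictors": [
            {
                "name": "default",
                "replicas": 1,
                "graph": {
                    "name": "classifier",
                    "implementation": "SKLEARN_SERVER",
                    # modelUri points at the prefix that contains model.joblib
                    "modelUri": "s3://<bucket>/ai4ci/build-log-clustering/tfidf-kmeans",
                    # credentials for the S3-compatible store, provided via a secret
                    "envSecretRefName": "<s3-credentials-secret>",
                },
            }
        ]
    },
}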
[15]
# Service url
base_url = "http://build-log-clustering-ds-ml-workflows-ws.apps.smaug.na.operate-first.cloud/predict"
[16]
# wrap the raw build logs in the ndarray payload format expected by seldon
data = {"data": {"ndarray": test_set}}
# create the query payload
json_data = json.dumps(data)
headers = {"Content-Type": "application/json"}
# query our inference service
response = requests.post(base_url, data=json_data, headers=headers)
response
<Response [200]>
[17]
response.json()
{'data': {'names': [],
  'ndarray': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]},
 'meta': {'requestPath': {'classifier': 'registry.connect.redhat.com/seldonio/sklearnserver@sha256:88d126455b150291cbb3772f67b4f35a88bb54b15ff7c879022f77fb051615ad'}}}
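The returned `ndarray` contains one cluster label per submitted log, in the same order as `test_set`. A quick way to summarize the service's predictions (a sketch reusing the `response` above):
labels = response.json()["data"]["ndarray"]
# Count how many of the submitted test logs fall into each cluster
pd.Series(labels).value_counts()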
Conclusion
In this notebook, we saw how to train and save an unsupervised model for clustering build logs. We then successfully deployed the model as a Seldon service on OpenShift, using S3 for model storage, and tested it with an example inference request.