Seldon deployment for build log clustering
In this notebook, we deploy a Seldon service for clustering build logs. First, we take the experiments from the build log clustering notebook and train a Sklearn pipeline that combines all of the components. Then, we save the model to S3 storage and deploy a Seldon service that uses the saved model. Finally, we test the service for inference on an example request.
[1]
import os
import pandas as pd
import requests
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import joblib
import boto3
import json
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
True
Load Dataset
[2]
# Note: periodic jobs only (see FIXME in class Builds)
job_name = "periodic-ci-openshift-release-master-ci-4.8-e2e-gcp"

logs_path = "../../../../data/raw/gcs/build-logs/"  # local cache of build log files
metadata_path = "../../../../data/raw/gcs/build-metadata/"  # path to saved metadata
metadata_file_name = os.path.join(metadata_path, f"{job_name}_build-logs.csv")


def log_path_for(build_id):
    return os.path.join(logs_path, f"{build_id}.txt")


def prow_url_for(build_id):
    project = "origin-ci-test"
    # FIXME: this prefix is only for periodic jobs
    job_prefix = f"logs/{job_name}/"
    return f"https://prow.ci.openshift.org/view/gcs/{project}/{job_prefix}{build_id}"


def clean_df(df):
    """Polishes the metadata DataFrame"""
    build_errors = df[df["result"] == "error"].index
    df.drop(build_errors, inplace=True)  # Remove builds that errored (Prow error)
    df["duration"] = df["end"] - df["start"]  # From timestamps to job duration
    df["success"] = df["result"] == "SUCCESS"  # A boolean version of the result
    return df


print("Reading metadata from", metadata_file_name)
df = pd.read_csv(metadata_file_name, index_col=0)
df = clean_df(df)
df
Reading metadata from ../../../../data/raw/gcs/build-metadata/periodic-ci-openshift-release-master-ci-4.8-e2e-gcp_build-logs.csv
| | result | size | start | end | duration | success |
|---|---|---|---|---|---|---|
| 1429152444788510720 | SUCCESS | 4135 | 1629571472 | 1629576480 | 5008 | True |
| 1455624937803878400 | SUCCESS | 4579 | 1635883006 | 1635887416 | 4410 | True |
| 1445593776872493056 | SUCCESS | 4397 | 1633491392 | 1633496097 | 4705 | True |
| 1417019048973045760 | SUCCESS | 4134 | 1626678644 | 1626683464 | 4820 | True |
| 1427589558375026688 | SUCCESS | 4133 | 1629198851 | 1629203489 | 4638 | True |
| ... | ... | ... | ... | ... | ... | ... |
| 1464437854917627904 | SUCCESS | 4579 | 1637984169 | 1637988828 | 4659 | True |
| 1420899046205165568 | SUCCESS | 4131 | 1627603731 | 1627608772 | 5041 | True |
| 1410375749352820736 | FAILURE | 8880668 | 1625094759 | 1625100569 | 5810 | False |
| 1422945097544110080 | SUCCESS | 4133 | 1628091552 | 1628096732 | 5180 | True |
| 1462490101803126784 | SUCCESS | 4581 | 1637519789 | 1637524977 | 5188 | True |
1080 rows × 6 columns
[3]
# Read the local copy of each build log into a list
build_logs = []
for build_id in df.index:
    with open(log_path_for(build_id), "r") as f:
        build_logs.append(f.read())
Train Sklearn Pipeline
[4]
token_pattern = r"\b[a-z][a-z0-9_/\.-]+\b"
vectorizer = TfidfVectorizer(
    min_df=0.03,
    token_pattern=token_pattern,
)
k = 3
kmeans = KMeans(n_clusters=k, random_state=123)
pipeline = Pipeline([("tfidf", vectorizer), ("kmeans", kmeans)])
[5]
pipeline.fit(build_logs)
Pipeline(steps=[('tfidf',
TfidfVectorizer(min_df=0.03,
token_pattern='\\b[a-z][a-z0-9_/\\.-]+\\b')),
('kmeans', KMeans(n_clusters=3, random_state=123))])
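Before saving the pipeline, it can be useful to glance at what the clusters picked up. The snippet below is a minimal sketch (assuming the fitted `pipeline` above and a scikit-learn version that provides `get_feature_names_out`) that prints the highest-weighted TF-IDF terms for each cluster center:
import numpy as np

terms = pipeline.named_steps["tfidf"].get_feature_names_out()
centers = pipeline.named_steps["kmeans"].cluster_centers_
# For each cluster, print the ten terms with the largest weight in its center
for cluster_id, center in enumerate(centers):
    top_terms = [terms[i] for i in np.argsort(center)[::-1][:10]]
    print(cluster_id, top_terms)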
Save Pipeline
[6]
joblib.dump(pipeline, "model.joblib")
['model.joblib']
[7]
# Test set
test_set = [i for i in build_logs if len(i) < 5000][:25]
[8]
# Sanity check to see if the saved model works locally
pipeline_loaded = joblib.load("model.joblib")
pipeline_loaded
pipeline_loaded.predict(test_set)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
0, 0, 0], dtype=int32)
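Optionally, one can also verify that the reloaded pipeline reproduces the in-memory pipeline's predictions on the same inputs; a quick sketch:
import numpy as np

# The reloaded pipeline should produce exactly the same cluster labels
np.array_equal(pipeline.predict(test_set), pipeline_loaded.predict(test_set))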
[10]
# Set credentials for your s3 storage
s3_endpoint_url = os.getenv("S3_ENDPOINT")
aws_access_key_id = os.getenv("S3_ACCESS_KEY")
aws_secret_access_key = os.getenv("S3_SECRET_KEY")
s3_bucket = os.getenv("S3_BUCKET")
[13]
s3_resource = boto3.resource(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
bucket = s3_resource.Bucket(name=s3_bucket)
[14]
# Upload your model
bucket.upload_file(
    "model.joblib", "ai4ci/build-log-clustering/tfidf-kmeans/model.joblib"
)
# Check if your model exists on s3
objects = [
    obj.key for obj in bucket.objects.filter(Prefix="") if "model.joblib" in obj.key
]
objects
['ai4ci/build-log-clustering/tfidf-kmeans/model.joblib',
'ai4ci/failure-classifier/model.joblib',
'ai4ci/github-pr-ttm/model/model.joblib',
'github/ttm-model-raw-data/pipeline/model.joblib',
'github/ttm-model/model.joblib',
'github/ttm-model/pipeline/model.joblib']
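As an additional check, the uploaded artifact can be round-tripped: download it back from the bucket and load it locally. A small sketch using the same `bucket` handle (the local filename here is arbitrary):
bucket.download_file(
    "ai4ci/build-log-clustering/tfidf-kmeans/model.joblib", "model_from_s3.joblib"
)
pipeline_from_s3 = joblib.load("model_from_s3.joblib")
pipeline_from_s3.predict(test_set[:5])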
Test Seldon deployment service
We use the deployment config to deploy a Seldon service that serves the saved model (an illustrative sketch of the deployment object is shown below), and then query it with an example inference request.
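The deployment config itself lives outside this notebook. Purely for illustration, a SeldonDeployment for this model might look roughly like the structure below, written here as a Python dict (in practice it is a YAML manifest applied with `oc apply -f`). The bucket placeholder and secret name are assumptions, not the values actually used:
seldon_deployment = {
    "apiVersion": "machinelearning.seldon.io/v1",
    "kind": "SeldonDeployment",
    "metadata": {"name": "build-log-clustering"},
    "spec": {
        "predictors": [
            {
                "name": "default",
                "replicas": 1,
                "graph": {
                    "name": "classifier",
                    "implementation": "SKLEARN_SERVER",
                    # modelUri points at the prefix that contains model.joblib
                    "modelUri": "s3://<bucket>/ai4ci/build-log-clustering/tfidf-kmeans",
                    # credentials for the S3-compatible store, provided via a secret
                    "envSecretRefName": "<s3-credentials-secret>",
                },
            }
        ]
    },
}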
[15]
# Service url
base_url = "http://build-log-clustering-ds-ml-workflows-ws.apps.smaug.na.operate-first.cloud/predict"
[16]
# wrap the raw build logs in the ndarray payload format expected by seldon
data = {"data": {"ndarray": test_set}}
# create the query payload
json_data = json.dumps(data)
headers = {"Content-Type": "application/json"}
# query our inference service
response = requests.post(base_url, data=json_data, headers=headers)
response
<Response [200]>
[17]
response.json()
{'data': {'names': [],
  'ndarray': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]},
 'meta': {'requestPath': {'classifier': 'registry.connect.redhat.com/seldonio/sklearnserver@sha256:88d126455b150291cbb3772f67b4f35a88bb54b15ff7c879022f77fb051615ad'}}}
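The returned `ndarray` contains one cluster label per submitted log, in the same order as `test_set`. A quick way to summarize the service's predictions (a sketch reusing the `response` above):
labels = response.json()["data"]["ndarray"]
# Count how many of the submitted test logs fall into each cluster
pd.Series(labels).value_counts()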
Conclusion
In this notebook, we saw how to train and save an unsupervised model for clustering build logs. We then successfully deployed the model as a Seldon service on OpenShift, using S3 for model storage, and tested it with an example inference request.