Step 1: Exploring Data
The purpose of this notebook is to do some basic data exploration. The goal is to better understand the data, so we can better create the training pipeline. This notebook attempts to answer questions such as how many unique models, how many unique vendors, missing data, correlation of the covariates and label, any interesting patterns in the SMART stats over time, etc.
import os
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from joblib import parallel_backend
from dask.distributed import Client
from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from src import utils
# notebook-wide configuration: plot styling, wide-dataframe display,
# and a global dask progress bar
sns.set()
# show wide dataframes without truncation when rendered in the notebook
for _opt, _val in (("display.max_rows", 75), ("display.max_columns", 500)):
    pd.set_option(_opt, _val)
# registering the progress bar makes every dask .compute() report progress
pbar = ProgressBar()
pbar.register()
# inferred int32 types cause a type mismatch (int vs float) error when dask sees a null value
# null values cannot be interpreted as ints
# identifier columns stay as objects; every numeric column is read as
# float32 so nulls are representable as NaN
custom_dtypes = {
    "date": "object",
    "serial_number": "object",
    "model": "object",
    "capacity_bytes": "float32",
    "failure": "float32",
}
# SMART attribute ids that appear in the Backblaze data; each contributes
# a *_normalized and a *_raw column (normalized listed first, matching the
# column order in the raw files)
_smart_ids = (
    1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 22, 23, 24,
    168, 170, 173, 174, 177, 179, 181, 182, 183, 184, 187, 188, 189,
    190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 218,
    220, 222, 223, 224, 225, 226, 231, 232, 233, 235, 240, 241, 242,
    250, 251, 252, 254, 255,
)
for _sid in _smart_ids:
    custom_dtypes["smart_{}_normalized".format(_sid)] = "float32"
    custom_dtypes["smart_{}_raw".format(_sid)] = "float32"
# read all the (local) data into one dataframe
# each quarter of 2018 lives in its own parquet directory; parquet files
# carry their own schema, so `custom_dtypes` is not passed here
# (presumably it is intended for csv ingestion elsewhere — confirm)
DATA_ROOT_DIR = "/home/kachauha/Downloads/"
df4 = dd.read_parquet(
    os.path.join(DATA_ROOT_DIR, "data_Q4_2018_parquet"), engine="pyarrow", index=False
)
df3 = dd.read_parquet(
    os.path.join(DATA_ROOT_DIR, "data_Q3_2018_parquet"), engine="pyarrow", index=False
)
df2 = dd.read_parquet(
    os.path.join(DATA_ROOT_DIR, "data_Q2_2018_parquet"), engine="pyarrow", index=False
)
df1 = dd.read_parquet(
    os.path.join(DATA_ROOT_DIR, "data_Q1_2018_parquet"), engine="pyarrow", index=False
)
# concatenate chronologically (Q1 first) and rebalance partitions
# NOTE(review): utils.optimal_repartition_df presumably repartitions to
# even partition sizes — confirm in src/utils
df = utils.optimal_repartition_df(
    dd.concat(dfs=[df1, df2, df3, df4], interleave_partitions=True)
)
[########################################] | 100% Completed | 1min 32.4s
# split into working vs failed drives serial number
# failed drives: any serial with at least one failure == 1 record
failed_sers = df[df["failure"] == 1][["serial_number", "model"]].compute()
# working drives: serials that never appear in the failed set; one row per
# serial (drop_duplicates), since each drive is reported daily
working_sers = (
    df[~df["serial_number"].isin(failed_sers["serial_number"])][
        ["serial_number", "model"]
    ]
    .drop_duplicates()
    .compute()
)
# sanity check - make sure we havent labelled a serial number as BOTH working and failed
# BUGFIX: use `not` instead of `~` — if .any() returns a plain Python bool,
# both ~True (-2) and ~False (-1) are truthy, making the assert vacuous;
# `not` is correct for both numpy and Python bools
assert not working_sers["serial_number"].isin(failed_sers["serial_number"]).any()
# how many of each class
print(f"{len(failed_sers)} failed drives, {len(working_sers)} working drives")
[########################################] | 100% Completed | 34.3s
[########################################] | 100% Completed | 52.2s
1381 failed drives, 122948 working drives
Manufacturer-level and Model-level analysis
The values of SMART stats and whether or not they are reported varies from vendor to vendor. Furthermore, simply including vendor as a feature may or may not work for all kinds of prediction models. Therefore, it may be a good idea to analyze the data in a vendor specific way.
# split data into vendors, print how many data points available for each vendor
# NOTE(review): matching on model-name prefixes is fragile — "Samsung SSD ..."
# and "Seagate BarraCuda ..." models also start with "S" and therefore land
# in seagate_df (the model table below indeed shows Samsung counted under
# Seagate) — confirm whether that is intended
seagate_df = df[df["model"].str.startswith("S")]
print(dd.compute(seagate_df.shape))
hgst_df = df[df["model"].str.startswith("HG")]
print(dd.compute(hgst_df.shape))
toshiba_df = df[df["model"].str.startswith("T")]
print(dd.compute(toshiba_df.shape))
wdc_df = df[df["model"].str.startswith("W")]
print(dd.compute(wdc_df.shape))
# "Hi" (not "H") so Hitachi drives are not confused with HGST drives
hitachi_df = df[df["model"].str.startswith("Hi")]
print(dd.compute(hitachi_df.shape))
[########################################] | 100% Completed | 46.9s
((28004260, 129),)
[########################################] | 100% Completed | 44.2s
((7741899, 129),)
[########################################] | 100% Completed | 43.0s
((482251, 129),)
[########################################] | 100% Completed | 42.9s
((348371, 129),)
[########################################] | 100% Completed | 41.8s
((123725, 129),)
The cell below determines how big of a chunk of the total dataset does each hard drive model comprise. Also, what percent of that model failed.
# how many drives by model, and what is the fail percentage
# value_counts().to_frame(...).reset_index() puts the model names in a
# column named "index" (older pandas), which the outer merge keys on
# NOTE(review): on newer pandas reset_index names this column "model"
# instead, which would make the rename below a no-op (but the merge still
# works) — verify against the pinned pandas version
model_stats = pd.merge(
    failed_sers["model"].value_counts().to_frame("failed_count").reset_index(),
    working_sers["model"].value_counts().to_frame("working_count").reset_index(),
    how="outer",
).fillna(0)
# rename the index column as the model column
model_stats.rename(columns={"index": "model"}, inplace=True)
# total count of model, raw value and as a percent of total drives
model_stats["total_count"] = model_stats["failed_count"] + model_stats["working_count"]
model_stats["total_percent"] = np.around(
    100 * model_stats["total_count"] / model_stats["total_count"].sum(), decimals=2
)
# percentage of instances that have failed, per model
model_stats["fail_percent"] = np.around(
    100 * model_stats["failed_count"] / model_stats["total_count"], decimals=2
)
# add manufacturer/vendor column
# NOTE(review): utils.get_vendor maps a model string to a vendor name —
# the table below shows it maps Samsung models to Seagate; confirm
model_stats["vendor"] = model_stats["model"].apply(utils.get_vendor)
model_stats.sort_values(by=["total_count"], ascending=False)
model | failed_count | working_count | total_count | total_percent | fail_percent | vendor | |
---|---|---|---|---|---|---|---|
1 | ST12000NM0007 | 297.0 | 32341.0 | 32638.0 | 26.25 | 0.91 | Seagate |
0 | ST4000DM000 | 581.0 | 31583.0 | 32164.0 | 25.87 | 1.81 | Seagate |
4 | HGST HMS5C4040BLE640 | 54.0 | 15328.0 | 15382.0 | 12.37 | 0.35 | HGST |
2 | ST8000NM0055 | 126.0 | 14383.0 | 14509.0 | 11.67 | 0.87 | Seagate |
3 | ST8000DM002 | 92.0 | 9873.0 | 9965.0 | 8.02 | 0.92 | Seagate |
7 | HGST HMS5C4040ALE640 | 24.0 | 6531.0 | 6555.0 | 5.27 | 0.37 | HGST |
31 | Hitachi HDS5C4040ALE630 | 0.0 | 2296.0 | 2296.0 | 1.85 | 0.00 | Hitachi |
8 | ST6000DX000 | 17.0 | 1865.0 | 1882.0 | 1.51 | 0.90 | Seagate |
12 | TOSHIBA MG07ACA14TA | 9.0 | 1270.0 | 1279.0 | 1.03 | 0.70 | Toshiba |
28 | HGST HUH721212ALN604 | 1.0 | 1278.0 | 1279.0 | 1.03 | 0.08 | HGST |
18 | ST10000NM0086 | 4.0 | 1224.0 | 1228.0 | 0.99 | 0.33 | Seagate |
14 | HGST HUH728080ALE600 | 7.0 | 1045.0 | 1052.0 | 0.85 | 0.67 | HGST |
6 | ST500LM012 HN | 41.0 | 668.0 | 709.0 | 0.57 | 5.78 | Seagate |
5 | TOSHIBA MQ01ABF050 | 50.0 | 525.0 | 575.0 | 0.46 | 8.70 | Toshiba |
11 | WDC WD60EFRX | 9.0 | 428.0 | 437.0 | 0.35 | 2.06 | WDC |
30 | ST4000DM001 | 1.0 | 391.0 | 392.0 | 0.32 | 0.26 | Seagate |
15 | TOSHIBA MQ01ABF050M | 6.0 | 375.0 | 381.0 | 0.31 | 1.57 | Toshiba |
9 | WDC WD5000LPVX | 16.0 | 301.0 | 317.0 | 0.25 | 5.05 | WDC |
17 | WDC WD30EFRX | 4.0 | 180.0 | 184.0 | 0.15 | 2.17 | WDC |
25 | TOSHIBA MD04ABA400V | 1.0 | 145.0 | 146.0 | 0.12 | 0.68 | Toshiba |
13 | ST500LM030 | 7.0 | 111.0 | 118.0 | 0.09 | 5.93 | Seagate |
16 | Hitachi HDS722020ALA330 | 5.0 | 104.0 | 109.0 | 0.09 | 4.59 | Hitachi |
24 | HGST HDS5C4040ALE630 | 1.0 | 96.0 | 97.0 | 0.08 | 1.03 | HGST |
19 | ST4000DM005 | 4.0 | 85.0 | 89.0 | 0.07 | 4.49 | Seagate |
32 | WDC WD5000LPCX | 0.0 | 56.0 | 56.0 | 0.05 | 0.00 | WDC |
33 | WDC WD40EFRX | 0.0 | 46.0 | 46.0 | 0.04 | 0.00 | WDC |
34 | ST9250315AS | 0.0 | 45.0 | 45.0 | 0.04 | 0.00 | Seagate |
35 | TOSHIBA MD04ABA500V | 0.0 | 45.0 | 45.0 | 0.04 | 0.00 | Toshiba |
26 | WDC WD1600AAJS | 1.0 | 42.0 | 43.0 | 0.03 | 2.33 | WDC |
22 | HGST HUS726040ALE610 | 2.0 | 37.0 | 39.0 | 0.03 | 5.13 | HGST |
36 | ST31500541AS | 0.0 | 39.0 | 39.0 | 0.03 | 0.00 | Seagate |
23 | ST8000DM005 | 2.0 | 25.0 | 27.0 | 0.02 | 7.41 | Seagate |
37 | ST250LM004 HN | 0.0 | 24.0 | 24.0 | 0.02 | 0.00 | Seagate |
29 | WDC WD5000BPKT | 1.0 | 20.0 | 21.0 | 0.02 | 4.76 | WDC |
38 | TOSHIBA HDWF180 | 0.0 | 20.0 | 20.0 | 0.02 | 0.00 | Toshiba |
39 | ST9320325AS | 0.0 | 19.0 | 19.0 | 0.02 | 0.00 | Seagate |
40 | ST6000DM001 | 0.0 | 15.0 | 15.0 | 0.01 | 0.00 | Seagate |
41 | Seagate BarraCuda SSD ZA500CM10002 | 0.0 | 13.0 | 13.0 | 0.01 | 0.00 | Seagate |
42 | ST3160316AS | 0.0 | 12.0 | 12.0 | 0.01 | 0.00 | Seagate |
10 | Samsung SSD 850 EVO 1TB | 10.0 | 0.0 | 10.0 | 0.01 | 100.00 | Seagate |
43 | TOSHIBA HDWE160 | 0.0 | 10.0 | 10.0 | 0.01 | 0.00 | Toshiba |
44 | HGST HUH721010ALE600 | 0.0 | 10.0 | 10.0 | 0.01 | 0.00 | HGST |
45 | ST320LT007 | 0.0 | 9.0 | 9.0 | 0.01 | 0.00 | Seagate |
21 | ST8000DM004 | 3.0 | 4.0 | 7.0 | 0.01 | 42.86 | Seagate |
20 | ST4000DX002 | 4.0 | 1.0 | 5.0 | 0.00 | 80.00 | Seagate |
46 | ST3160318AS | 0.0 | 5.0 | 5.0 | 0.00 | 0.00 | Seagate |
47 | WDC WD2500BPVT | 0.0 | 5.0 | 5.0 | 0.00 | 0.00 | WDC |
49 | WDC WD2500AAJS | 0.0 | 3.0 | 3.0 | 0.00 | 0.00 | WDC |
48 | ST6000DM004 | 0.0 | 3.0 | 3.0 | 0.00 | 0.00 | Seagate |
50 | WDC WD3200BEKX | 0.0 | 2.0 | 2.0 | 0.00 | 0.00 | WDC |
51 | WDC WD3200AAJS | 0.0 | 2.0 | 2.0 | 0.00 | 0.00 | WDC |
52 | Hitachi HDS724040ALE640 | 0.0 | 2.0 | 2.0 | 0.00 | 0.00 | Hitachi |
27 | Hitachi HDS5C3030ALA630 | 1.0 | 0.0 | 1.0 | 0.00 | 100.00 | Hitachi |
53 | WDC WD3200LPVX | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | WDC |
54 | HGST HMS5C4040BLE641 | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | HGST |
55 | ST1000LM024 HN | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | Seagate |
56 | WDC WD3200AAKS | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | WDC |
57 | HGST HDS724040ALE640 | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | HGST |
58 | WDC WD1600BPVT | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | WDC |
59 | Seagate BarraCuda SSD ZA2000CM10002 | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | Seagate |
60 | ST33000651AS | 0.0 | 1.0 | 1.0 | 0.00 | 0.00 | Seagate |
NOTE: Although there are 61 different models of hard drives being used (see the table above), these are coming from only 5 unique vendors.
The cell below tries to find if the fail/work ratio is greater in some manufacturers than others. It also gives an idea of how much data do we have from each manufacturer.
# how many failed drives in each vendor
for vendor_df in (seagate_df, hgst_df, toshiba_df, wdc_df, hitachi_df):
    # a drive carries failure == 1 only on the day it fails, so summing
    # the indicator approximates the number of failed drives
    n_failed = (vendor_df["failure"] == 1).sum().compute()
    # working drives = unique serials minus the failed ones
    n_working = vendor_df["serial_number"].nunique().compute() - n_failed
    print(f"{n_failed} failed drives\n{n_working} working drives")
    print(
        "================================================================================"
    )
[########################################] | 100% Completed | 46.3s
[########################################] | 100% Completed | 53.0s
1189 failed drives
92739 working drives
================================================================================
[########################################] | 100% Completed | 44.4s
[########################################] | 100% Completed | 45.8s
89 failed drives
24327 working drives
================================================================================
[########################################] | 100% Completed | 42.6s
[########################################] | 100% Completed | 42.5s
66 failed drives
2389 working drives
================================================================================
[########################################] | 100% Completed | 42.6s
[########################################] | 100% Completed | 42.4s
31 failed drives
1088 working drives
================================================================================
[########################################] | 100% Completed | 42.3s
[########################################] | 100% Completed | 42.2s
6 failed drives
2402 working drives
================================================================================
# get vendor-wise stats for failed vs working
# numeric_only=True keeps the string "model" column out of the aggregation
# (otherwise newer pandas concatenates the model names into a garbage column)
vendor_stats = model_stats.groupby("vendor").sum(numeric_only=True)
# summing per-model fail percentages is meaningless, so overwrite
# fail_percent with the vendor-level ratio of failed to total drives
vendor_stats["fail_percent"] = np.around(
    100
    * vendor_stats["failed_count"]
    / (vendor_stats["failed_count"] + vendor_stats["working_count"]),
    decimals=2,
)
vendor_stats
failed_count | working_count | total_count | total_percent | fail_percent | |
---|---|---|---|---|---|
vendor | |||||
HGST | 89.0 | 24327.0 | 24416.0 | 19.64 | 0.36 |
Hitachi | 6.0 | 2402.0 | 2408.0 | 1.94 | 0.25 |
Seagate | 1189.0 | 92741.0 | 93930.0 | 75.55 | 1.27 |
Toshiba | 66.0 | 2390.0 | 2456.0 | 1.99 | 2.69 |
WDC | 31.0 | 1088.0 | 1119.0 | 0.89 | 2.77 |
# within drives of a manufacturer, what is the distribution of models like
for vendor_df in (seagate_df, hgst_df, toshiba_df, wdc_df, hitachi_df):
    # fraction of the vendor's records contributed by each model
    model_counts = vendor_df["model"].value_counts().compute()
    sns.barplot(x=model_counts.index, y=model_counts.values / model_counts.sum())
    plt.xticks(rotation=90)
    plt.show()
[########################################] | 100% Completed | 50.9s
[########################################] | 100% Completed | 44.6s
[########################################] | 100% Completed | 44.0s
[########################################] | 100% Completed | 47.4s
[########################################] | 100% Completed | 46.2s
NOTE: From the above barplots, it can be seen that only a few models comprise a huge chunk of all the data from a manufacturer
Analyze Critical Stats
These are the columns specified by wikipedia, backblaze, and IBM research as good predictors.
# SMART attributes flagged by wikipedia, backblaze, and IBM research as
# good failure predictors
CRITICAL_STATS = [
    1, 5, 7, 10,
    184, 187, 188, 189, 190,
    193, 194, 196, 197, 198,
    201, 240, 241, 242,
]
# NOTE - THESE LISTS ARE SUBJECT TO CHANGE
# raw and normalized column names for the critical attributes
crit_cols_raw = [f"smart_{stat}_raw" for stat in CRITICAL_STATS]
crit_cols_normalized = [f"smart_{stat}_normalized" for stat in CRITICAL_STATS]
SMART Stats Descriptions
Source: Wikipedia
Putting descriptions here to help better make sense of the results that follow.
184 = end to end error / ioedc : This attribute is a part of Hewlett-Packard's SMART IV technology, as well as part of other vendors' IO Error Detection and Correction schemas, and it contains a count of parity errors which occur in the data path to the media via the drive's cache RAM.
187 = reported uncorrectable errors : The count of errors that could not be recovered using hardware ECC (see attribute 195)
188 = command timeout : The count of aborted operations due to HDD timeout. Normally this attribute value should be equal to zero.
189 = high fly write : This feature is implemented in most modern Seagate drives and some of Western Digital's drives, beginning with the WD Enterprise WDE18300 and WDE9180 Ultra2 SCSI hard drives, and will be included on all future WD Enterprise products.
190 = temp diff / airflow diff : Value is equal to (100-temp. °C), allowing manufacturer to set a minimum threshold which corresponds to a maximum temperature. This also follows the convention of 100 being a best-case value and lower values being undesirable. However, some older drives may instead report raw Temperature (identical to 0xC2) or Temperature minus 50 here.
196 = reallocation event count : Count of remap operations. The raw value of this attribute shows the total count of attempts to transfer data from reallocated sectors to a spare area. Both successful and unsuccessful attempts are counted
201 = soft read error rate or TA counter detected : Count indicates the number of uncorrectable software read errors.
NaN Counts
# number of nans
# count and fraction of missing values per critical column, across all vendors
print("All data")
nan_count = utils.get_nan_count_percent(
    df[crit_cols_raw + crit_cols_normalized]
).compute()
nan_count
All data
[########################################] | 100% Completed | 37.9s
[########################################] | 100% Completed | 36.4s
count | percent | |
---|---|---|
smart_1_raw | 1069 | 0.000029 |
smart_5_raw | 1335 | 0.000036 |
smart_7_raw | 1569 | 0.000043 |
smart_10_raw | 1569 | 0.000043 |
smart_184_raw | 16714033 | 0.455417 |
smart_187_raw | 8930415 | 0.243332 |
smart_188_raw | 8930649 | 0.243339 |
smart_189_raw | 16714033 | 0.455417 |
smart_190_raw | 8930415 | 0.243332 |
smart_193_raw | 240980 | 0.006566 |
smart_194_raw | 1069 | 0.000029 |
smart_196_raw | 27771426 | 0.756704 |
smart_197_raw | 1569 | 0.000043 |
smart_198_raw | 1569 | 0.000043 |
smart_201_raw | 36700506 | 1.000000 |
smart_240_raw | 8429261 | 0.229677 |
smart_241_raw | 8937648 | 0.243529 |
smart_242_raw | 8937882 | 0.243536 |
smart_1_normalized | 1069 | 0.000029 |
smart_5_normalized | 1335 | 0.000036 |
smart_7_normalized | 1569 | 0.000043 |
smart_10_normalized | 1569 | 0.000043 |
smart_184_normalized | 16714033 | 0.455417 |
smart_187_normalized | 8930415 | 0.243332 |
smart_188_normalized | 8930649 | 0.243339 |
smart_189_normalized | 16714033 | 0.455417 |
smart_190_normalized | 8930415 | 0.243332 |
smart_193_normalized | 240980 | 0.006566 |
smart_194_normalized | 1069 | 0.000029 |
smart_196_normalized | 27771426 | 0.756704 |
smart_197_normalized | 1569 | 0.000043 |
smart_198_normalized | 1569 | 0.000043 |
smart_201_normalized | 36700506 | 1.000000 |
smart_240_normalized | 8429261 | 0.229677 |
smart_241_normalized | 8937648 | 0.243529 |
smart_242_normalized | 8937882 | 0.243536 |
Some vendors may not provide certain SMART stats. This could be one possible explanation for the high number of nans. If this is the case, then that particular feature will not be helpful for predicting status of a drive from that vendor. Lets get the vendor-wise nans
print("Seagate")
seagate_nan_ct = utils.get_nan_count_percent(
seagate_df[crit_cols_raw + crit_cols_normalized]
).compute()
seagate_nan_ct
Seagate
[########################################] | 100% Completed | 47.7s
[########################################] | 100% Completed | 48.2s
count | percent | |
---|---|---|
smart_1_raw | 970 | 0.000035 |
smart_5_raw | 1236 | 0.000044 |
smart_7_raw | 1470 | 0.000052 |
smart_10_raw | 1470 | 0.000052 |
smart_184_raw | 8017787 | 0.286306 |
smart_187_raw | 234169 | 0.008362 |
smart_188_raw | 234403 | 0.008370 |
smart_189_raw | 8017787 | 0.286306 |
smart_190_raw | 234169 | 0.008362 |
smart_193_raw | 240881 | 0.008602 |
smart_194_raw | 970 | 0.000035 |
smart_196_raw | 27771327 | 0.991682 |
smart_197_raw | 1470 | 0.000052 |
smart_198_raw | 1470 | 0.000052 |
smart_201_raw | 28004260 | 1.000000 |
smart_240_raw | 246153 | 0.008790 |
smart_241_raw | 245419 | 0.008764 |
smart_242_raw | 245653 | 0.008772 |
smart_1_normalized | 970 | 0.000035 |
smart_5_normalized | 1236 | 0.000044 |
smart_7_normalized | 1470 | 0.000052 |
smart_10_normalized | 1470 | 0.000052 |
smart_184_normalized | 8017787 | 0.286306 |
smart_187_normalized | 234169 | 0.008362 |
smart_188_normalized | 234403 | 0.008370 |
smart_189_normalized | 8017787 | 0.286306 |
smart_190_normalized | 234169 | 0.008362 |
smart_193_normalized | 240881 | 0.008602 |
smart_194_normalized | 970 | 0.000035 |
smart_196_normalized | 27771327 | 0.991682 |
smart_197_normalized | 1470 | 0.000052 |
smart_198_normalized | 1470 | 0.000052 |
smart_201_normalized | 28004260 | 1.000000 |
smart_240_normalized | 246153 | 0.008790 |
smart_241_normalized | 245419 | 0.008764 |
smart_242_normalized | 245653 | 0.008772 |
print("WDC")
wdc_nan_ct = utils.get_nan_count_percent(
wdc_df[crit_cols_raw + crit_cols_normalized]
).compute()
wdc_nan_ct
WDC
[########################################] | 100% Completed | 43.3s
[########################################] | 100% Completed | 43.4s
count | percent | |
---|---|---|
smart_1_raw | 0 | 0.000000 |
smart_5_raw | 0 | 0.000000 |
smart_7_raw | 0 | 0.000000 |
smart_10_raw | 0 | 0.000000 |
smart_184_raw | 348371 | 1.000000 |
smart_187_raw | 348371 | 1.000000 |
smart_188_raw | 348371 | 1.000000 |
smart_189_raw | 348371 | 1.000000 |
smart_190_raw | 348371 | 1.000000 |
smart_193_raw | 0 | 0.000000 |
smart_194_raw | 0 | 0.000000 |
smart_196_raw | 0 | 0.000000 |
smart_197_raw | 0 | 0.000000 |
smart_198_raw | 0 | 0.000000 |
smart_201_raw | 348371 | 1.000000 |
smart_240_raw | 317479 | 0.911324 |
smart_241_raw | 344354 | 0.988469 |
smart_242_raw | 344354 | 0.988469 |
smart_1_normalized | 0 | 0.000000 |
smart_5_normalized | 0 | 0.000000 |
smart_7_normalized | 0 | 0.000000 |
smart_10_normalized | 0 | 0.000000 |
smart_184_normalized | 348371 | 1.000000 |
smart_187_normalized | 348371 | 1.000000 |
smart_188_normalized | 348371 | 1.000000 |
smart_189_normalized | 348371 | 1.000000 |
smart_190_normalized | 348371 | 1.000000 |
smart_193_normalized | 0 | 0.000000 |
smart_194_normalized | 0 | 0.000000 |
smart_196_normalized | 0 | 0.000000 |
smart_197_normalized | 0 | 0.000000 |
smart_198_normalized | 0 | 0.000000 |
smart_201_normalized | 348371 | 1.000000 |
smart_240_normalized | 317479 | 0.911324 |
smart_241_normalized | 344354 | 0.988469 |
smart_242_normalized | 344354 | 0.988469 |
print("HGST")
hgst_nan_ct = utils.get_nan_count_percent(
hgst_df[crit_cols_raw + crit_cols_normalized]
).compute()
hgst_nan_ct
HGST
[########################################] | 100% Completed | 46.6s
[########################################] | 100% Completed | 47.2s
count | percent | |
---|---|---|
smart_1_raw | 94 | 0.000012 |
smart_5_raw | 94 | 0.000012 |
smart_7_raw | 94 | 0.000012 |
smart_10_raw | 94 | 0.000012 |
smart_184_raw | 7741899 | 1.000000 |
smart_187_raw | 7741899 | 1.000000 |
smart_188_raw | 7741899 | 1.000000 |
smart_189_raw | 7741899 | 1.000000 |
smart_190_raw | 7741899 | 1.000000 |
smart_193_raw | 94 | 0.000012 |
smart_194_raw | 94 | 0.000012 |
smart_196_raw | 94 | 0.000012 |
smart_197_raw | 94 | 0.000012 |
smart_198_raw | 94 | 0.000012 |
smart_201_raw | 7741899 | 1.000000 |
smart_240_raw | 7741899 | 1.000000 |
smart_241_raw | 7741899 | 1.000000 |
smart_242_raw | 7741899 | 1.000000 |
smart_1_normalized | 94 | 0.000012 |
smart_5_normalized | 94 | 0.000012 |
smart_7_normalized | 94 | 0.000012 |
smart_10_normalized | 94 | 0.000012 |
smart_184_normalized | 7741899 | 1.000000 |
smart_187_normalized | 7741899 | 1.000000 |
smart_188_normalized | 7741899 | 1.000000 |
smart_189_normalized | 7741899 | 1.000000 |
smart_190_normalized | 7741899 | 1.000000 |
smart_193_normalized | 94 | 0.000012 |
smart_194_normalized | 94 | 0.000012 |
smart_196_normalized | 94 | 0.000012 |
smart_197_normalized | 94 | 0.000012 |
smart_198_normalized | 94 | 0.000012 |
smart_201_normalized | 7741899 | 1.000000 |
smart_240_normalized | 7741899 | 1.000000 |
smart_241_normalized | 7741899 | 1.000000 |
smart_242_normalized | 7741899 | 1.000000 |
print("Hitachi")
hitachi_nan_ct = utils.get_nan_count_percent(
hitachi_df[crit_cols_raw + crit_cols_normalized]
).compute()
hitachi_nan_ct
Hitachi
[########################################] | 100% Completed | 43.4s
[########################################] | 100% Completed | 42.9s
count | percent | |
---|---|---|
smart_1_raw | 0 | 0.0 |
smart_5_raw | 0 | 0.0 |
smart_7_raw | 0 | 0.0 |
smart_10_raw | 0 | 0.0 |
smart_184_raw | 123725 | 1.0 |
smart_187_raw | 123725 | 1.0 |
smart_188_raw | 123725 | 1.0 |
smart_189_raw | 123725 | 1.0 |
smart_190_raw | 123725 | 1.0 |
smart_193_raw | 0 | 0.0 |
smart_194_raw | 0 | 0.0 |
smart_196_raw | 0 | 0.0 |
smart_197_raw | 0 | 0.0 |
smart_198_raw | 0 | 0.0 |
smart_201_raw | 123725 | 1.0 |
smart_240_raw | 123725 | 1.0 |
smart_241_raw | 123725 | 1.0 |
smart_242_raw | 123725 | 1.0 |
smart_1_normalized | 0 | 0.0 |
smart_5_normalized | 0 | 0.0 |
smart_7_normalized | 0 | 0.0 |
smart_10_normalized | 0 | 0.0 |
smart_184_normalized | 123725 | 1.0 |
smart_187_normalized | 123725 | 1.0 |
smart_188_normalized | 123725 | 1.0 |
smart_189_normalized | 123725 | 1.0 |
smart_190_normalized | 123725 | 1.0 |
smart_193_normalized | 0 | 0.0 |
smart_194_normalized | 0 | 0.0 |
smart_196_normalized | 0 | 0.0 |
smart_197_normalized | 0 | 0.0 |
smart_198_normalized | 0 | 0.0 |
smart_201_normalized | 123725 | 1.0 |
smart_240_normalized | 123725 | 1.0 |
smart_241_normalized | 123725 | 1.0 |
smart_242_normalized | 123725 | 1.0 |
print("Toshiba")
toshiba_nan_ct = utils.get_nan_count_percent(
toshiba_df[crit_cols_raw + crit_cols_normalized]
).compute()
toshiba_nan_ct
Toshiba
[########################################] | 100% Completed | 43.8s
[########################################] | 100% Completed | 46.3s
count | percent | |
---|---|---|
smart_1_raw | 5 | 0.00001 |
smart_5_raw | 5 | 0.00001 |
smart_7_raw | 5 | 0.00001 |
smart_10_raw | 5 | 0.00001 |
smart_184_raw | 482251 | 1.00000 |
smart_187_raw | 482251 | 1.00000 |
smart_188_raw | 482251 | 1.00000 |
smart_189_raw | 482251 | 1.00000 |
smart_190_raw | 482251 | 1.00000 |
smart_193_raw | 5 | 0.00001 |
smart_194_raw | 5 | 0.00001 |
smart_196_raw | 5 | 0.00001 |
smart_197_raw | 5 | 0.00001 |
smart_198_raw | 5 | 0.00001 |
smart_201_raw | 482251 | 1.00000 |
smart_240_raw | 5 | 0.00001 |
smart_241_raw | 482251 | 1.00000 |
smart_242_raw | 482251 | 1.00000 |
smart_1_normalized | 5 | 0.00001 |
smart_5_normalized | 5 | 0.00001 |
smart_7_normalized | 5 | 0.00001 |
smart_10_normalized | 5 | 0.00001 |
smart_184_normalized | 482251 | 1.00000 |
smart_187_normalized | 482251 | 1.00000 |
smart_188_normalized | 482251 | 1.00000 |
smart_189_normalized | 482251 | 1.00000 |
smart_190_normalized | 482251 | 1.00000 |
smart_193_normalized | 5 | 0.00001 |
smart_194_normalized | 5 | 0.00001 |
smart_196_normalized | 5 | 0.00001 |
smart_197_normalized | 5 | 0.00001 |
smart_198_normalized | 5 | 0.00001 |
smart_201_normalized | 482251 | 1.00000 |
smart_240_normalized | 5 | 0.00001 |
smart_241_normalized | 482251 | 1.00000 |
smart_242_normalized | 482251 | 1.00000 |
NOTE: are the nans meaningless or do they imply 0?
This will be analyzed in the data_cleaner_*.ipynb
notebooks.
# let's see if there are unusual amounts of nan's within each model too
# if there is, then that means there was probably something wrong with data collection
top_models_seagate = ["ST12000NM0007", "ST4000DM000", "ST8000NM0055"]
for model_name in top_models_seagate:
    # nan counts and percents for one model; only show the columns that
    # are partially missing (percent strictly between 0 and 1)
    nan_stats = utils.get_nan_count_percent(
        seagate_df[seagate_df["model"] == model_name]
    )
    partially_missing = (nan_stats["percent"] != 0) & (nan_stats["percent"] != 1)
    print(nan_stats[partially_missing].compute())
count percent
smart_1_normalized 442 0.000057
smart_1_raw 442 0.000057
smart_3_normalized 442 0.000057
smart_3_raw 442 0.000057
smart_4_normalized 442 0.000057
smart_4_raw 442 0.000057
smart_5_normalized 442 0.000057
smart_5_raw 442 0.000057
smart_7_normalized 442 0.000057
smart_7_raw 442 0.000057
smart_9_normalized 442 0.000057
smart_9_raw 442 0.000057
smart_10_normalized 442 0.000057
smart_10_raw 442 0.000057
smart_12_normalized 442 0.000057
smart_12_raw 442 0.000057
smart_187_normalized 442 0.000057
smart_187_raw 442 0.000057
smart_188_normalized 442 0.000057
smart_188_raw 442 0.000057
smart_190_normalized 442 0.000057
smart_190_raw 442 0.000057
smart_192_normalized 442 0.000057
smart_192_raw 442 0.000057
smart_193_normalized 442 0.000057
smart_193_raw 442 0.000057
smart_194_normalized 442 0.000057
smart_194_raw 442 0.000057
smart_195_normalized 442 0.000057
smart_195_raw 442 0.000057
smart_197_normalized 442 0.000057
smart_197_raw 442 0.000057
smart_198_normalized 442 0.000057
smart_198_raw 442 0.000057
smart_199_normalized 442 0.000057
smart_199_raw 442 0.000057
smart_200_normalized 442 0.000057
smart_200_raw 442 0.000057
smart_240_normalized 442 0.000057
smart_240_raw 442 0.000057
smart_241_normalized 442 0.000057
smart_241_raw 442 0.000057
smart_242_normalized 442 0.000057
smart_242_raw 442 0.000057
count percent
smart_1_normalized 157 0.000016
smart_1_raw 157 0.000016
smart_3_normalized 157 0.000016
smart_3_raw 157 0.000016
smart_4_normalized 157 0.000016
smart_4_raw 157 0.000016
smart_5_normalized 157 0.000016
smart_5_raw 157 0.000016
smart_7_normalized 157 0.000016
smart_7_raw 157 0.000016
smart_9_normalized 157 0.000016
smart_9_raw 157 0.000016
smart_10_normalized 157 0.000016
smart_10_raw 157 0.000016
smart_12_normalized 157 0.000016
smart_12_raw 157 0.000016
smart_183_normalized 157 0.000016
smart_183_raw 157 0.000016
smart_184_normalized 157 0.000016
smart_184_raw 157 0.000016
smart_187_normalized 157 0.000016
smart_187_raw 157 0.000016
smart_188_normalized 157 0.000016
smart_188_raw 157 0.000016
smart_189_normalized 157 0.000016
smart_189_raw 157 0.000016
smart_190_normalized 157 0.000016
smart_190_raw 157 0.000016
smart_191_normalized 157 0.000016
smart_191_raw 157 0.000016
smart_192_normalized 157 0.000016
smart_192_raw 157 0.000016
smart_193_normalized 157 0.000016
smart_193_raw 157 0.000016
smart_194_normalized 157 0.000016
smart_194_raw 157 0.000016
smart_197_normalized 157 0.000016
smart_197_raw 157 0.000016
smart_198_normalized 157 0.000016
smart_198_raw 157 0.000016
smart_199_normalized 157 0.000016
smart_199_raw 157 0.000016
smart_240_normalized 157 0.000016
smart_240_raw 157 0.000016
smart_241_normalized 157 0.000016
smart_241_raw 157 0.000016
smart_242_normalized 157 0.000016
smart_242_raw 157 0.000016
count percent
smart_1_normalized 55 0.00001
smart_1_raw 55 0.00001
smart_3_normalized 55 0.00001
smart_3_raw 55 0.00001
smart_4_normalized 55 0.00001
smart_4_raw 55 0.00001
smart_5_normalized 55 0.00001
smart_5_raw 55 0.00001
smart_7_normalized 55 0.00001
smart_7_raw 55 0.00001
smart_9_normalized 55 0.00001
smart_9_raw 55 0.00001
smart_10_normalized 55 0.00001
smart_10_raw 55 0.00001
smart_12_normalized 55 0.00001
smart_12_raw 55 0.00001
smart_184_normalized 55 0.00001
smart_184_raw 55 0.00001
smart_187_normalized 55 0.00001
smart_187_raw 55 0.00001
smart_188_normalized 55 0.00001
smart_188_raw 55 0.00001
smart_189_normalized 55 0.00001
smart_189_raw 55 0.00001
smart_190_normalized 55 0.00001
smart_190_raw 55 0.00001
smart_191_normalized 55 0.00001
smart_191_raw 55 0.00001
smart_192_normalized 55 0.00001
smart_192_raw 55 0.00001
smart_193_normalized 55 0.00001
smart_193_raw 55 0.00001
smart_194_normalized 55 0.00001
smart_194_raw 55 0.00001
smart_195_normalized 55 0.00001
smart_195_raw 55 0.00001
smart_197_normalized 55 0.00001
smart_197_raw 55 0.00001
smart_198_normalized 55 0.00001
smart_198_raw 55 0.00001
smart_199_normalized 55 0.00001
smart_199_raw 55 0.00001
smart_240_normalized 55 0.00001
smart_240_raw 55 0.00001
smart_241_normalized 55 0.00001
smart_241_raw 55 0.00001
smart_242_normalized 55 0.00001
smart_242_raw 55 0.00001
# general description of the usable columns
# NOTE: columns that are 100% NaN must be excluded, otherwise describe() raises a ValueError
usable_cols = nan_count.index[nan_count["percent"] != 1]
summary = df[usable_cols].describe().compute()
summary
[########################################] | 100% Completed | 1min 6.1s
smart_1_raw | smart_5_raw | smart_7_raw | smart_10_raw | smart_184_raw | smart_187_raw | smart_188_raw | smart_189_raw | smart_190_raw | smart_193_raw | smart_194_raw | smart_196_raw | smart_197_raw | smart_198_raw | smart_240_raw | smart_241_raw | smart_242_raw | smart_1_normalized | smart_5_normalized | smart_7_normalized | smart_10_normalized | smart_184_normalized | smart_187_normalized | smart_188_normalized | smart_189_normalized | smart_190_normalized | smart_193_normalized | smart_194_normalized | smart_196_normalized | smart_197_normalized | smart_198_normalized | smart_240_normalized | smart_241_normalized | smart_242_normalized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.669944e+07 | 3.669917e+07 | 3.669894e+07 | 3.669894e+07 | 1.998647e+07 | 2.777009e+07 | 2.776986e+07 | 1.998647e+07 | 2.777009e+07 | 3.645953e+07 | 3.669944e+07 | 8.929080e+06 | 3.669894e+07 | 3.669894e+07 | 2.827124e+07 | 2.776286e+07 | 2.776262e+07 | 3.669944e+07 | 3.669917e+07 | 3.669894e+07 | 3.669894e+07 | 1.998647e+07 | 2.777009e+07 | 2.776986e+07 | 1.998647e+07 | 2.777009e+07 | 3.645953e+07 | 3.669944e+07 | 8.929080e+06 | 3.669894e+07 | 3.669894e+07 | 2.827124e+07 | 2.776286e+07 | 2.776262e+07 |
mean | 9.245211e+07 | 4.234648e+00 | 3.094477e+09 | 1.737661e+01 | 8.348497e-03 | 6.757353e-02 | 7.902177e+07 | 9.502234e+00 | 2.840635e+01 | 1.741274e+04 | 2.838011e+01 | 4.711752e-01 | 1.117139e-01 | 1.073078e-01 | 3.099953e+12 | 3.667505e+10 | 8.648306e+10 | 9.606600e+01 | 1.019101e+02 | 9.146092e+01 | 1.009584e+02 | 9.999167e+01 | 9.993825e+01 | 1.000001e+02 | 9.867809e+01 | 7.159428e+01 | 9.341934e+01 | 7.012979e+01 | 1.078518e+02 | 1.019113e+02 | 1.009938e+02 | 9.996036e+01 | 1.000145e+02 | 1.000145e+02 |
std | 8.067240e+07 | 2.649165e+02 | 5.505890e+11 | 1.533640e+03 | 4.436532e-01 | 1.263756e+01 | 3.442275e+09 | 6.387947e+02 | 6.525753e+00 | 7.335219e+04 | 6.033230e+00 | 1.940358e+01 | 3.169116e+01 | 3.168643e+01 | 2.390528e+13 | 1.247371e+10 | 2.596269e+11 | 1.862878e+01 | 1.542695e+01 | 1.696676e+01 | 1.207357e+01 | 4.436532e-01 | 1.072197e+00 | 1.122683e-03 | 5.817690e+00 | 6.527210e+00 | 1.946688e+01 | 7.709105e+01 | 3.052469e+01 | 1.541177e+01 | 1.218965e+01 | 1.271935e+00 | 1.202787e+00 | 1.202788e+00 |
min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.300000e+01 | 0.000000e+00 | 1.200000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 4.500000e+01 | 1.000000e+00 | 2.400000e+01 | 7.500000e+01 | 2.400000e+01 | 1.000000e+00 | 9.700000e+01 | 1.000000e+00 | 4.400000e+01 | 1.000000e+00 | 1.300000e+01 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.500000e+01 | 9.900000e+01 | 1.000000e+02 |
25% | 6.765948e+06 | 0.000000e+00 | 5.476350e+07 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.600000e+01 | 3.850000e+02 | 2.700000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 8.410000e+03 | 3.379063e+10 | 6.591016e+10 | 8.300000e+01 | 1.000000e+02 | 8.900000e+01 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 6.800000e+01 | 9.800000e+01 | 2.900000e+01 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 |
50% | 8.764862e+07 | 0.000000e+00 | 4.826357e+08 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.100000e+01 | 3.443000e+03 | 3.100000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.411800e+04 | 4.205447e+10 | 8.620323e+10 | 1.000000e+02 | 1.000000e+02 | 9.400000e+01 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 7.400000e+01 | 1.000000e+02 | 3.700000e+01 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 |
75% | 1.688201e+08 | 0.000000e+00 | 1.268629e+09 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 3.600000e+01 | 1.402650e+04 | 3.500000e+01 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 2.525600e+04 | 5.044046e+10 | 1.177307e+11 | 1.150000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 7.800000e+01 | 1.000000e+02 | 1.870000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 |
max | 8.449025e+08 | 6.501600e+04 | 2.814717e+14 | 3.276800e+05 | 7.600000e+01 | 6.553500e+04 | 6.013057e+11 | 6.553500e+04 | 1.410000e+02 | 1.692766e+07 | 1.410000e+02 | 3.938000e+03 | 1.426160e+05 | 1.426160e+05 | 2.814707e+14 | 1.790039e+11 | 3.237733e+13 | 2.000000e+02 | 2.520000e+02 | 2.520000e+02 | 2.520000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 1.000000e+02 | 2.150000e+02 | 2.000000e+02 | 2.530000e+02 | 2.520000e+02 | 2.520000e+02 | 2.520000e+02 | 1.000000e+02 | 2.000000e+02 | 2.000000e+02 |
# number of rows to use for plotting - sampling is done iff the full column is
# too large to keep in memory
num_rows_to_sample = 5000000
# plot histograms for overall data
# it would probably be more insightful to do this per vendor, but start with overall
# only plot columns that are less than 25% NaN
hist_cols = nan_count.index[
    nan_count["percent"] < 0.25
]  # + ['failure', 'capacity_bytes']
len_df = len(df)
for col in hist_cols:
    # get all (or a uniform sample of) the values for this column
    if num_rows_to_sample < len_df:
        data = df[col].sample(frac=num_rows_to_sample / len_df).compute()
    else:
        data = df[col].compute()
    # report how many null values are being excluded from the plot
    print(
        data.isna().sum(),
        "out of",
        data.shape[0],
        "are NaN values. These are not shown on the graph below",
    )
    # sns.distplot is deprecated (removed in seaborn >= 0.14); histplot with a
    # density stat plus a KDE overlay is the modern equivalent
    sns.histplot(data.dropna(), kde=True, stat="density")
    plt.show()
[########################################] | 100% Completed | 35.4s
133 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 34.3s
163 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.9s
216 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.7s
197 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.5s
1216760 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.5s
1217591 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.5s
1214772 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.7s
32765 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.7s
157 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 37.0s
266 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.5s
213 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.3s
1148971 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.4s
1217502 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.3s
1216922 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.1s
150 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.1s
188 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.2s
210 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.8s
228 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.8s
1218351 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.3s
1216144 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.3s
1217148 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.4s
32848 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.4s
142 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.3s
220 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.3s
217 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.5s
1148709 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 34.0s
1216598 out of 4999997 are NaN values. These are not shown on the graph below
[########################################] | 100% Completed | 33.1s
1217544 out of 4999997 are NaN values. These are not shown on the graph below
# correlation with failure
# corr_cols = ['failure', 'capacity_bytes'] + crit_cols_raw + crit_cols_normalized
corr_cols = ["failure", "capacity_bytes"] + list(
    nan_count.index[nan_count["percent"] != 1]
)
# fix the seed so the sampled serial numbers (and therefore the correlation
# matrix, and the later cells that reuse downsampled_sers) are reproducible.
# random_state=42 matches the sampling used in the time-series plots below.
downsampled_sers = (
    failed_sers["serial_number"].sample(n=500, random_state=42).values.tolist()
    + working_sers["serial_number"].sample(n=75000, random_state=42).values.tolist()
)
corr_mat = df[df["serial_number"].isin(downsampled_sers)][corr_cols].corr().compute()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr_mat,
    ax=ax,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cmap="RdBu",
    cbar_kws={"shrink": 0.5},
)
[########################################] | 100% Completed | 5min 40.7s
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b84244fd0>
# only attributes whose NaN fraction is below NAN_PERCENT_THRESHOLD will be
# used when computing the correlation matrix
NAN_PERCENT_THRESHOLD = 0.5
# correlation matrix for seagate drives
keep_mask = seagate_nan_ct["percent"] < NAN_PERCENT_THRESHOLD
seagate_corr_cols = ["failure", "capacity_bytes"] + seagate_nan_ct.index[
    keep_mask
].tolist()
seagate_subset = seagate_df[seagate_df["serial_number"].isin(downsampled_sers)]
seagate_corr = seagate_subset[seagate_corr_cols].corr().compute()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    seagate_corr,
    ax=ax,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cmap="RdBu",
    cbar_kws={"shrink": 0.5},
)
[########################################] | 100% Completed | 4min 7.8s
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b9e2ae438>
# correlation matrix for wdc drives
keep_wdc = wdc_nan_ct["percent"] < NAN_PERCENT_THRESHOLD
wdc_corr_cols = ["failure", "capacity_bytes"] + wdc_nan_ct.index[keep_wdc].tolist()
wdc_corr = wdc_df[wdc_corr_cols].corr().compute()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    wdc_corr,
    ax=ax,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cmap="RdBu",
    cbar_kws={"shrink": 0.5},
)
[########################################] | 100% Completed | 57.5s
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b62469eb8>
# correlation matrix for hgst drives
keep_hgst = hgst_nan_ct["percent"] < NAN_PERCENT_THRESHOLD
hgst_corr_cols = ["failure", "capacity_bytes"] + hgst_nan_ct.index[keep_hgst].tolist()
hgst_corr = hgst_df[hgst_corr_cols].corr().compute()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    hgst_corr,
    ax=ax,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cmap="RdBu",
    cbar_kws={"shrink": 0.5},
)
[########################################] | 100% Completed | 1min 22.2s
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b84216630>
# correlation matrix for hitachi drives
keep_hit = hitachi_nan_ct["percent"] < NAN_PERCENT_THRESHOLD
hitachi_corr_cols = ["failure", "capacity_bytes"] + hitachi_nan_ct.index[
    keep_hit
].tolist()
hitachi_corr = hitachi_df[hitachi_corr_cols].corr().compute()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    hitachi_corr,
    ax=ax,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cmap="RdBu",
    cbar_kws={"shrink": 0.5},
)
[########################################] | 100% Completed | 56.9s
<matplotlib.axes._subplots.AxesSubplot at 0x7f0b61fd43c8>
# correlation matrix for toshiba drives
keep_tosh = toshiba_nan_ct["percent"] < NAN_PERCENT_THRESHOLD
toshiba_corr_cols = ["failure", "capacity_bytes"] + toshiba_nan_ct.index[
    keep_tosh
].tolist()
toshiba_corr = toshiba_df[toshiba_corr_cols].corr().compute()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    toshiba_corr,
    ax=ax,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.5,
    cmap="RdBu",
    cbar_kws={"shrink": 0.5},
)
[########################################] | 100% Completed | 1min 0.3s
<matplotlib.axes._subplots.AxesSubplot at 0x7f0bda21e438>
# Plot each critical SMART stat over time for a sample of FAILED drives,
# one figure per stat, all sampled drives overlaid on the same axes.
# TODO: Might be better to call compute once to get the combined data of all serials
# in the subset, as opposed to calling it for each serial number in the for loop
# NOTE: running this cell will take a VERY long time (~1hr on intel i7 w/ 16GB ram)
# adjust NUM_DRIVES_TO_SAMPLE to select a small subset to use for plotting
NUM_DRIVES_TO_SAMPLE = 50
# columns to plot: the critical raw + normalized SMART stats
# cols_to_plot = ['smart_5_raw', 'smart_7_raw']
cols_to_plot = crit_cols_raw + crit_cols_normalized
cols_to_plot.remove("smart_201_raw")  # too many nans
cols_to_plot.remove("smart_201_normalized")  # too many nans
# one figure (and one axes) per smart stat; figs/axes are reused by the
# working-drives cell below, which clears and re-labels them
figs = [plt.figure(i, figsize=(10, 10)) for i in range(len(cols_to_plot))]
axes = [f.add_subplot(111) for f in figs]
for colname, ax in zip(cols_to_plot, axes):
    ax.set_title(
        "{} vs Time for {} Failed Drives".format(colname, NUM_DRIVES_TO_SAMPLE)
    )
    ax.set_xlabel("Number of Days")
    ax.set_ylabel(colname)
# keep track of what hard drives were used to generate the data.
# NOTE: dtype "<S16" means only the first 16 chars of each serial are saved
# (serials longer than 16 bytes are silently truncated)
failed_ser_subset = np.empty(shape=(NUM_DRIVES_TO_SAMPLE), dtype="<S16")
# make the figures; random_state fixes which drives get sampled
for i, ser in enumerate(
    failed_sers["serial_number"].sample(NUM_DRIVES_TO_SAMPLE, random_state=42)
):
    # log serial numbers which are being used
    failed_ser_subset[i] = ser
    print("{} / {}. Drive serial number {}".format(i + 1, NUM_DRIVES_TO_SAMPLE, ser))
    # get the data to make the figures (one dask compute per drive — slow)
    drive_data = df[df["serial_number"] == ser][cols_to_plot].compute()
    # dummy x axis data: day index 0..len-1 for this drive
    # (the comprehension's `i` is scoped to the comprehension, so the outer
    # enumerate counter is not clobbered)
    xvals = [i for i in range(drive_data.shape[0])]
    # overlay this drive's series onto every stat's axes
    for ax, c in zip(axes, cols_to_plot):
        ax.plot(xvals, drive_data[c])
# save the figures
for f, c in zip(figs, cols_to_plot):
    f.savefig("../../../reports/figures/{}_failed.png".format(c))
# save the serial numbers used in the figures
np.save("failed_graphs_serials", failed_ser_subset)
1 / 50. Drive serial number 175PP3I7T
[########################################] | 100% Completed | 1min 11.8s
2 / 50. Drive serial number S2ZYJ9CFC01460
[########################################] | 100% Completed | 1min 12.7s
3 / 50. Drive serial number Z4D0D1V1
[########################################] | 100% Completed | 1min 12.4s
4 / 50. Drive serial number Z304JMAM
[########################################] | 100% Completed | 1min 11.4s
5 / 50. Drive serial number ZJV03CQH
[########################################] | 100% Completed | 1min 11.9s
6 / 50. Drive serial number 88Q0A01NF97G
[########################################] | 100% Completed | 1min 11.9s
7 / 50. Drive serial number ZA181AA1
[########################################] | 100% Completed | 1min 11.7s
8 / 50. Drive serial number Z3025KZQ
[########################################] | 100% Completed | 1min 11.9s
9 / 50. Drive serial number S301PQF4
[########################################] | 100% Completed | 1min 11.4s
10 / 50. Drive serial number Z304K4G7
[########################################] | 100% Completed | 1min 13.0s
11 / 50. Drive serial number Z4D0EVSH
[########################################] | 100% Completed | 1min 19.7s
12 / 50. Drive serial number ZA171RLD
[########################################] | 100% Completed | 1min 28.0s
13 / 50. Drive serial number ZCH07J8T
[########################################] | 100% Completed | 1min 12.3s
14 / 50. Drive serial number Z3025L0M
[########################################] | 100% Completed | 1min 12.6s
15 / 50. Drive serial number Z302A0K4
[########################################] | 100% Completed | 1min 11.8s
16 / 50. Drive serial number VKG8ZKRX
[########################################] | 100% Completed | 1min 12.1s
17 / 50. Drive serial number Z300JBBN
[########################################] | 100% Completed | 1min 12.9s
18 / 50. Drive serial number Z303XYW6
[########################################] | 100% Completed | 1min 11.9s
19 / 50. Drive serial number S30116JR
[########################################] | 100% Completed | 1min 12.1s
20 / 50. Drive serial number ZCH08P7M
[########################################] | 100% Completed | 1min 11.6s
21 / 50. Drive serial number WD-WX31A356PK9Y
[########################################] | 100% Completed | 1min 11.7s
22 / 50. Drive serial number ZCH0CRE6
[########################################] | 100% Completed | 1min 11.9s
23 / 50. Drive serial number PL2331LAG9B5WJ
[########################################] | 100% Completed | 1min 11.6s
24 / 50. Drive serial number WD-WX21D947N1C3
[########################################] | 100% Completed | 1min 11.6s
25 / 50. Drive serial number PL1331LAHBZ49H
[########################################] | 100% Completed | 1min 11.8s
26 / 50. Drive serial number ZA10NENE
[########################################] | 100% Completed | 1min 11.9s
27 / 50. Drive serial number Z305Q0NT
[########################################] | 100% Completed | 1min 12.2s
28 / 50. Drive serial number Z304MZ1A
[########################################] | 100% Completed | 1min 11.9s
29 / 50. Drive serial number ZCH0CXJ4
[########################################] | 100% Completed | 1min 11.9s
30 / 50. Drive serial number S2ZYJ9FFB18436
[########################################] | 100% Completed | 1min 11.9s
31 / 50. Drive serial number S300ZRQS
[########################################] | 100% Completed | 1min 11.8s
32 / 50. Drive serial number ZCH07VXJ
[########################################] | 100% Completed | 1min 12.0s
33 / 50. Drive serial number ZCH0AZB0
[########################################] | 100% Completed | 1min 11.8s
34 / 50. Drive serial number Z302T6Y1
[########################################] | 100% Completed | 1min 12.0s
35 / 50. Drive serial number ZCH07W07
[########################################] | 100% Completed | 1min 12.1s
36 / 50. Drive serial number PL1331LAHD4DYH
[########################################] | 100% Completed | 1min 12.0s
37 / 50. Drive serial number ZJV03NB1
[########################################] | 100% Completed | 1min 12.0s
38 / 50. Drive serial number ZA1818J7
[########################################] | 100% Completed | 1min 11.6s
39 / 50. Drive serial number Z3029GF5
[########################################] | 100% Completed | 1min 12.1s
40 / 50. Drive serial number Z3041C62
[########################################] | 100% Completed | 1min 11.8s
41 / 50. Drive serial number ZCH07NM4
[########################################] | 100% Completed | 1min 12.2s
42 / 50. Drive serial number 38M0A00XF97G
[########################################] | 100% Completed | 1min 12.0s
43 / 50. Drive serial number Z304JMCM
[########################################] | 100% Completed | 1min 11.9s
44 / 50. Drive serial number ZCH0715W
[########################################] | 100% Completed | 1min 11.9s
45 / 50. Drive serial number ZCH07B83
[########################################] | 100% Completed | 1min 11.7s
46 / 50. Drive serial number Z304JCWC
[########################################] | 100% Completed | 1min 11.6s
47 / 50. Drive serial number ZCH07ZFX
[########################################] | 100% Completed | 1min 12.4s
48 / 50. Drive serial number ZCH0D3ZT
[########################################] | 100% Completed | 1min 11.9s
49 / 50. Drive serial number S300Z3XP
[########################################] | 100% Completed | 1min 11.9s
50 / 50. Drive serial number S300WDLE
[########################################] | 100% Completed | 1min 11.7s
# Same plots as the failed-drives cell above, but for a sample of WORKING
# drives. Reuses `figs`, `axes`, and `cols_to_plot` from that cell, so it
# must run after it.
NUM_DRIVES_TO_SAMPLE = 50
# one figure per smart stat; clear the failed-drive data and re-label
for colname, ax in zip(cols_to_plot, axes):
    ax.cla()  # clear data from previous plots
    ax.set_title(
        "{} vs Time for {} Working Drives".format(colname, NUM_DRIVES_TO_SAMPLE)
    )
    ax.set_xlabel("Number of Days")
    ax.set_ylabel(colname)
# keep track of what hard drives were used to generate the data.
# NOTE: dtype "<S16" means only the first 16 chars of each serial are saved
working_ser_subset = np.empty(shape=(NUM_DRIVES_TO_SAMPLE), dtype="<S16")
# make the figures; random_state fixes which drives get sampled
for i, ser in enumerate(
    working_sers["serial_number"].sample(NUM_DRIVES_TO_SAMPLE, random_state=42)
):
    # log serial numbers which are being used
    working_ser_subset[i] = ser
    print("{} / {}. Drive serial number {}".format(i + 1, NUM_DRIVES_TO_SAMPLE, ser))
    # get the data to make the figures (one dask compute per drive — slow)
    drive_data = df[df["serial_number"] == ser][cols_to_plot].compute()
    # dummy x axis data: day index 0..len-1 for this drive
    xvals = [i for i in range(drive_data.shape[0])]
    # overlay this drive's series onto every stat's axes
    for ax, c in zip(axes, cols_to_plot):
        ax.plot(xvals, drive_data[c])
# save the figures
for f, c in zip(figs, cols_to_plot):
    f.savefig("../../../reports/figures/{}_working.png".format(c))
# save the serial numbers used in the figures
np.save("working_graphs_serials", working_ser_subset)
plt.close("all")
Backblaze's analysis:
Backblaze also performed some analysis on the SMART stats 5, 187, 188, 197, 198. They concluded that just having one of the stats in an abnormal state may not necessarily mean anything, but all of them being abnormal at the same time is a red flag. Details and some nice diagrams can be found here: https://www.backblaze.com/blog/what-smart-stats-indicate-hard-drive-failures/
Visualize Embeddings
# build the groupby object: one group per serial number.
# keep only the columns which have less than 50 percent NaN values
low_nan_cols = nan_count.index[nan_count["percent"] < 0.5].tolist()
grouped_data_cols = ["serial_number", "model", "capacity_bytes"] + low_nan_cols
groups = df[grouped_data_cols].groupby("serial_number")
# summarize each hard drive's time series with simple per-column statistics
group_means = groups.mean().compute().add_prefix("mean_")
group_stds = groups.std().compute().add_prefix("std_")
# group_days = groups.size().compute().to_frame('days')
# group_days.index.name = None  # to match the other agg results, so that it can be concatenated easily
[########################################] | 100% Completed | 2min 7.3s
[########################################] | 100% Completed | 2min 30.3s
# combine the per-drive mean/std aggregates into a single dataframe
group_stats = pd.concat([group_means, group_stds], axis=1)
# move serial number out of the index into a regular column; this makes
# later calculations easier
group_stats = group_stats.reset_index()
# attach the failure label: a drive is failed iff its serial number
# appears among the failed serials
failed_set = failed_sers["serial_number"]
group_stats["failure"] = group_stats["serial_number"].isin(failed_set)
group_stats.head()
serial_number | mean_capacity_bytes | mean_smart_1_raw | mean_smart_5_raw | mean_smart_7_raw | mean_smart_10_raw | mean_smart_184_raw | mean_smart_187_raw | mean_smart_188_raw | mean_smart_189_raw | mean_smart_190_raw | mean_smart_193_raw | mean_smart_194_raw | mean_smart_197_raw | mean_smart_198_raw | mean_smart_240_raw | mean_smart_241_raw | mean_smart_242_raw | mean_smart_1_normalized | mean_smart_5_normalized | mean_smart_7_normalized | mean_smart_10_normalized | mean_smart_184_normalized | mean_smart_187_normalized | mean_smart_188_normalized | mean_smart_189_normalized | mean_smart_190_normalized | mean_smart_193_normalized | mean_smart_194_normalized | mean_smart_197_normalized | mean_smart_198_normalized | mean_smart_240_normalized | mean_smart_241_normalized | mean_smart_242_normalized | std_capacity_bytes | std_smart_1_raw | std_smart_5_raw | std_smart_7_raw | std_smart_10_raw | std_smart_184_raw | std_smart_187_raw | std_smart_188_raw | std_smart_189_raw | std_smart_190_raw | std_smart_193_raw | std_smart_194_raw | std_smart_197_raw | std_smart_198_raw | std_smart_240_raw | std_smart_241_raw | std_smart_242_raw | std_smart_1_normalized | std_smart_5_normalized | std_smart_7_normalized | std_smart_10_normalized | std_smart_184_normalized | std_smart_187_normalized | std_smart_188_normalized | std_smart_189_normalized | std_smart_190_normalized | std_smart_193_normalized | std_smart_194_normalized | std_smart_197_normalized | std_smart_198_normalized | std_smart_240_normalized | std_smart_241_normalized | std_smart_242_normalized | failure | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 175PP3HDT | 5.001079e+11 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 933.293151 | 33.008219 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 44.974722 | 1.216841 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | NaN | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | False |
1 | 175PP3I4T | 5.001079e+11 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 2175.139726 | 39.224658 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 6.571260 | 1.718963 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | NaN | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | False |
2 | 175PP3I5T | 5.001079e+11 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 2024.745205 | 36.863014 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 0.491750 | 2.025357 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | NaN | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | False |
3 | 175PP3I6T | 5.001079e+11 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 1861.000000 | 34.747945 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 0.000000 | 1.314325 | 0.000000 | 0.0 | 0.0 | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | NaN | NaN | NaN | 0.010974 | 0.010974 | 0.010974 | 0.010974 | 0.010974 | NaN | NaN | False |
4 | 175PP3I7T | 5.001079e+11 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 1020.292683 | 31.283537 | 2.609756 | 0.0 | 0.0 | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | NaN | NaN | NaN | 100.0 | 100.0 | 100.0 | 100.0 | 100.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | 46.040017 | 0.999440 | 5.383759 | 0.0 | 0.0 | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | NaN | NaN | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | NaN | True |
# FIXME: find a smarter way to deal with nans. drop columns not rows
# for now, just drop every row containing any NaN. this drops ~25k
# observations and leaves us with ~83k
clean_group_stats = group_stats.dropna()
# sanity check: make sure enough failed-drive data survived the drop
num_failed_retained = clean_group_stats["failure"].sum()
print(num_failed_retained, "failed drives data retained")
clean_group_stats[clean_group_stats["failure"]].head()
835 failed drives data retained
serial_number | mean_capacity_bytes | mean_smart_1_raw | mean_smart_5_raw | mean_smart_7_raw | mean_smart_10_raw | mean_smart_184_raw | mean_smart_187_raw | mean_smart_188_raw | mean_smart_189_raw | mean_smart_190_raw | mean_smart_193_raw | mean_smart_194_raw | mean_smart_197_raw | mean_smart_198_raw | mean_smart_240_raw | mean_smart_241_raw | mean_smart_242_raw | mean_smart_1_normalized | mean_smart_5_normalized | mean_smart_7_normalized | mean_smart_10_normalized | mean_smart_184_normalized | mean_smart_187_normalized | mean_smart_188_normalized | mean_smart_189_normalized | mean_smart_190_normalized | mean_smart_193_normalized | mean_smart_194_normalized | mean_smart_197_normalized | mean_smart_198_normalized | mean_smart_240_normalized | mean_smart_241_normalized | mean_smart_242_normalized | std_capacity_bytes | std_smart_1_raw | std_smart_5_raw | std_smart_7_raw | std_smart_10_raw | std_smart_184_raw | std_smart_187_raw | std_smart_188_raw | std_smart_189_raw | std_smart_190_raw | std_smart_193_raw | std_smart_194_raw | std_smart_197_raw | std_smart_198_raw | std_smart_240_raw | std_smart_241_raw | std_smart_242_raw | std_smart_1_normalized | std_smart_5_normalized | std_smart_7_normalized | std_smart_10_normalized | std_smart_184_normalized | std_smart_187_normalized | std_smart_188_normalized | std_smart_189_normalized | std_smart_190_normalized | std_smart_193_normalized | std_smart_194_normalized | std_smart_197_normalized | std_smart_198_normalized | std_smart_240_normalized | std_smart_241_normalized | std_smart_242_normalized | failure | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
25245 | S300V3AD | 4.000787e+12 | 1.477581e+08 | 8.00000 | 1.706016e+07 | 0.0 | 0.0 | 2.000000 | 0.0 | 0.0 | 24.000000 | 1373.500000 | 24.000000 | 8.000000 | 8.000000 | 733.500000 | 7.855681e+09 | 9.421427e+08 | 115.000000 | 100.000000 | 72.000000 | 100.0 | 100.0 | 98.000000 | 100.0 | 100.0 | 76.000000 | 100.0 | 24.000000 | 100.000000 | 100.000000 | 100.0 | 100.0 | 100.0 | 0.000000e+00 | 1.013742e+08 | 11.313708 | 3.925327e+05 | 0.0 | 0.0 | 2.828427 | 0.0 | 0.0 | 0.000000 | 0.707107 | 0.000000 | 11.313708 | 11.313708 | 16.263456 | 2.965821e+06 | 2.004330e+07 | 5.656854 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.828427 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | True |
25332 | S300VL6S | 4.000787e+12 | 1.224876e+08 | 15.47541 | 2.820471e+08 | 0.0 | 0.0 | 2.885246 | 0.0 | 0.0 | 29.836066 | 46223.196721 | 29.836066 | 4.983607 | 4.983607 | 21735.639344 | 4.831991e+10 | 6.951972e+10 | 115.081967 | 99.983607 | 77.622951 | 100.0 | 100.0 | 98.278689 | 100.0 | 100.0 | 70.163934 | 77.0 | 29.836066 | 99.983607 | 99.983607 | 100.0 | 100.0 | 100.0 | 1.555392e+09 | 7.510494e+07 | 119.829825 | 4.248969e+08 | 0.0 | 0.0 | 21.767023 | 0.0 | 0.0 | 2.130574 | 8.194360 | 2.130574 | 38.923211 | 38.923211 | 427.247692 | 2.731702e+08 | 2.939588e+09 | 4.148474 | 0.129099 | 8.157148 | 0.000000 | 0.000000 | 12.686133 | 0.000000 | 0.000000 | 2.130574 | 0.016529 | 2.130574 | 0.129099 | 0.129099 | 0.000000 | 0.000000 | 0.000000 | True |
25345 | S300VL9M | 4.000787e+12 | 1.189316e+08 | 0.00000 | 2.603642e+08 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 23.928177 | 37725.588398 | 23.928177 | 0.000000 | 0.000000 | 25721.668508 | 5.057224e+10 | 8.722665e+10 | 114.928177 | 100.000000 | 82.651934 | 100.0 | 100.0 | 100.000000 | 100.0 | 100.0 | 76.071823 | 82.0 | 23.928177 | 100.000000 | 100.000000 | 100.0 | 100.0 | 100.0 | 6.516147e+08 | 7.218180e+07 | 0.000000 | 1.447159e+08 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 1.106695 | 35.166457 | 1.106695 | 0.000000 | 0.000000 | 2517.029672 | 1.750767e+09 | 1.401718e+10 | 4.270681 | 0.022130 | 3.562712 | 0.022130 | 0.022130 | 0.022130 | 0.022130 | 0.022130 | 1.106695 | 0.011065 | 1.106695 | 0.022130 | 0.022130 | 0.022130 | 0.022130 | 0.022130 | True |
25442 | S300WCLM | 4.000787e+12 | 1.304328e+08 | 0.00000 | 2.310999e+08 | 0.0 | 0.0 | 3.052402 | 0.0 | 0.0 | 20.139738 | 34587.135371 | 20.139738 | 0.000000 | 0.000000 | 24127.454148 | 4.989504e+10 | 7.959794e+10 | 115.886463 | 100.000000 | 81.864629 | 100.0 | 100.0 | 96.947598 | 100.0 | 100.0 | 79.860262 | 83.0 | 20.139738 | 100.000000 | 100.000000 | 100.0 | 100.0 | 100.0 | 1.897599e+09 | 6.761556e+07 | 0.000000 | 2.152696e+08 | 0.0 | 0.0 | 0.223324 | 0.0 | 0.0 | 1.095302 | 44.154552 | 1.095302 | 0.000000 | 0.000000 | 1589.720013 | 9.884072e+08 | 9.903793e+09 | 3.253154 | 0.017506 | 3.609668 | 0.017506 | 0.017506 | 0.222680 | 0.017506 | 0.017506 | 1.095302 | 0.004376 | 1.095302 | 0.017506 | 0.017506 | 0.017506 | 0.017506 | 0.017506 | True |
25547 | S300WDLE | 4.000787e+12 | 1.234360e+08 | 0.00000 | 5.976873e+08 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 18.626812 | 136894.463768 | 18.626812 | 0.000000 | 0.000000 | 24386.358696 | 3.846574e+10 | 8.531973e+10 | 115.311594 | 100.000000 | 84.608696 | 100.0 | 100.0 | 100.000000 | 100.0 | 100.0 | 81.373188 | 32.0 | 18.626812 | 100.000000 | 100.000000 | 100.0 | 100.0 | 100.0 | 1.592292e+09 | 7.314549e+07 | 0.000000 | 3.985751e+08 | 0.0 | 0.0 | 0.000000 | 0.0 | 0.0 | 0.965901 | 56.613898 | 0.965901 | 0.000000 | 0.000000 | 1915.556400 | 2.904367e+09 | 1.170353e+10 | 3.920676 | 0.000000 | 6.594140 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.965846 | 0.000000 | 0.965901 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | True |
NOTE: The percentage of failed hard drives retained is 312/393 = 0.79, and the percentage of healthy hard drives retained is 0.77. So, in terms of the NaN-containing rows dropped in the previous cell, the loss is spread roughly evenly across failed and healthy drives (the proportion of NaN rows is about the same for both groups).
# Standardize the SMART features, then project them onto the top 3 principal components.
scaler = StandardScaler()
feature_matrix = scaler.fit_transform(
    clean_group_stats.drop(["serial_number", "failure"], axis=1)
)
pca_coords = PCA(n_components=3, random_state=42, whiten=True).fit_transform(feature_matrix)
# Visualize the PCA embedding in 3D, colored by the failure label
# (ListedColormap maps the two label values to blue/red — presumably healthy=blue, failed=red).
colors = ["blue", "red"]
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    pca_coords[:, 0],
    pca_coords[:, 1],
    pca_coords[:, 2],
    c=clean_group_stats["failure"],
    cmap=matplotlib.colors.ListedColormap(colors),
)
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f0b9841a208>
# Compute a 3D UMAP embedding of the standardized features, dispatching joblib work
# to the dask cluster. NOTE(review): the Client is never closed here — acceptable for
# a notebook, but worth tidying if this moves into a pipeline.
client = Client()
with parallel_backend("dask"):
    standardized = scaler.fit_transform(
        clean_group_stats.drop(["serial_number", "failure"], axis=1)
    )
    umap_coords = UMAP(n_components=3, random_state=42).fit_transform(standardized)
# Scatter the UMAP embedding in 3D, colored by the failure label (two-color map: blue/red).
colors = ["blue", "red"]
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    umap_coords[:, 0],
    umap_coords[:, 1],
    umap_coords[:, 2],
    c=clean_group_stats["failure"],
    cmap=matplotlib.colors.ListedColormap(colors),
)
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f0b58b60a90>
# tsne embeddings may be more meaningful than pca, but it is extremely slow
# tsne = TSNE(n_components=3, random_state=42)\
# .fit_transform(scaler.fit_transform(clean_group_stats.drop(['serial_number', 'failure'], axis=1)))
SMART 9 Behavior
SMART 9 represents the number of power-on hours. This can be indicative of drive lifespan and may therefore be usable for the regression problem.
# columns to extract from the dataset for analyzing SMART 9 (power-on hours) behavior
analysis_cols = ["date", "serial_number", "smart_9_raw", "smart_9_normalized"]
# columns for which to make histograms (raw and vendor-normalized variants of SMART 9)
hist_cols = ["smart_9_raw", "smart_9_normalized"]
NOTE: The values above contain multiple entries from the same drive -- so they may not be a very good representation of how SMART 9 generally looks for failed drives. Instead, we take the max entry per serial number (assuming that it corresponds to the latest reading) and plot a histogram of that.
# since failure=1 is marked on the last day that a drive worked, the entries on that day would be the most recent ones
# NOTE: this is a property of the backblaze dataset, and may not generalize
failed_last_smart9_df = df[df["failure"] == 1][analysis_cols].compute()
# do the same for working drives: keep only serial numbers that never appear in the
# failed set (failed_sers is computed earlier in the notebook), then group per drive
working_grouped = df[~df["serial_number"].isin(failed_sers["serial_number"])][
    hist_cols + ["serial_number"]
].groupby("serial_number")
# max per drive as a proxy for its latest reading
# (assumes SMART 9 only increases over a drive's life — TODO confirm)
working_last_smart9_df = working_grouped.max()
# inspect the lazy result's size before deciding to pull it into memory
dd.compute(working_last_smart9_df.shape)
((122948, 2),)
# size is not too bad (~123k rows x 2 cols per the previous cell) -- safe to
# materialize the dask frame into an in-memory pandas DataFrame
working_last_smart9_df = working_last_smart9_df.compute()
# Overlay the SMART 9 distributions for working vs. failed drives, one figure per column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot
# is the successor) — fine as long as the pinned seaborn version still provides it.
for column in hist_cols:
    # fresh figure/axes for this column
    fig, ax = plt.subplots()
    # draw one distribution per population onto the shared axes, working first
    for label, frame in (
        ("working", working_last_smart9_df),
        ("failed", failed_last_smart9_df),
    ):
        series = frame[column]
        sns.distplot(series[~series.isna()], ax=ax, label=label)
    plt.legend()
    plt.show()