Step 2: Clean Up Data
In the previous step, the data was explored and it was found that the patterns in the data are quite different for each vendor. Therefore, in this notebook the data is cleaned up and analyzed for a single vendor. (Note: although Seagate forms the majority of the backblaze dataset (~7M out of ~9M points), the code below actually filters for HGST drives — models starting with "HG" — yielding ~1.9M points.)
The clean up in this notebook mainly consists of removing/replacing NaN values intelligently.
import gc
import os
import numpy as np
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from src import utils
# register progress bar for compute calls in dask so we have an estimate of how long task will take
# NOTE: .register() installs the bar globally -- it will show for every subsequent compute call
pbar = ProgressBar()
pbar.register()
# inferred int32 types cause a type mismatch (int vs float) error when dask sees a null value
# null values cannot be interpreted as ints
# identifier columns are plain strings; every numeric column is float32 so NaN is representable
custom_dtypes = {
    "date": "object",
    "serial_number": "object",
    "model": "object",
    "capacity_bytes": "float32",
    "failure": "float32",
}
# every SMART attribute present in the backblaze schema comes as a
# (normalized, raw) pair of float32 columns
_SMART_IDS = (
    [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 22, 23, 24]
    + [168, 170, 173, 174, 177, 179, 181, 182, 183, 184, 187, 188, 189]
    + list(range(190, 202))
    + [218, 220, 222, 223, 224, 225, 226, 231, 232, 233, 235]
    + [240, 241, 242, 250, 251, 252, 254, 255]
)
custom_dtypes.update(
    ("smart_{}_{}".format(sid, col_kind), "float32")
    for sid in _SMART_IDS
    for col_kind in ("normalized", "raw")
)
# read all the data into one dataframe
# NOTE: dd.read_csv is lazy -- nothing is actually read until a .compute() call.
# the explicit dtypes (custom_dtypes above) stop dask's int inference from
# choking on null values
DATA_DIR = "/home/kachauha/Downloads/data_Q3_2018"
df = dd.read_csv(os.path.join(DATA_DIR, "*.csv"), dtype=custom_dtypes)
# get the hgst data (model names starting with "HG")
# NOTE(review): the intro text mentions Seagate, but this filter selects HGST -- confirm intended vendor
hgst_df = df[df["model"].str.startswith("HG")]
# get the serial numbers for all the failed hard drives
# .compute() materializes the result as an in-memory pandas Series
failed_serials = hgst_df[hgst_df["failure"] == 1]["serial_number"].compute()
[########################################] | 100% Completed | 36.4s
NOTE: Can also add date of failure, and its model to failed_serials (and working_serials) if it will be helpful for preprocessing in the future.
# materialize the (rows, cols) shape of the HGST subset (triggers a full pass over the csvs)
dd.compute(hgst_df.shape)
[########################################] | 100% Completed | 37.4s
((1911101, 109),)
# # get the serial numbers for all the failed hard drives, date of failure, and its model
# # multiple entries will exist per serial number since it will be a time series. get only the last one
# working_serials = hgst_df[~hgst_df['serial_number'].isin(failed_serials)]\
# ['serial_number']\
# .drop_duplicates(keep='last')\
# .compute()
[########################################] | 100% Completed | 51.3s
2203 Z300XGTR
4389 Z300XA99
4900 Z3015BTR
6971 Z300WWNK
8459 Z30149AZ
Name: serial_number, dtype: object
NOTE: We can probably get away with not calculating working serials and instead using not failed_serials for indexing. This will be much faster and more efficient since we are searching in ~300 drives as opposed to ~80k drives.
Ensure Unique Indices
Having a unique index per entry (e.g. 2018-10-01 Z305B2QN) will make data processing less prone to errors. It will also expand the variety of merging and groupby operations that can be done on the data while still keeping it a dask dataframe (vs a pandas dataframe). This is because some operations in dask do not support duplicate indices.
# # are there any duplicate indices? [SPOILER ALERT] yes
# tmp = seagate_df.index.compute()
# tmp.duplicated().any() # True
# # can reset indices do the job?
# # no. see dask docs. indices are duplicated across dask partitions
# seagate_df = seagate_df.reset_index(drop=True)
# tmp = seagate_df.index.compute()
# tmp.duplicated().any() # True
# proof of duplicate indices
# .loc[0, :] selects EVERY row whose index label is 0 (labels repeat across partitions)
hgst_df.loc[0, :].compute().head()
[########################################] | 100% Completed | 45.0s
date | serial_number | model | capacity_bytes | failure | smart_1_normalized | smart_1_raw | smart_2_normalized | smart_2_raw | smart_3_normalized | ... | smart_250_normalized | smart_250_raw | smart_251_normalized | smart_251_raw | smart_252_normalized | smart_252_raw | smart_254_normalized | smart_254_raw | smart_255_normalized | smart_255_raw |
---|
0 rows × 129 columns
RESULT: There does not seem to be a way to remove duplication. It will limit ops in some ways, but we'll have to get around them.
Manual Feature Selection
Previous work done in this area strongly suggests not all stats are equally important. Backblaze suggests five, wikipedia suggests nine, and IBM research suggests a bunch. Make a superset of all these stats and work with those for now. Using too many features can be a rabbit hole we don't wanna go down in.
# work with critical columns for now
# superset of the SMART attributes flagged as important by backblaze,
# wikipedia, and IBM research (see markdown cell above)
CRITICAL_STATS = [
    1, 5, 7, 10,
    184, 187, 188, 189, 190,
    193, 194, 196, 197, 198,
    201,  # NOTE: 201 is all nans
    240, 241, 242,
]
crit_cols_raw = ["smart_%d_raw" % stat for stat in CRITICAL_STATS]
crit_cols_normalized = ["smart_%d_normalized" % stat for stat in CRITICAL_STATS]
Find Patterns in NaNs and Clean Accordingly
# these are the columns analyzed and nans are accounted for these (see https://trello.com/c/tjFl6RHf)
done_stats = [
    2, 8, 11, 196, 223, 225, 250, 251, 252,
    16, 17, 168, 170, 173, 174, 177, 218,
    231, 232, 233, 235, 254, 183, 200, 195, 191,
]
# all the raw columns first, then all the normalized ones
done_cols = ["smart_{}_raw".format(stat) for stat in done_stats]
done_cols += ["smart_{}_normalized".format(stat) for stat in done_stats]
# the columns to keep for analysis: identifier/label columns plus the
# critical smart stats (raw and normalized) selected above
# NOTE(review): "model" is not kept here -- confirm that is intended
keep_cols = ["date", "serial_number", "capacity_bytes", "failure"]
keep_cols.extend(crit_cols_raw)
keep_cols.extend(crit_cols_normalized)
# sentinel value used wherever nans need to be replaced by an out-of-range number
DUMMY_VALUE = -100
NOTE: It is observed (later in the following cells) that for HGST, most values are null for critical columns that we have selected. Furthermore, there are only a few columns which contain non null values. Therefore, we will work with all columns for the task of nan handling.
# df on which to perform cleaning tasks
# NOTE: this is a new reference, not a deep copy -- the dask operations below
# build new task graphs rather than mutating hgst_df in place
clean_df = hgst_df
# get ideal number of partitions, targeting ~100 MB per partition
PARTITION_SIZE_BYTES = 100 * 10 ** 6
# memory_usage(deep=True) accounts for the object (string) columns as well
DF_SIZE_BYTES = clean_df.memory_usage(deep=True).sum().compute()
NUM_PARTITIONS = int(np.ceil(DF_SIZE_BYTES / PARTITION_SIZE_BYTES))
# repartition and save cleaned version of data
clean_df = clean_df.repartition(npartitions=NUM_PARTITIONS)
[########################################] | 100% Completed | 39.9s
# meta data for later use
# dd.compute returns a tuple of results, hence the [0]
initial_shape = dd.compute(clean_df.shape)[0]
num_total_datapts = initial_shape[0]
print("Initial shape =", initial_shape)
[########################################] | 100% Completed | 46.3s
Initial shape = (1911101, 109)
# how bad is the nan situation for critical columns? get counts as a percent of total
# utils helper returns a (lazy) frame with nan 'count' and 'percent' per column
hgst_nans = utils.get_nan_count_percent(clean_df, num_total_datapts)
###################################################################
# # this chunk of code is used to get cols with the same percent of
# # nan values
# val = None
# for k, v in hgst_nanpercent.iteritems():
# if v > 0.999197 and v < 0.999199:
# if val is None:
# val = v
# print(k)
# print(v)
# if v == val:
# print('same as val')
# else:
# print('different')
###################################################################
# show only the columns which are not ALL nans, sorted worst-first
# & (hgst_nans['percent'] != 1)
# (hgst_nans['percent'] != 0) &
hgst_nans[(hgst_nans["percent"] != 1)].compute().sort_values(
    by="percent", ascending=False
)
[########################################] | 100% Completed | 52.5s
count | percent | |
---|---|---|
smart_22_normalized | 1809673 | 0.946927 |
smart_22_raw | 1809673 | 0.946927 |
smart_9_raw | 23 | 0.000012 |
smart_10_normalized | 23 | 0.000012 |
smart_12_normalized | 23 | 0.000012 |
smart_12_raw | 23 | 0.000012 |
smart_192_normalized | 23 | 0.000012 |
smart_192_raw | 23 | 0.000012 |
smart_193_normalized | 23 | 0.000012 |
smart_193_raw | 23 | 0.000012 |
smart_194_normalized | 23 | 0.000012 |
smart_194_raw | 23 | 0.000012 |
smart_196_normalized | 23 | 0.000012 |
smart_196_raw | 23 | 0.000012 |
smart_197_normalized | 23 | 0.000012 |
smart_197_raw | 23 | 0.000012 |
smart_198_normalized | 23 | 0.000012 |
smart_198_raw | 23 | 0.000012 |
smart_199_normalized | 23 | 0.000012 |
smart_10_raw | 23 | 0.000012 |
smart_199_raw | 23 | 0.000012 |
smart_3_raw | 23 | 0.000012 |
smart_4_normalized | 23 | 0.000012 |
smart_1_normalized | 23 | 0.000012 |
smart_1_raw | 23 | 0.000012 |
smart_2_normalized | 23 | 0.000012 |
smart_2_raw | 23 | 0.000012 |
smart_3_normalized | 23 | 0.000012 |
smart_9_normalized | 23 | 0.000012 |
smart_4_raw | 23 | 0.000012 |
smart_5_normalized | 23 | 0.000012 |
smart_5_raw | 23 | 0.000012 |
smart_7_normalized | 23 | 0.000012 |
smart_7_raw | 23 | 0.000012 |
smart_8_normalized | 23 | 0.000012 |
smart_8_raw | 23 | 0.000012 |
capacity_bytes | 0 | 0.000000 |
model | 0 | 0.000000 |
serial_number | 0 | 0.000000 |
failure | 0 | 0.000000 |
date | 0 | 0.000000 |
RESULT: For HGST, most features are either all-nan or almost-no-nan. The all-nan columns can be dropped. Within the almost-no-nan features, none of the nans are coming from failed drives. So it is safe to drop them, but still we will impute them with median values.
Part 1: Drop columns where all are nans
# get columns for which all the values are nans
all_nan_cols = hgst_nans[hgst_nans["percent"] == 1].index.compute()
# sanity check -- make sure the columns identified as all-nans are actually so
# (spot-check the first 6 such columns)
clean_df[all_nan_cols[:6]].head()
[########################################] | 100% Completed | 42.1s
[########################################] | 100% Completed | 3.2s
smart_11_normalized | smart_11_raw | smart_13_normalized | smart_13_raw | smart_15_normalized | smart_15_raw | |
---|---|---|---|---|---|---|
1 | NaN | NaN | NaN | NaN | NaN | NaN |
10 | NaN | NaN | NaN | NaN | NaN | NaN |
11 | NaN | NaN | NaN | NaN | NaN | NaN |
13 | NaN | NaN | NaN | NaN | NaN | NaN |
17 | NaN | NaN | NaN | NaN | NaN | NaN |
# drop it like it's hot -- remove the all-nan columns identified above
clean_df = clean_df.drop(all_nan_cols, axis=1)
# how do things look after this (worst columns first)
utils.get_nan_count_percent(clean_df, num_total_datapts).compute().sort_values(
    by="percent", ascending=False
)
[########################################] | 100% Completed | 41.5s
count | percent | |
---|---|---|
smart_22_normalized | 1809673 | 0.946927 |
smart_22_raw | 1809673 | 0.946927 |
smart_9_raw | 23 | 0.000012 |
smart_10_normalized | 23 | 0.000012 |
smart_12_normalized | 23 | 0.000012 |
smart_12_raw | 23 | 0.000012 |
smart_192_normalized | 23 | 0.000012 |
smart_192_raw | 23 | 0.000012 |
smart_193_normalized | 23 | 0.000012 |
smart_193_raw | 23 | 0.000012 |
smart_194_normalized | 23 | 0.000012 |
smart_194_raw | 23 | 0.000012 |
smart_196_normalized | 23 | 0.000012 |
smart_196_raw | 23 | 0.000012 |
smart_197_normalized | 23 | 0.000012 |
smart_197_raw | 23 | 0.000012 |
smart_198_normalized | 23 | 0.000012 |
smart_198_raw | 23 | 0.000012 |
smart_199_normalized | 23 | 0.000012 |
smart_10_raw | 23 | 0.000012 |
smart_199_raw | 23 | 0.000012 |
smart_3_raw | 23 | 0.000012 |
smart_4_normalized | 23 | 0.000012 |
smart_1_normalized | 23 | 0.000012 |
smart_1_raw | 23 | 0.000012 |
smart_2_normalized | 23 | 0.000012 |
smart_2_raw | 23 | 0.000012 |
smart_3_normalized | 23 | 0.000012 |
smart_9_normalized | 23 | 0.000012 |
smart_4_raw | 23 | 0.000012 |
smart_5_normalized | 23 | 0.000012 |
smart_5_raw | 23 | 0.000012 |
smart_7_normalized | 23 | 0.000012 |
smart_7_raw | 23 | 0.000012 |
smart_8_normalized | 23 | 0.000012 |
smart_8_raw | 23 | 0.000012 |
capacity_bytes | 0 | 0.000000 |
model | 0 | 0.000000 |
serial_number | 0 | 0.000000 |
failure | 0 | 0.000000 |
date | 0 | 0.000000 |
Part 2: Drop The Small Number Of Rows Which Are Nans
# get the data points where 193 is null. inspect it for patterns
# (smart 193 is used as a representative of the almost-no-nan columns, which
# all show the same nan count of 23 in the table above)
nan193_df = clean_df[clean_df["smart_193_raw"].isna()].compute()
# number of nans in this subset as a percentage of nans in the overall data
utils.get_nan_count_percent(nan193_df, num_total_datapts)
[########################################] | 100% Completed | 38.5s
count | percent | |
---|---|---|
date | 0 | 0.000000 |
serial_number | 0 | 0.000000 |
model | 0 | 0.000000 |
capacity_bytes | 0 | 0.000000 |
failure | 0 | 0.000000 |
smart_1_normalized | 23 | 0.000012 |
smart_1_raw | 23 | 0.000012 |
smart_2_normalized | 23 | 0.000012 |
smart_2_raw | 23 | 0.000012 |
smart_3_normalized | 23 | 0.000012 |
smart_3_raw | 23 | 0.000012 |
smart_4_normalized | 23 | 0.000012 |
smart_4_raw | 23 | 0.000012 |
smart_5_normalized | 23 | 0.000012 |
smart_5_raw | 23 | 0.000012 |
smart_7_normalized | 23 | 0.000012 |
smart_7_raw | 23 | 0.000012 |
smart_8_normalized | 23 | 0.000012 |
smart_8_raw | 23 | 0.000012 |
smart_9_normalized | 23 | 0.000012 |
smart_9_raw | 23 | 0.000012 |
smart_10_normalized | 23 | 0.000012 |
smart_10_raw | 23 | 0.000012 |
smart_12_normalized | 23 | 0.000012 |
smart_12_raw | 23 | 0.000012 |
smart_22_normalized | 23 | 0.000012 |
smart_22_raw | 23 | 0.000012 |
smart_192_normalized | 23 | 0.000012 |
smart_192_raw | 23 | 0.000012 |
smart_193_normalized | 23 | 0.000012 |
smart_193_raw | 23 | 0.000012 |
smart_194_normalized | 23 | 0.000012 |
smart_194_raw | 23 | 0.000012 |
smart_196_normalized | 23 | 0.000012 |
smart_196_raw | 23 | 0.000012 |
smart_197_normalized | 23 | 0.000012 |
smart_197_raw | 23 | 0.000012 |
smart_198_normalized | 23 | 0.000012 |
smart_198_raw | 23 | 0.000012 |
smart_199_normalized | 23 | 0.000012 |
smart_199_raw | 23 | 0.000012 |
# number of failed drives in subset -- zero means dropping/imputing these rows
# cannot lose any failure signal
len(nan193_df[nan193_df["failure"] == 1])
0
# the serial numbers which are producing nans (numpy array of unique serials)
nan193_sers = nan193_df["serial_number"].unique()
nan193_sers
array(['PL2331LAHDW5VJ', 'PL1331LAHDYU4H', 'PL2331LAHAW48J',
'PL1331LAHD57MH', 'PL2331LAHDNMTJ', 'PL2331LAHDRRAJ',
'PL1331LAHDYUKH', 'PL1331LAHDYUVH', 'PL2331LAHDRX9J',
'PL2331LAHDHS8J', 'PL2331LAHDHE0J', 'PL1331LAH4BMZH',
'PL1331LAHD580H', 'PL2331LAHDSTHJ', 'PL2331LAHDRDMJ',
'PL1331LAHD58PH', 'PL2331LAHAG8DJ', 'PL1331LAHD58GH',
'PL1331LAGRGB7H', 'PL1331LAGRVERH', 'PL2331LAGPZDRJ',
'PL1331LAGRKDLH', 'PL1331LAGRU0NH'], dtype=object)
# inspect samples one by one to see if there is anything we need to consider before filling nans
# (full time series for one of the serial numbers listed above)
clean_df[clean_df["serial_number"] == "PL2331LAHDW5VJ"].compute()
[########################################] | 100% Completed | 36.7s
date | serial_number | model | capacity_bytes | failure | smart_1_normalized | smart_1_raw | smart_2_normalized | smart_2_raw | smart_3_normalized | ... | smart_194_normalized | smart_194_raw | smart_196_normalized | smart_196_raw | smart_197_normalized | smart_197_raw | smart_198_normalized | smart_198_raw | smart_199_normalized | smart_199_raw | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3498 | 2018-07-01 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3498 | 2018-07-02 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3499 | 2018-07-03 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3470 | 2018-07-04 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3468 | 2018-07-05 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3469 | 2018-07-06 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3467 | 2018-07-07 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3466 | 2018-07-08 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3466 | 2018-07-09 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3466 | 2018-07-10 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | -1.000000e+00 | 0.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3465 | 2018-07-11 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 187.0 | 32.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3501 | 2018-07-12 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3499 | 2018-07-13 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3499 | 2018-07-14 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 187.0 | 32.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3496 | 2018-07-15 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 187.0 | 32.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3496 | 2018-07-16 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 187.0 | 32.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3496 | 2018-07-17 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 187.0 | 32.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3496 | 2018-07-18 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 187.0 | 32.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3497 | 2018-07-19 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3497 | 2018-07-20 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3550 | 2018-07-21 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3543 | 2018-07-22 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3543 | 2018-07-23 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3543 | 2018-07-24 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 193.0 | 31.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3545 | 2018-07-25 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3549 | 2018-07-26 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3549 | 2018-07-27 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3596 | 2018-07-28 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3383 | 2018-07-29 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3587 | 2018-07-30 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3538 | 2018-09-01 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3538 | 2018-09-02 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3538 | 2018-09-03 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3538 | 2018-09-04 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3538 | 2018-09-05 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3539 | 2018-09-06 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3541 | 2018-09-07 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3540 | 2018-09-08 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3529 | 2018-09-09 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3529 | 2018-09-10 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3532 | 2018-09-11 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3525 | 2018-09-12 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3525 | 2018-09-13 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3523 | 2018-09-14 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3523 | 2018-09-15 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3523 | 2018-09-16 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3524 | 2018-09-17 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3525 | 2018-09-18 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3525 | 2018-09-19 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3525 | 2018-09-20 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3526 | 2018-09-21 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3526 | 2018-09-22 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3526 | 2018-09-23 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3526 | 2018-09-24 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3563 | 2018-09-25 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3561 | 2018-09-26 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3561 | 2018-09-27 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3561 | 2018-09-28 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 187.0 | 32.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3516 | 2018-09-29 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 176.0 | 34.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
3511 | 2018-09-30 | PL2331LAHDW5VJ | HGST HMS5C4040BLE640 | 4.000787e+12 | 0.0 | 100.0 | 0.0 | 134.0 | 100.0 | 100.0 | ... | 181.0 | 33.0 | 100.0 | 0.0 | 100.0 | 0.0 | 100.0 | 0.0 | 200.0 | 0.0 |
92 rows × 41 columns
# for each serial number, the fraction of its rows (within nan193_df) where
# smart_193_raw is nan
ser_nanpercent = (
    nan193_df[["date", "serial_number", "smart_193_raw"]]
    .groupby("serial_number")
    .apply(lambda group: group["smart_193_raw"].isna().sum() / group.shape[0])
)
# are there any groups (serial_numbers) that have more than 1 nan row?
# NOTE(review): nan193_df only contains rows where smart_193_raw is already nan,
# so every group fraction is 1 by construction -- this is expected to be False
(ser_nanpercent != 1).any()
False
HUNCH: The 1 row of nan in each group exists because some glitch occurred on one particular day
# get the days on which nan values occurred
nan193_df["date"].value_counts()
2018-07-10 18
2018-08-15 5
Name: date, dtype: int64
# check if there were ANY non-nan values on this doomsday
# BUG FIX: the original filtered on "2018-11-17", a date outside the Q3 2018
# dataset (July-September), so badday_df was empty and every check below was
# vacuously true. use "2018-07-10" -- the day contributing most of the nan
# rows (18 of 23, per the value_counts output above).
badday_df = clean_df[clean_df["date"] == "2018-07-10"]
# for each col, true means ALL drives reported nans for this col on this day
is_allnan_col = badday_df.isna().all().compute()
# fraction of cols for which ALL drives reported nans
is_allnan_col.sum() / is_allnan_col.shape[0]
[########################################] | 100% Completed | 37.4s
1.0
# did we have any failure cases on this day
# (False means removing/filling these rows cannot affect the failure label)
(badday_df["failure"] == 1).any().compute()
[########################################] | 100% Completed | 34.0s
False
# are there any entries on this day where smart 22 is not null?
# this is checked because smart 22 is almost always null, so if we find something non-null, it's worth keeping
# (True here means every smart 22 entry on this day is nan)
badday_df[["smart_22_raw", "smart_22_normalized"]].isna().all().compute()
[########################################] | 100% Completed | 36.6s
smart_22_raw True
smart_22_normalized True
dtype: bool
# see what non nan values existed
# (rows on this day where smart 22 was actually reported)
badday_df[~badday_df["smart_22_raw"].isna()][
    ["date", "serial_number", "model", "capacity_bytes", "failure", "smart_22_raw"]
].compute()
[########################################] | 100% Completed | 43.4s
date | serial_number | model | capacity_bytes | failure | smart_22_raw |
---|
RESULT: Since there exist non null values on this day, it is not wise to remove all entries from this day. This is especially true for SMART 22, where non null values are rare. For the columns that had null values only on this day, we can forward fill the values from the previous day.
NOTE: SMART 22 is rare in general because it is a feature related to helium drives. So not all drives will report it.
# forward-fill the few remaining nan rows
# NOTE: despite the original plan of a dummy/median fill, ffill is used here --
# carrying the previous day's reading forward is more faithful for a
# slowly-changing time series (see TODO below re: median)
# only fill smart attributes -- never identifiers like date or serial_number
cols_to_fill = [col for col in nan193_df.columns if col.startswith("smart")]
# dont want to fill smart 22: it is null by design for non-helium drives,
# so a nan there is informative rather than a glitch
cols_to_fill.remove("smart_22_raw")
cols_to_fill.remove("smart_22_normalized")
# must do it in a for loop: dask does not like assigning with a list of columns,
# and masking isna of specific columns is not straightforward
# NOTE(review): ffill runs in row order, not per serial_number -- values can
# leak across drives at boundaries; acceptable here since only 23 rows are
# affected and none belong to failed drives
for col in cols_to_fill:
    # # TODO: replace value_counts+max with median, when it is implemented in dask
    # clean_df[col] = clean_df[col].fillna(value=clean_df[col].value_counts().idxmax())
    clean_df[col] = clean_df[col].ffill()
# how do things look after this
# per-column nan count and percentage over the whole (cleaned) dataframe
utils.get_nan_count_percent(clean_df, num_total_datapts).compute()
[########################################] | 100% Completed | 52.2s
count | percent | |
---|---|---|
date | 0 | 0.000000 |
serial_number | 0 | 0.000000 |
model | 0 | 0.000000 |
capacity_bytes | 0 | 0.000000 |
failure | 0 | 0.000000 |
smart_1_normalized | 0 | 0.000000 |
smart_1_raw | 0 | 0.000000 |
smart_2_normalized | 0 | 0.000000 |
smart_2_raw | 0 | 0.000000 |
smart_3_normalized | 0 | 0.000000 |
smart_3_raw | 0 | 0.000000 |
smart_4_normalized | 0 | 0.000000 |
smart_4_raw | 0 | 0.000000 |
smart_5_normalized | 0 | 0.000000 |
smart_5_raw | 0 | 0.000000 |
smart_7_normalized | 0 | 0.000000 |
smart_7_raw | 0 | 0.000000 |
smart_8_normalized | 0 | 0.000000 |
smart_8_raw | 0 | 0.000000 |
smart_9_normalized | 0 | 0.000000 |
smart_9_raw | 0 | 0.000000 |
smart_10_normalized | 0 | 0.000000 |
smart_10_raw | 0 | 0.000000 |
smart_12_normalized | 0 | 0.000000 |
smart_12_raw | 0 | 0.000000 |
smart_22_normalized | 1809673 | 0.946927 |
smart_22_raw | 1809673 | 0.946927 |
smart_192_normalized | 0 | 0.000000 |
smart_192_raw | 0 | 0.000000 |
smart_193_normalized | 0 | 0.000000 |
smart_193_raw | 0 | 0.000000 |
smart_194_normalized | 0 | 0.000000 |
smart_194_raw | 0 | 0.000000 |
smart_196_normalized | 0 | 0.000000 |
smart_196_raw | 0 | 0.000000 |
smart_197_normalized | 0 | 0.000000 |
smart_197_raw | 0 | 0.000000 |
smart_198_normalized | 0 | 0.000000 |
smart_198_raw | 0 | 0.000000 |
smart_199_normalized | 0 | 0.000000 |
smart_199_raw | 0 | 0.000000 |
# free memory held by intermediates from the doomsday / nan analysis above
del nan193_df, nan193_sers, ser_nanpercent, badday_df, is_allnan_col, cols_to_fill
gc.collect()
4195
Part 3: Fill Dummy Value Where Nans Are Indicative Features
SMART 22, i.e. Current Helium Level, is not applicable to regular hard drives. That is the reason for null values for this feature. This feature is important, so it shouldn't be dropped, but it cannot be filled with mean/median/mode because it represents a distinct property of drives. So, a dummy value that does not exist anywhere else in the dataset is used to fill NaNs.
However, if a nan exists within a helium drive, then fill it with mean/median/ffill.
NOTE: Some libraries are ok with having NaNs instead of dummy values but sklearn is not. Hence the dummy filling is needed.
# serial numbers of all drives where 22 is reported as non nan at least once
nonnan22_serials = (
    clean_df[~clean_df["smart_22_raw"].isna()]["serial_number"].unique().compute()
)  # len = 2334
# of these serial numbers, which ones report at least one nan as well
isanynan22_serials = clean_df[clean_df["serial_number"].isin(nonnan22_serials)][
    ["date", "serial_number", "smart_22_raw"]
]
# pass `meta` explicitly: the lambda returns one bool per group, and without
# meta dask infers it from partial data and emits a UserWarning
isanynan22_serials = (
    isanynan22_serials.groupby("serial_number")
    .apply(lambda g: g["smart_22_raw"].isna().any(), meta=("smart_22_raw", "bool"))
    .compute()
)
[########################################] | 100% Completed | 44.6s
/home/kachauha/.local/share/virtualenvs/ceph_drive_failure-3-2yJKyM/lib/python3.7/site-packages/ipykernel_launcher.py:6: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.
Before: .apply(func)
After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
or: .apply(func, meta=('x', 'f8')) for series result
[########################################] | 100% Completed | 43.6s
# these are the drives that report at least one nan, and are known to be helium drives
# because they have reported non-nan value for smart 22 at least once
# (boolean-indexing a bool series keeps only the True entries)
helium_nans = isanynan22_serials[isanynan22_serials]
# display; an empty series means no helium drive ever reports a nan for smart 22
helium_nans
Series([], dtype: bool)
# get detailed data for these drives (materialized to pandas for inspection)
cols = ["date", "serial_number", "failure", "smart_22_raw", "smart_22_normalized"]
tmp = clean_df[clean_df["serial_number"].isin(helium_nans.index)][cols].compute()
[########################################] | 100% Completed | 51.2s
# on what dates do nan values occur for smart 22 for helium drives
# compute the unique dates once instead of evaluating the same filter twice
nan22_dates = tmp[tmp["smart_22_raw"].isna()]["date"].unique()
print(nan22_dates)
[]
tmp[["serial_number", "smart_22_raw"]].groupby("serial_number").agg(["mean", "std"])
smart_22_raw | ||
---|---|---|
mean | std | |
serial_number |
# per-drive mean/std of normalized smart 22 for the same drives
tmp[["serial_number", "smart_22_normalized"]].groupby("serial_number").agg(
    ["mean", "std"]
)
smart_22_normalized | ||
---|---|---|
mean | std | |
serial_number |
clean_df["smart_22_normalized"].value_counts().compute()
[########################################] | 100% Completed | 52.3s
100.0 160200
99.0 19
98.0 16
97.0 11
95.0 7
86.0 7
84.0 6
89.0 5
79.0 5
85.0 5
96.0 5
90.0 4
77.0 4
72.0 4
73.0 4
74.0 4
83.0 4
78.0 4
88.0 3
92.0 3
82.0 3
75.0 3
76.0 2
80.0 2
81.0 2
94.0 2
87.0 2
91.0 1
93.0 1
70.0 1
68.0 1
Name: smart_22_normalized, dtype: int64
clean_df["smart_22_raw"].value_counts().compute()
[########################################] | 100% Completed | 54.5s
100.0 160200
99.0 19
98.0 16
97.0 11
95.0 7
86.0 7
84.0 6
89.0 5
79.0 5
85.0 5
96.0 5
90.0 4
77.0 4
72.0 4
73.0 4
74.0 4
83.0 4
78.0 4
88.0 3
92.0 3
82.0 3
75.0 3
76.0 2
80.0 2
81.0 2
94.0 2
87.0 2
91.0 1
93.0 1
70.0 1
68.0 1
Name: smart_22_raw, dtype: int64
RESULT: The helium drives that report some nans do so only on one specific date. The mean (and mode) SMART 22 value for these drives is 100. So we replace the nans reported on that day with 100. After this step, all the helium drive nans are taken care of. Thus, we can proceed with filling the rest of the nans with a dummy value.
# mean raw smart 22 across all rows of the helium drives that had nans
# (this value is used as the fill value in the cell below)
clean_df[clean_df["serial_number"].isin(helium_nans.index)][
    "smart_22_raw"
].mean().compute()
[########################################] | 100% Completed | 50.9s
100.0
(clean_df["date"] == "2018-11-17").sum().compute()
[########################################] | 100% Completed | 52.5s
21598
# rows which were recorded on the doomsday, and belong to drives that report non null smart 22
is_helium = clean_df["serial_number"].isin(helium_nans.index)
is_doomsday = clean_df["date"] == "2018-11-17"
# replace with mean values within that group
cols_to_fill = ["smart_22_raw", "smart_22_normalized"]
for col in cols_to_fill:
    # mask() replaces values where the condition is True; all other rows
    # are left unchanged
    clean_df[col] = clean_df[col].mask(
        is_helium & is_doomsday, clean_df[is_helium][col].mean()
    )
# make sure that as of this cell, drives report either all nans or no nans for smart 22
tmp = clean_df[clean_df["serial_number"].isin(nonnan22_serials)][
    ["date", "serial_number", "smart_22_raw"]
]
# pass `meta` explicitly (one bool per group) so dask does not emit the
# "`meta` is not specified" UserWarning
tmp = tmp.groupby("serial_number").apply(
    lambda g: g["smart_22_raw"].isna().any(), meta=("smart_22_raw", "bool")
)
# False means no helium drive still has a partial set of nans
tmp.any().compute()
[ ] | 0% Completed | 0.0s
/home/kachauha/.local/share/virtualenvs/ceph_drive_failure-3-2yJKyM/lib/python3.7/site-packages/ipykernel_launcher.py:3: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected.
Before: .apply(func)
After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
or: .apply(func, meta=('x', 'f8')) for series result
This is separate from the ipykernel package so we can avoid doing imports until
[########################################] | 100% Completed | 45.9s
False
# fill the rest of the nan values with dummy values
# DUMMY_VALUE is defined earlier in the notebook (not visible here);
# presumably chosen so it does not collide with any real smart 22 value --
# TODO confirm against the cell that defines it
cols_to_fill = ["smart_22_raw", "smart_22_normalized"]
for col in cols_to_fill:
    clean_df[col] = clean_df[col].fillna(DUMMY_VALUE)
# how do things look after this
# sorted so any remaining nan-heavy columns float to the top
utils.get_nan_count_percent(clean_df, num_total_datapts).compute().sort_values(
    by="percent", ascending=False
)
[########################################] | 100% Completed | 44.3s
count | percent | |
---|---|---|
date | 0 | 0.0 |
smart_10_normalized | 0 | 0.0 |
smart_12_normalized | 0 | 0.0 |
smart_12_raw | 0 | 0.0 |
smart_22_normalized | 0 | 0.0 |
smart_22_raw | 0 | 0.0 |
smart_192_normalized | 0 | 0.0 |
smart_192_raw | 0 | 0.0 |
smart_193_normalized | 0 | 0.0 |
smart_193_raw | 0 | 0.0 |
smart_194_normalized | 0 | 0.0 |
smart_194_raw | 0 | 0.0 |
smart_196_normalized | 0 | 0.0 |
smart_196_raw | 0 | 0.0 |
smart_197_normalized | 0 | 0.0 |
smart_197_raw | 0 | 0.0 |
smart_198_normalized | 0 | 0.0 |
smart_198_raw | 0 | 0.0 |
smart_199_normalized | 0 | 0.0 |
smart_10_raw | 0 | 0.0 |
smart_9_raw | 0 | 0.0 |
serial_number | 0 | 0.0 |
smart_9_normalized | 0 | 0.0 |
model | 0 | 0.0 |
capacity_bytes | 0 | 0.0 |
failure | 0 | 0.0 |
smart_1_normalized | 0 | 0.0 |
smart_1_raw | 0 | 0.0 |
smart_2_normalized | 0 | 0.0 |
smart_2_raw | 0 | 0.0 |
smart_3_normalized | 0 | 0.0 |
smart_3_raw | 0 | 0.0 |
smart_4_normalized | 0 | 0.0 |
smart_4_raw | 0 | 0.0 |
smart_5_normalized | 0 | 0.0 |
smart_5_raw | 0 | 0.0 |
smart_7_normalized | 0 | 0.0 |
smart_7_raw | 0 | 0.0 |
smart_8_normalized | 0 | 0.0 |
smart_8_raw | 0 | 0.0 |
smart_199_raw | 0 | 0.0 |
# release the smart-22 analysis intermediates before repartitioning
del tmp, nonnan22_serials, isanynan22_serials, helium_nans
gc.collect()
3921
Repartition and Save
The size of dataset may have changed after cleanup, so repartition for efficiency, and then save to disk so that cleanup does not have to be done every time for experimentation. According to Dask docs, the ideal size of a partition is around 100 MB (source: https://docs.dask.org/en/latest/dataframe-best-practices.html)
# get ideal number of partitions
# target ~100 MB per partition, per the dask best-practices guide linked above
PARTITION_SIZE_BYTES = 100 * 10 ** 6
# deep=True so object (string) columns are measured by actual content size
DF_SIZE_BYTES = clean_df.memory_usage(deep=True).sum().compute()
# ceil so the last (partial) chunk still gets its own partition
NUM_PARTITIONS = int(np.ceil(DF_SIZE_BYTES / PARTITION_SIZE_BYTES))
# repartition and save cleaned version of data
clean_df = clean_df.repartition(npartitions=NUM_PARTITIONS)
# create save dir as same name as data dir, but suffixed with "_clean" and vendor name
# NOTE(review): the suffix says "hgst" while this notebook's intro says Seagate
# drives were selected -- confirm the vendor tag matches the filtering upstream
head, tail = os.path.split(os.path.normpath(DATA_DIR))
save_dir = os.path.join(head, tail + "_hgst_clean")
# create the directory if needed; exist_ok avoids the check-then-create race
# of the previous os.path.exists() guard
os.makedirs(save_dir, exist_ok=True)
# save partitions; dask substitutes the partition number for "*" in the name
clean_df.to_csv(os.path.join(save_dir, "partition_*.csv"), index=False)
[########################################] | 100% Completed | 49.7s
[########################################] | 100% Completed | 1min 25.3s
['/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_0.csv',
'/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_1.csv',
'/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_2.csv',
'/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_3.csv',
'/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_4.csv',
'/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_5.csv',
'/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_6.csv',
'/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_7.csv']
# unregister dask compute call progress bar so the globally-registered
# callback does not affect later notebooks in the same session
pbar.unregister()