ODH Logo

Step 2: Clean Up Data

In the previous step, the data was explored and it was found that the patterns in the data are quite different for each vendor. Therefore, in this notebook the data is cleaned up and analyzed for a single vendor. The cells below work with HGST drives, which account for roughly 1.9M of the ~9M data points in the Backblaze dataset.

The cleanup in this notebook mainly consists of intelligently removing or replacing NaN values.

[1]
import gc
import os

import numpy as np

import dask.dataframe as dd
from dask.diagnostics import ProgressBar

from src import utils
[2]
# register progress bar for compute calls in dask so we have an estimate of how long task will take
pbar = ProgressBar()
pbar.register()
[3]
# inferred int32 types cause a type mismatch (int vs float) error when dask sees a null value
# null values cannot be interpreted as ints
custom_dtypes = {
    "date": "object",
    "serial_number": "object",
    "model": "object",
    "capacity_bytes": "float32",
    "failure": "float32",
    "smart_1_normalized": "float32",
    "smart_1_raw": "float32",
    "smart_2_normalized": "float32",
    "smart_2_raw": "float32",
    "smart_3_normalized": "float32",
    "smart_3_raw": "float32",
    "smart_4_normalized": "float32",
    "smart_4_raw": "float32",
    "smart_5_normalized": "float32",
    "smart_5_raw": "float32",
    "smart_7_normalized": "float32",
    "smart_7_raw": "float32",
    "smart_8_normalized": "float32",
    "smart_8_raw": "float32",
    "smart_9_normalized": "float32",
    "smart_9_raw": "float32",
    "smart_10_normalized": "float32",
    "smart_10_raw": "float32",
    "smart_11_normalized": "float32",
    "smart_11_raw": "float32",
    "smart_12_normalized": "float32",
    "smart_12_raw": "float32",
    "smart_13_normalized": "float32",
    "smart_13_raw": "float32",
    "smart_15_normalized": "float32",
    "smart_15_raw": "float32",
    "smart_16_normalized": "float32",
    "smart_16_raw": "float32",
    "smart_17_normalized": "float32",
    "smart_17_raw": "float32",
    "smart_22_normalized": "float32",
    "smart_22_raw": "float32",
    "smart_23_normalized": "float32",
    "smart_23_raw": "float32",
    "smart_24_normalized": "float32",
    "smart_24_raw": "float32",
    "smart_168_normalized": "float32",
    "smart_168_raw": "float32",
    "smart_170_normalized": "float32",
    "smart_170_raw": "float32",
    "smart_173_normalized": "float32",
    "smart_173_raw": "float32",
    "smart_174_normalized": "float32",
    "smart_174_raw": "float32",
    "smart_177_normalized": "float32",
    "smart_177_raw": "float32",
    "smart_179_normalized": "float32",
    "smart_179_raw": "float32",
    "smart_181_normalized": "float32",
    "smart_181_raw": "float32",
    "smart_182_normalized": "float32",
    "smart_182_raw": "float32",
    "smart_183_normalized": "float32",
    "smart_183_raw": "float32",
    "smart_184_normalized": "float32",
    "smart_184_raw": "float32",
    "smart_187_normalized": "float32",
    "smart_187_raw": "float32",
    "smart_188_normalized": "float32",
    "smart_188_raw": "float32",
    "smart_189_normalized": "float32",
    "smart_189_raw": "float32",
    "smart_190_normalized": "float32",
    "smart_190_raw": "float32",
    "smart_191_normalized": "float32",
    "smart_191_raw": "float32",
    "smart_192_normalized": "float32",
    "smart_192_raw": "float32",
    "smart_193_normalized": "float32",
    "smart_193_raw": "float32",
    "smart_194_normalized": "float32",
    "smart_194_raw": "float32",
    "smart_195_normalized": "float32",
    "smart_195_raw": "float32",
    "smart_196_normalized": "float32",
    "smart_196_raw": "float32",
    "smart_197_normalized": "float32",
    "smart_197_raw": "float32",
    "smart_198_normalized": "float32",
    "smart_198_raw": "float32",
    "smart_199_normalized": "float32",
    "smart_199_raw": "float32",
    "smart_200_normalized": "float32",
    "smart_200_raw": "float32",
    "smart_201_normalized": "float32",
    "smart_201_raw": "float32",
    "smart_218_normalized": "float32",
    "smart_218_raw": "float32",
    "smart_220_normalized": "float32",
    "smart_220_raw": "float32",
    "smart_222_normalized": "float32",
    "smart_222_raw": "float32",
    "smart_223_normalized": "float32",
    "smart_223_raw": "float32",
    "smart_224_normalized": "float32",
    "smart_224_raw": "float32",
    "smart_225_normalized": "float32",
    "smart_225_raw": "float32",
    "smart_226_normalized": "float32",
    "smart_226_raw": "float32",
    "smart_231_normalized": "float32",
    "smart_231_raw": "float32",
    "smart_232_normalized": "float32",
    "smart_232_raw": "float32",
    "smart_233_normalized": "float32",
    "smart_233_raw": "float32",
    "smart_235_normalized": "float32",
    "smart_235_raw": "float32",
    "smart_240_normalized": "float32",
    "smart_240_raw": "float32",
    "smart_241_normalized": "float32",
    "smart_241_raw": "float32",
    "smart_242_normalized": "float32",
    "smart_242_raw": "float32",
    "smart_250_normalized": "float32",
    "smart_250_raw": "float32",
    "smart_251_normalized": "float32",
    "smart_251_raw": "float32",
    "smart_252_normalized": "float32",
    "smart_252_raw": "float32",
    "smart_254_normalized": "float32",
    "smart_254_raw": "float32",
    "smart_255_normalized": "float32",
    "smart_255_raw": "float32",
}
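As an aside, the same columns could in principle be read with pandas' nullable integer dtype instead of float32, which keeps NaN-able counters as integers. A hedged sketch (assumes a pandas version that supports the "Int64" extension dtype in read_csv; the float32 approach above is what this notebook actually uses):

# hypothetical alternative: nullable integers keep NaNs without forcing floats
nullable_dtypes = {
    col: "Int64" if dtype == "float32" else dtype
    for col, dtype in custom_dtypes.items()
}
# df = dd.read_csv(os.path.join(DATA_DIR, "*.csv"), dtype=nullable_dtypes)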
[4]
# read all the data into one dataframe
DATA_DIR = "/home/kachauha/Downloads/data_Q3_2018"
df = dd.read_csv(os.path.join(DATA_DIR, "*.csv"), dtype=custom_dtypes)
[5]
# get the hgst data
hgst_df = df[df["model"].str.startswith("HG")]
[6]
# get the serial numbers for all the failed hard drives
failed_serials = hgst_df[hgst_df["failure"] == 1]["serial_number"].compute()
[########################################] | 100% Completed | 36.4s

NOTE: The date of failure and the model could also be added to failed_serials (and working_serials) if that would be helpful for preprocessing in the future.
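A small sketch of what that could look like (the failed_info name is hypothetical and not used later in this notebook):

# hypothetical: keep the failure date and model alongside each failed serial number
failed_info = hgst_df[hgst_df["failure"] == 1][
    ["serial_number", "date", "model"]
].compute()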

[7]
dd.compute(hgst_df.shape)
[########################################] | 100% Completed | 37.4s
((1911101, 109),)
[15]
# # get the serial numbers for all the working (non-failed) hard drives
# # multiple entries will exist per serial number since it is a time series. get only the last one
# working_serials = hgst_df[~hgst_df['serial_number'].isin(failed_serials)]\
#                     ['serial_number']\
#                     .drop_duplicates(keep='last')\
#                     .compute()
[########################################] | 100% Completed | 51.3s
2203    Z300XGTR
4389    Z300XA99
4900    Z3015BTR
6971    Z300WWNK
8459    Z30149AZ
Name: serial_number, dtype: object

NOTE: We can probably get away with not calculating working serials and instead using the negation of failed_serials for indexing. This will be much faster and more efficient, since we are searching in ~300 drives as opposed to ~80k drives.
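A sketch of that approach (the working_df name is hypothetical):

# hypothetical: filter on membership in the small failed-serials set directly,
# instead of materialising the full list of working serial numbers
working_df = hgst_df[~hgst_df["serial_number"].isin(failed_serials)]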

Ensure Unique Indices

Having a unique index per entry (e.g. 2018-10-01 Z305B2QN) will make data processing less prone to errors. It will also expand the variety of merging and groupby operations that can be performed on the data while still keeping it a dask dataframe (as opposed to a pandas dataframe). This is because some operations in dask do not support duplicate indices.
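A sketch of what such a composite key could look like (hypothetical; actually setting it as the index would require a full shuffle across partitions and is not done here):

# hypothetical: build a per-entry key from date + serial number
indexed_df = hgst_df.assign(
    entry_id=hgst_df["date"].astype(str) + " " + hgst_df["serial_number"]
)
# indexed_df = indexed_df.set_index("entry_id")  # would trigger a shuffle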

[7]
# # are there any duplicate indices? [SPOILER ALERT] yes
# tmp = hgst_df.index.compute()
# tmp.duplicated().any()    # True

# # can reset indices do the job?
# # no. see dask docs. indices are duplicated across dask partitions
# hgst_df = hgst_df.reset_index(drop=True)

# tmp = hgst_df.index.compute()
# tmp.duplicated().any()    # True
[12]
# proof of duplicate indices
hgst_df.loc[0, :].compute().head()
[########################################] | 100% Completed | 45.0s
date serial_number model capacity_bytes failure smart_1_normalized smart_1_raw smart_2_normalized smart_2_raw smart_3_normalized ... smart_250_normalized smart_250_raw smart_251_normalized smart_251_raw smart_252_normalized smart_252_raw smart_254_normalized smart_254_raw smart_255_normalized smart_255_raw

0 rows × 129 columns

RESULT: There does not seem to be a straightforward way to remove the index duplication. It will limit some operations, but we will work around them.

Manual Feature Selection

Previous work done in this area strongly suggests that not all stats are equally important. Backblaze suggests five, Wikipedia suggests nine, and IBM research suggests several more. Make a superset of all these stats and work with those for now. Using too many features is a rabbit hole we don't want to go down.

[8]
# work with critical columns for now
CRITICAL_STATS = [
    1,
    5,
    7,
    10,
    184,
    187,
    188,
    189,
    190,
    193,
    194,
    196,
    197,
    198,
    201,
    240,
    241,
    242,
]  # NOTE: 201 is all nans
crit_cols_raw = ["smart_{}_raw".format(i) for i in CRITICAL_STATS]
crit_cols_normalized = ["smart_{}_normalized".format(i) for i in CRITICAL_STATS]

Find Patterns in NaNs and Clean Accordingly

[9]
# these stats have already been analyzed and their nans accounted for (see https://trello.com/c/tjFl6RHf)
done_stats = [
    2,
    8,
    11,
    196,
    223,
    225,
    250,
    251,
    252,
    16,
    17,
    168,
    170,
    173,
    174,
    177,
    218,
    231,
    232,
    233,
    235,
    254,
    183,
    200,
    195,
    191,
]
done_cols = ["smart_{}_raw".format(i) for i in done_stats] + [
    "smart_{}_normalized".format(i) for i in done_stats
]

# the columns to keep for analysis
keep_cols = (
    ["date", "serial_number", "capacity_bytes", "failure"]
    + crit_cols_raw
    + crit_cols_normalized
)

# dummy value to replace nans
DUMMY_VALUE = -100

NOTE: It is observed (in the following cells) that for HGST, most of the critical columns selected above are entirely null, and only a few columns contain non-null values. Therefore, we will work with all columns for the task of nan handling.

[10]
# copy of df on which to perform cleaning tasks
clean_df = hgst_df

# get ideal number of partitions
PARTITION_SIZE_BYTES = 100 * 10 ** 6
DF_SIZE_BYTES = clean_df.memory_usage(deep=True).sum().compute()
NUM_PARTITIONS = int(np.ceil(DF_SIZE_BYTES / PARTITION_SIZE_BYTES))

# repartition and save cleaned version of data
clean_df = clean_df.repartition(npartitions=NUM_PARTITIONS)
[########################################] | 100% Completed | 39.9s
[11]
# meta data for later use
initial_shape = dd.compute(clean_df.shape)[0]
num_total_datapts = initial_shape[0]
print("Initial shape =", initial_shape)
[########################################] | 100% Completed | 46.3s Initial shape = (1911101, 109)
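The NaN profiling below uses utils.get_nan_count_percent from the repo's src package. A minimal sketch of what such a helper might look like (an assumption, not the actual implementation); it works on both dask and pandas dataframes and stays lazy in the dask case:

def get_nan_count_percent(df, denominator):
    # per-column NaN counts; remains lazy if df is a dask dataframe
    out = df.isna().sum().to_frame("count")
    out["percent"] = out["count"] / denominator
    return out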
[12]
# how bad is the nan situation for critical columns? get counts as a percent of total
hgst_nans = utils.get_nan_count_percent(clean_df, num_total_datapts)

###################################################################
# # this chunk of code is used to get cols with the same percent of
# # nan values
# val = None
# for k, v in hgst_nans['percent'].iteritems():
#     if v > 0.999197 and v < 0.999199:
#         if val is None:
#             val = v
#         print(k)
#         print(v)
#         if v == val:
#             print('same as val')
#         else:
#             print('different')
###################################################################

# show only columns which are not entirely nan
#  & (hgst_nans['percent'] != 1)
# (hgst_nans['percent'] != 0) &
hgst_nans[(hgst_nans["percent"] != 1)].compute().sort_values(
    by="percent", ascending=False
)
[########################################] | 100% Completed | 52.5s
count percent
smart_22_normalized 1809673 0.946927
smart_22_raw 1809673 0.946927
smart_9_raw 23 0.000012
smart_10_normalized 23 0.000012
smart_12_normalized 23 0.000012
smart_12_raw 23 0.000012
smart_192_normalized 23 0.000012
smart_192_raw 23 0.000012
smart_193_normalized 23 0.000012
smart_193_raw 23 0.000012
smart_194_normalized 23 0.000012
smart_194_raw 23 0.000012
smart_196_normalized 23 0.000012
smart_196_raw 23 0.000012
smart_197_normalized 23 0.000012
smart_197_raw 23 0.000012
smart_198_normalized 23 0.000012
smart_198_raw 23 0.000012
smart_199_normalized 23 0.000012
smart_10_raw 23 0.000012
smart_199_raw 23 0.000012
smart_3_raw 23 0.000012
smart_4_normalized 23 0.000012
smart_1_normalized 23 0.000012
smart_1_raw 23 0.000012
smart_2_normalized 23 0.000012
smart_2_raw 23 0.000012
smart_3_normalized 23 0.000012
smart_9_normalized 23 0.000012
smart_4_raw 23 0.000012
smart_5_normalized 23 0.000012
smart_5_raw 23 0.000012
smart_7_normalized 23 0.000012
smart_7_raw 23 0.000012
smart_8_normalized 23 0.000012
smart_8_raw 23 0.000012
capacity_bytes 0 0.000000
model 0 0.000000
serial_number 0 0.000000
failure 0 0.000000
date 0 0.000000

RESULT: For HGST, most features are either all-nan or almost-no-nan. The all-nan columns can be dropped. Within the almost-no-nan features, none of the nans come from failed drives, so those rows would be safe to drop, but we will impute them instead.

Part 1: Drop columns where all are nans

[13]
# get columns for which all the values are nans
all_nan_cols = hgst_nans[hgst_nans["percent"] == 1].index.compute()

# sanity check -- make sure the columns identified as all-nans are actually so
clean_df[all_nan_cols[:6]].head()
[########################################] | 100% Completed | 42.1s [########################################] | 100% Completed | 3.2s
smart_11_normalized smart_11_raw smart_13_normalized smart_13_raw smart_15_normalized smart_15_raw
1 NaN NaN NaN NaN NaN NaN
10 NaN NaN NaN NaN NaN NaN
11 NaN NaN NaN NaN NaN NaN
13 NaN NaN NaN NaN NaN NaN
17 NaN NaN NaN NaN NaN NaN
[14]
# drop it like it's hot
clean_df = clean_df.drop(all_nan_cols, axis=1)
[15]
# how do things look after this
utils.get_nan_count_percent(clean_df, num_total_datapts).compute().sort_values(
    by="percent", ascending=False
)
[########################################] | 100% Completed | 41.5s
count percent
smart_22_normalized 1809673 0.946927
smart_22_raw 1809673 0.946927
smart_9_raw 23 0.000012
smart_10_normalized 23 0.000012
smart_12_normalized 23 0.000012
smart_12_raw 23 0.000012
smart_192_normalized 23 0.000012
smart_192_raw 23 0.000012
smart_193_normalized 23 0.000012
smart_193_raw 23 0.000012
smart_194_normalized 23 0.000012
smart_194_raw 23 0.000012
smart_196_normalized 23 0.000012
smart_196_raw 23 0.000012
smart_197_normalized 23 0.000012
smart_197_raw 23 0.000012
smart_198_normalized 23 0.000012
smart_198_raw 23 0.000012
smart_199_normalized 23 0.000012
smart_10_raw 23 0.000012
smart_199_raw 23 0.000012
smart_3_raw 23 0.000012
smart_4_normalized 23 0.000012
smart_1_normalized 23 0.000012
smart_1_raw 23 0.000012
smart_2_normalized 23 0.000012
smart_2_raw 23 0.000012
smart_3_normalized 23 0.000012
smart_9_normalized 23 0.000012
smart_4_raw 23 0.000012
smart_5_normalized 23 0.000012
smart_5_raw 23 0.000012
smart_7_normalized 23 0.000012
smart_7_raw 23 0.000012
smart_8_normalized 23 0.000012
smart_8_raw 23 0.000012
capacity_bytes 0 0.000000
model 0 0.000000
serial_number 0 0.000000
failure 0 0.000000
date 0 0.000000

Part 2: Drop The Small Number Of Rows Which Are Nans

[16]
# get the data points where 193 is null. inspect it for patterns
nan193_df = clean_df[clean_df["smart_193_raw"].isna()].compute()

# number of nans in this subset, as a percentage of the total number of datapoints
utils.get_nan_count_percent(nan193_df, num_total_datapts)
[########################################] | 100% Completed | 38.5s
count percent
date 0 0.000000
serial_number 0 0.000000
model 0 0.000000
capacity_bytes 0 0.000000
failure 0 0.000000
smart_1_normalized 23 0.000012
smart_1_raw 23 0.000012
smart_2_normalized 23 0.000012
smart_2_raw 23 0.000012
smart_3_normalized 23 0.000012
smart_3_raw 23 0.000012
smart_4_normalized 23 0.000012
smart_4_raw 23 0.000012
smart_5_normalized 23 0.000012
smart_5_raw 23 0.000012
smart_7_normalized 23 0.000012
smart_7_raw 23 0.000012
smart_8_normalized 23 0.000012
smart_8_raw 23 0.000012
smart_9_normalized 23 0.000012
smart_9_raw 23 0.000012
smart_10_normalized 23 0.000012
smart_10_raw 23 0.000012
smart_12_normalized 23 0.000012
smart_12_raw 23 0.000012
smart_22_normalized 23 0.000012
smart_22_raw 23 0.000012
smart_192_normalized 23 0.000012
smart_192_raw 23 0.000012
smart_193_normalized 23 0.000012
smart_193_raw 23 0.000012
smart_194_normalized 23 0.000012
smart_194_raw 23 0.000012
smart_196_normalized 23 0.000012
smart_196_raw 23 0.000012
smart_197_normalized 23 0.000012
smart_197_raw 23 0.000012
smart_198_normalized 23 0.000012
smart_198_raw 23 0.000012
smart_199_normalized 23 0.000012
smart_199_raw 23 0.000012
[17]
# number of failed drives in subset
len(nan193_df[nan193_df["failure"] == 1])
0
[19]
# the serial numbers which are producing nans
nan193_sers = nan193_df["serial_number"].unique()
nan193_sers
array(['PL2331LAHDW5VJ', 'PL1331LAHDYU4H', 'PL2331LAHAW48J',
       'PL1331LAHD57MH', 'PL2331LAHDNMTJ', 'PL2331LAHDRRAJ',
       'PL1331LAHDYUKH', 'PL1331LAHDYUVH', 'PL2331LAHDRX9J',
       'PL2331LAHDHS8J', 'PL2331LAHDHE0J', 'PL1331LAH4BMZH',
       'PL1331LAHD580H', 'PL2331LAHDSTHJ', 'PL2331LAHDRDMJ',
       'PL1331LAHD58PH', 'PL2331LAHAG8DJ', 'PL1331LAHD58GH',
       'PL1331LAGRGB7H', 'PL1331LAGRVERH', 'PL2331LAGPZDRJ',
       'PL1331LAGRKDLH', 'PL1331LAGRU0NH'], dtype=object)
[22]
# inspect samples one by one to see if there is anything we need to consider before filling nans
clean_df[clean_df["serial_number"] == "PL2331LAHDW5VJ"].compute()
[########################################] | 100% Completed | 36.7s
date serial_number model capacity_bytes failure smart_1_normalized smart_1_raw smart_2_normalized smart_2_raw smart_3_normalized ... smart_194_normalized smart_194_raw smart_196_normalized smart_196_raw smart_197_normalized smart_197_raw smart_198_normalized smart_198_raw smart_199_normalized smart_199_raw
3498 2018-07-01 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3498 2018-07-02 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3499 2018-07-03 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3470 2018-07-04 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3468 2018-07-05 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3469 2018-07-06 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3467 2018-07-07 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3466 2018-07-08 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3466 2018-07-09 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3466 2018-07-10 PL2331LAHDW5VJ HGST HMS5C4040BLE640 -1.000000e+00 0.0 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3465 2018-07-11 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 187.0 32.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3501 2018-07-12 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3499 2018-07-13 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3499 2018-07-14 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 187.0 32.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3496 2018-07-15 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 187.0 32.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3496 2018-07-16 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 187.0 32.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3496 2018-07-17 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 187.0 32.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3496 2018-07-18 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 187.0 32.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3497 2018-07-19 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3497 2018-07-20 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3550 2018-07-21 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3543 2018-07-22 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3543 2018-07-23 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3543 2018-07-24 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 193.0 31.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3545 2018-07-25 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3549 2018-07-26 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3549 2018-07-27 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3596 2018-07-28 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3383 2018-07-29 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3587 2018-07-30 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3538 2018-09-01 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3538 2018-09-02 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3538 2018-09-03 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3538 2018-09-04 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3538 2018-09-05 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3539 2018-09-06 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3541 2018-09-07 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3540 2018-09-08 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3529 2018-09-09 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3529 2018-09-10 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3532 2018-09-11 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3525 2018-09-12 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3525 2018-09-13 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3523 2018-09-14 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3523 2018-09-15 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3523 2018-09-16 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3524 2018-09-17 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3525 2018-09-18 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3525 2018-09-19 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3525 2018-09-20 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3526 2018-09-21 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3526 2018-09-22 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3526 2018-09-23 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3526 2018-09-24 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3563 2018-09-25 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3561 2018-09-26 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3561 2018-09-27 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3561 2018-09-28 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 187.0 32.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3516 2018-09-29 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 176.0 34.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0
3511 2018-09-30 PL2331LAHDW5VJ HGST HMS5C4040BLE640 4.000787e+12 0.0 100.0 0.0 134.0 100.0 100.0 ... 181.0 33.0 100.0 0.0 100.0 0.0 100.0 0.0 200.0 0.0

92 rows × 41 columns

[21]
# for each serial number group, get the fraction of rows that contain nan values
ser_nanpercent = (
    nan193_df[["date", "serial_number", "smart_193_raw"]]
    .groupby("serial_number")
    .apply(lambda group: group["smart_193_raw"].isna().sum() / group.shape[0])
)

# are there any groups (serial_numbers) that have more than 1 nan row?
(ser_nanpercent != 1).any()
False

HUNCH: The 1 row of nan in each group exists because some glitch occurred on one particular day.

[24]
# get the days on which nan values occurred
nan193_df["date"].value_counts()
2018-07-10    18
2018-08-15     5
Name: date, dtype: int64
[25]
# check if there were ANY non-nan values on this doomsday
badday_df = clean_df[clean_df["date"] == "2018-11-17"]

# for each col, true means ALL drives reported nans for this col on this day
is_allnan_col = badday_df.isna().all().compute()

# percentage of cols for whom ALL drives reported nans
is_allnan_col.sum() / is_allnan_col.shape[0]
[########################################] | 100% Completed | 37.4s
1.0
[26]
# did we have any failure cases on this day
(badday_df["failure"] == 1).any().compute()
[########################################] | 100% Completed | 34.0s
False
[27]
# are there any entries on this day where smart 22 is not null?
# this is checked because smart 22 is almost always null, so if we find something non-null, it's worth keeping
badday_df[["smart_22_raw", "smart_22_normalized"]].isna().all().compute()
[########################################] | 100% Completed | 36.6s
smart_22_raw           True
smart_22_normalized    True
dtype: bool
[28]
# see what non nan values existed
badday_df[~badday_df["smart_22_raw"].isna()][
    ["date", "serial_number", "model", "capacity_bytes", "failure", "smart_22_raw"]
].compute()
[########################################] | 100% Completed | 43.4s
date serial_number model capacity_bytes failure smart_22_raw

RESULT: Since there exist non null values on this day, it is not wise to remove all entries from this day. This is especially true for SMART 22, where non null values are rare. For the columns that had null values only on this day, we can forward fill the values from the previous day.

NOTE: SMART 22 is rare in general because it is a feature related to helium drives, so not all drives will report it.

[29]
# forward fill nan values for the affected columns
cols_to_fill = list(nan193_df.columns)

# don't want to fill non-smart attributes like date, serial numbers
cols_to_fill = [col for col in cols_to_fill if col.startswith("smart")]

# dont want to fill smart 22
cols_to_fill.remove("smart_22_raw")
cols_to_fill.remove("smart_22_normalized")

# must do it in a for loop, dask does not like indexing with a list
# plus, it's not straightforward to mask isna of specific columns
for col in cols_to_fill:
    if col.startswith("smart"):
        #         # TODO: replace value_counts+max with median, when it is implemented in dask
        #         clean_df[col] = clean_df[col].fillna(value=clean_df[col].value_counts().idxmax())
        clean_df[col] = clean_df[col].ffill()
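The ffill above is applied column-wise over the frame's existing row order. A version that forward fills strictly within each drive's own time series (a hypothetical pandas sketch, e.g. applied to a computed subset) could look like:

import pandas as pd

def ffill_within_drive(pdf: pd.DataFrame, cols) -> pd.DataFrame:
    # forward fill the given columns within each serial number's own time series
    pdf = pdf.sort_values(["serial_number", "date"])
    pdf[cols] = pdf.groupby("serial_number")[cols].ffill()
    return pdf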
[30]
# how do things look after this
utils.get_nan_count_percent(clean_df, num_total_datapts).compute()
[########################################] | 100% Completed | 52.2s
count percent
date 0 0.000000
serial_number 0 0.000000
model 0 0.000000
capacity_bytes 0 0.000000
failure 0 0.000000
smart_1_normalized 0 0.000000
smart_1_raw 0 0.000000
smart_2_normalized 0 0.000000
smart_2_raw 0 0.000000
smart_3_normalized 0 0.000000
smart_3_raw 0 0.000000
smart_4_normalized 0 0.000000
smart_4_raw 0 0.000000
smart_5_normalized 0 0.000000
smart_5_raw 0 0.000000
smart_7_normalized 0 0.000000
smart_7_raw 0 0.000000
smart_8_normalized 0 0.000000
smart_8_raw 0 0.000000
smart_9_normalized 0 0.000000
smart_9_raw 0 0.000000
smart_10_normalized 0 0.000000
smart_10_raw 0 0.000000
smart_12_normalized 0 0.000000
smart_12_raw 0 0.000000
smart_22_normalized 1809673 0.946927
smart_22_raw 1809673 0.946927
smart_192_normalized 0 0.000000
smart_192_raw 0 0.000000
smart_193_normalized 0 0.000000
smart_193_raw 0 0.000000
smart_194_normalized 0 0.000000
smart_194_raw 0 0.000000
smart_196_normalized 0 0.000000
smart_196_raw 0 0.000000
smart_197_normalized 0 0.000000
smart_197_raw 0 0.000000
smart_198_normalized 0 0.000000
smart_198_raw 0 0.000000
smart_199_normalized 0 0.000000
smart_199_raw 0 0.000000
[31]
# clean up unused memory
del nan193_df
del nan193_sers
del ser_nanpercent
del badday_df
del is_allnan_col
del cols_to_fill
gc.collect()
4195

Part 3: Fill Dummy Value Where Nans Are Indicative Features

SMART 22, i.e. Current Helium Level, is not applicable to regular hard drives; that is the reason for the null values in this feature. The feature is important, so it shouldn't be dropped, but it cannot be filled with mean/median/mode either, because it represents a distinct property of a subset of drives. So, a dummy value that does not exist anywhere else in the dataset is used to fill the NaNs.

However, if a nan exists within a helium drive, then it is filled with mean/median/ffill.

NOTE: Some libraries are OK with having NaNs instead of dummy values, but sklearn is not. Hence the dummy filling is needed.

[32]
# serial numbers of all drives where 22 is reported as non nan at least once
nonnan22_serials = (
    clean_df[~clean_df["smart_22_raw"].isna()]["serial_number"].unique().compute()
)  # len = 2334

# of these serial numbers, which ones report at least one nan as well
isanynan22_serials = clean_df[clean_df["serial_number"].isin(nonnan22_serials)][
    ["date", "serial_number", "smart_22_raw"]
]
isanynan22_serials = (
    isanynan22_serials.groupby("serial_number")
    .apply(lambda g: g["smart_22_raw"].isna().any())
    .compute()
)
[########################################] | 100% Completed | 44.6s /home/kachauha/.local/share/virtualenvs/ceph_drive_failure-3-2yJKyM/lib/python3.7/site-packages/ipykernel_launcher.py:6: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected. Before: .apply(func) After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result or: .apply(func, meta=('x', 'f8')) for series result [########################################] | 100% Completed | 43.6s
[33]
# these are the drives that report at least one nan, and are known to be helium drives
# because they have reported non-nan value for smart 22 at least once
helium_nans = isanynan22_serials[isanynan22_serials]
helium_nans
Series([], dtype: bool)
[34]
# get detailed data for these drives
cols = ["date", "serial_number", "failure", "smart_22_raw", "smart_22_normalized"]
tmp = clean_df[clean_df["serial_number"].isin(helium_nans.index)][cols].compute()
[########################################] | 100% Completed | 51.2s
[35]
# on what dates do nan values occur for smart 22 for helium drives
tmp[tmp["smart_22_raw"].isna()]["date"].unique()
print(tmp[tmp["smart_22_raw"].isna()]["date"].unique())
[]
[36]
tmp[["serial_number", "smart_22_raw"]].groupby("serial_number").agg(["mean", "std"])
smart_22_raw
mean std
serial_number
[37]
tmp[["serial_number", "smart_22_normalized"]].groupby("serial_number").agg(
    ["mean", "std"]
)
smart_22_normalized
mean std
serial_number
[77]
clean_df["smart_22_normalized"].value_counts().compute()
[########################################] | 100% Completed | 52.3s
100.0    160200
99.0         19
98.0         16
97.0         11
95.0          7
86.0          7
84.0          6
89.0          5
79.0          5
85.0          5
96.0          5
90.0          4
77.0          4
72.0          4
73.0          4
74.0          4
83.0          4
78.0          4
88.0          3
92.0          3
82.0          3
75.0          3
76.0          2
80.0          2
81.0          2
94.0          2
87.0          2
91.0          1
93.0          1
70.0          1
68.0          1
Name: smart_22_normalized, dtype: int64
[78]
clean_df["smart_22_raw"].value_counts().compute()
[########################################] | 100% Completed | 54.5s
100.0    160200
99.0         19
98.0         16
97.0         11
95.0          7
86.0          7
84.0          6
89.0          5
79.0          5
85.0          5
96.0          5
90.0          4
77.0          4
72.0          4
73.0          4
74.0          4
83.0          4
78.0          4
88.0          3
92.0          3
82.0          3
75.0          3
76.0          2
80.0          2
81.0          2
94.0          2
87.0          2
91.0          1
93.0          1
70.0          1
68.0          1
Name: smart_22_raw, dtype: int64

RESULT: The helium drives that report some nans do so only on one specific date. The mean (and mode) SMART 22 value for these drives is 100, so we replace the nans reported on that day with 100. After this step, all the helium drive nans are taken care of, and we can proceed to fill the rest of the nans with the dummy value.

[86]
clean_df[clean_df["serial_number"].isin(helium_nans.index)][
    "smart_22_raw"
].mean().compute()
[########################################] | 100% Completed | 50.9s
100.0
[83]
(clean_df["date"] == "2018-11-17").sum().compute()
[########################################] | 100% Completed | 52.5s
21598
[19]
# rows which were recorded on the doomsday, and belong to drives that report non null smart 22
is_helium = clean_df["serial_number"].isin(helium_nans.index)
is_doomsday = clean_df["date"] == "2018-11-17"

# replace with mean values within that group
cols_to_fill = ["smart_22_raw", "smart_22_normalized"]
for col in cols_to_fill:
    clean_df[col] = clean_df[col].mask(
        is_helium & is_doomsday, clean_df[is_helium][col].mean()
    )
[41]
# make sure that as of this cell, drives report either all nans or no nans for smart 22
tmp = clean_df[clean_df["serial_number"].isin(nonnan22_serials)][
    ["date", "serial_number", "smart_22_raw"]
]
tmp = tmp.groupby("serial_number").apply(lambda g: g["smart_22_raw"].isna().any())
tmp.any().compute()
[ ] | 0% Completed | 0.0s/home/kachauha/.local/share/virtualenvs/ceph_drive_failure-3-2yJKyM/lib/python3.7/site-packages/ipykernel_launcher.py:3: UserWarning: `meta` is not specified, inferred from partial data. Please provide `meta` if the result is unexpected. Before: .apply(func) After: .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result or: .apply(func, meta=('x', 'f8')) for series result This is separate from the ipykernel package so we can avoid doing imports until [########################################] | 100% Completed | 45.9s
False
[42]
# fill the rest of the nan values with dummy values
cols_to_fill = ["smart_22_raw", "smart_22_normalized"]
for col in cols_to_fill:
    clean_df[col] = clean_df[col].fillna(DUMMY_VALUE)
[43]
# how do things look after this
utils.get_nan_count_percent(clean_df, num_total_datapts).compute().sort_values(
    by="percent", ascending=False
)
[########################################] | 100% Completed | 44.3s
count percent
date 0 0.0
smart_10_normalized 0 0.0
smart_12_normalized 0 0.0
smart_12_raw 0 0.0
smart_22_normalized 0 0.0
smart_22_raw 0 0.0
smart_192_normalized 0 0.0
smart_192_raw 0 0.0
smart_193_normalized 0 0.0
smart_193_raw 0 0.0
smart_194_normalized 0 0.0
smart_194_raw 0 0.0
smart_196_normalized 0 0.0
smart_196_raw 0 0.0
smart_197_normalized 0 0.0
smart_197_raw 0 0.0
smart_198_normalized 0 0.0
smart_198_raw 0 0.0
smart_199_normalized 0 0.0
smart_10_raw 0 0.0
smart_9_raw 0 0.0
serial_number 0 0.0
smart_9_normalized 0 0.0
model 0 0.0
capacity_bytes 0 0.0
failure 0 0.0
smart_1_normalized 0 0.0
smart_1_raw 0 0.0
smart_2_normalized 0 0.0
smart_2_raw 0 0.0
smart_3_normalized 0 0.0
smart_3_raw 0 0.0
smart_4_normalized 0 0.0
smart_4_raw 0 0.0
smart_5_normalized 0 0.0
smart_5_raw 0 0.0
smart_7_normalized 0 0.0
smart_7_raw 0 0.0
smart_8_normalized 0 0.0
smart_8_raw 0 0.0
smart_199_raw 0 0.0
[44]
# clean up garbage
del tmp
del nonnan22_serials
del isanynan22_serials
del helium_nans
gc.collect()
3921

Repartition and Save

The size of the dataset may have changed after cleanup, so repartition for efficiency, then save to disk so that the cleanup does not have to be redone for every experiment. According to the Dask docs, the ideal size of a partition is around 100 MB (source: https://docs.dask.org/en/latest/dataframe-best-practices.html).

[45]
# get ideal number of partitions
PARTITION_SIZE_BYTES = 100 * 10 ** 6
DF_SIZE_BYTES = clean_df.memory_usage(deep=True).sum().compute()
NUM_PARTITIONS = int(np.ceil(DF_SIZE_BYTES / PARTITION_SIZE_BYTES))

# repartition and save cleaned version of data
clean_df = clean_df.repartition(npartitions=NUM_PARTITIONS)

# create save dir as same name as data dir, but suffixed with "_clean" and vendor name
head, tail = os.path.split(os.path.normpath(DATA_DIR))
save_dir = os.path.join(head, tail + "_hgst_clean")

# if the path doesn't exist, mkdir
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# save partitions
clean_df.to_csv(os.path.join(save_dir, "partition_*.csv"), index=False)
[########################################] | 100% Completed | 49.7s [########################################] | 100% Completed | 1min 25.3s
['/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_0.csv',
 '/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_1.csv',
 '/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_2.csv',
 '/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_3.csv',
 '/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_4.csv',
 '/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_5.csv',
 '/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_6.csv',
 '/home/kachauha/Downloads/data_Q3_2018_hgst_clean/partition_7.csv']
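For reference, a downstream notebook could pick the cleaned data back up directly from these partitions; a sketch (the column dtypes would have to be supplied again, e.g. the surviving subset of custom_dtypes):

# hypothetical: re-load the cleaned HGST partitions in a later notebook
clean_dtypes = {k: v for k, v in custom_dtypes.items() if k in clean_df.columns}
reloaded_df = dd.read_csv(os.path.join(save_dir, "partition_*.csv"), dtype=clean_dtypes)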
[46]
# unregister dask compute call progress bar
pbar.unregister()
[ ]