# Import required packages
import random
import pandas as pd # version 1.4.4
import numpy as np # version 1.21.5
import os
import datetime as dt
from pytz import FixedOffset # version 2022.1
import io
from scipy.stats import randint, beta, uniform, reciprocal # version 1.9.1
from scipy.signal import find_peaks
from scipy.integrate import simps
from scipy.ndimage import median_filter
# Ignore pandas future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin # version 1.2.2
from sklearn.model_selection import (RandomizedSearchCV, GridSearchCV, GroupKFold, ParameterGrid)
from sklearn.metrics import (balanced_accuracy_score, mean_absolute_error)
from sklearn.utils.fixes import loguniform
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import (LinearRegression, SGDRegressor)
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor # version 3.3.5
import json
import joblib # version 1.2.0
import multiprocessing # version
# Use half of the available cores, but never fewer than one worker.
# The previous int(max(1, n) / 2) evaluated to 0 on a single-core machine,
# and joblib/scikit-learn reject n_jobs=0.
n_jobs = max(1, multiprocessing.cpu_count() // 2)
# set path to data
# TODO: replace "." with the directory that contains the "04_Processed_data"
# folder. The original placeholder (an assignment with no value) was a syntax
# error that prevented the file from being parsed at all.
data_path = "."
Figure 1. Summary of model development
size
in the code). After applying the median filter to the TAC data, we further attempted to remove the remaining noise using a moving average. As with the median filter, the moving average had one hyperparameter that we tuned: the window size (named `window`
in the code). The moving average generates a few missing values. The peak detection algorithm (procedure II of our model) may produce wrong results if there are missing values in the TAC time series ([REF](https://docs.scipy.org/doc/)). All missing TAC values were therefore removed before running the peak detection algorithm. `distance`
: Minimal time difference between neighboring peaks. `prominence`
(minimal required prominence): “The prominence of a peak measures how much a peak stands out from the surrounding baseline of the signal and is defined as the vertical distance between the peak and its lowest contour line.” `width`
(minimal required width): The horizontal width of the peak, in samples. `wlen`
: “A window length in samples that optionally limits the evaluated area for each peak to a subset of timestamps.” `TAC_df`
is a CSV dataset containing the TAC signal data collected from all participants. `ref_df`
is a CSV dataset containing the EMA app data (reference standard test). It has one row per participant per five-hour interval. `train_df`
was created using `TAC_df`
and `ref_df`
. Negative TAC values have already been coded as zeros in the `TAC_df`
dataset.
train_df
has the drinking event start times recorded in the EMA app (reference standard test) as well as TAC signals. Each row in this dataset represents a participant-five hour interval; train_df.shape
is (2429, 6)
. Each participant has on average 29 five-hour intervals.
Columns in train_df
:
1
if the interval (time
) encompasses start of a drinking event (reported with the EMA app) and 0
if it was not a drinking event.NaT
if drinking_event == 0
).NaN
if drinking_event == 0
).train_df
.
datetime
.# Load TAC data
# Read the TAC readings, parse their timestamps, keep only the columns used
# for modelling and order the rows chronologically within each participant.
TAC_df = (
    pd.read_csv(data_path+"/04_Processed_data/Raw TAC_no negative value.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame["datetime"]))
    .loc[:, ["participant_id", "datetime", "TAC ug/L(air)"]]
    .sort_values(by=["participant_id", "datetime"])
)
print("TAC data shape:", TAC_df.shape)
# Load EMA app data (reference standard test)
# Both timestamp columns are parsed before the column selection and sort.
ref_df = (
    pd.read_csv(data_path+"/04_Processed_data/EMA app data ready for model development.csv")
    .assign(
        time=lambda frame: pd.to_datetime(frame["time"]),
        drinking_timestamp=lambda frame: pd.to_datetime(frame["drinking_timestamp"]),
    )
    .loc[:, ["participant_id", "time", "drinking_timestamp", "drinking_event", "ema_n_drinks"]]
    .sort_values(by=["participant_id", "time"])
)
print("EMA app data shape:",ref_df.shape)
TAC data shape: (1964713, 3) EMA app data shape: (2429, 5)
# Build train_df: one row per participant and five-hour EMA interval, with the
# TAC timestamps and readings that fall inside the interval stored as lists.

# Exclusive upper bound of each interval. This helper column is added to
# ref_df itself and is only removed from train_df further down.
ref_df['time_upper_bound'] = ref_df['time'] + pd.Timedelta(hours=5)

# Pair every TAC reading with every interval of the same participant ...
merged_df = TAC_df.merge(ref_df, how='right', on='participant_id')
# ... and keep only the readings that lie inside [time, time + 5h).
merged_df = merged_df.loc[
    lambda pairs: (pairs['datetime'] >= pairs['time'])
    & (pairs['datetime'] < pairs['time_upper_bound'])
]

# Collapse the readings of each (participant, interval) pair into lists.
grouped_df = (
    merged_df
    .groupby(['participant_id', 'time'])
    .agg({'datetime': list, 'TAC ug/L(air)': list})
    .reset_index()
)

# Attach the lists to the EMA rows (inner join: intervals without any TAC
# reading are not kept) and discard the helper column.
train_df = (
    ref_df
    .merge(grouped_df, on=['participant_id', 'time'])
    .drop(columns=['time_upper_bound'])
)
train_df.iloc[:, 1:].head()
time | drinking_timestamp | drinking_event | ema_n_drinks | datetime | TAC ug/L(air) | |
---|---|---|---|---|---|---|
0 | 2021-03-25 09:00:00-04:00 | NaT | 0 | NaN | [2021-03-25 11:40:29-04:00, 2021-03-25 11:40:4... | [26.35, 3.14, 0.0, 10.45, 13.17, 12.75, 9.83, ... |
1 | 2021-03-25 14:00:00-04:00 | NaT | 0 | NaN | [2021-03-25 14:00:14-04:00, 2021-03-25 14:00:3... | [2.3, 2.3, 3.97, 2.09, 3.14, 2.72, 2.09, 2.51,... |
2 | 2021-03-25 19:00:00-04:00 | NaT | 0 | NaN | [2021-03-25 19:00:14-04:00, 2021-03-25 19:00:3... | [0.84, 0.21, 0.63, 0.0, 0.0, 0.0, 0.0, 0.0, 0.... |
3 | 2021-03-26 00:00:00-04:00 | NaT | 0 | NaN | [2021-03-26 00:00:14-04:00, 2021-03-26 00:00:3... | [12.13, 11.08, 12.34, 13.8, 14.01, 12.75, 12.7... |
4 | 2021-03-26 05:00:00-04:00 | NaT | 0 | NaN | [2021-03-26 05:00:14-04:00, 2021-03-26 05:00:3... | [0.63, 0.63, 0.84, 0.63, 0.84, 0.63, 0.84, 1.2... |
# Summary statistics of how many TAC readings each five-hour interval holds.
# (The summary was almost the same for intervals with and without a drinking
# event start time.)
train_df['TAC ug/L(air)'].map(len).astype('int64').describe()
count 2429.000000 mean 808.856731 std 216.738438 min 1.000000 25% 900.000000 50% 900.000000 75% 900.000000 max 903.000000 Name: TAC ug/L(air), dtype: float64
DrinkDetector
. This estimator follows the scikit-learn API requirements (Ref: link).
class DrinkDetector(BaseEstimator, ClassifierMixin, TransformerMixin):
    """A scikit-learn compatible estimator that conducts procedures I and II of the model.

    Procedure I smooths each participant's TAC signal with a median filter
    followed by a moving average. Procedure II runs ``scipy.signal.find_peaks``
    on the smoothed signal; the left base of every detected peak is reported as
    a model-detected drinking event start time (the "index test").

    Parameters
    ----------
    size: median_filter kernel size
    window: rolling average window size
    distance: peak minimum distance in peak detection algorithm
    prominence: peak minimum prominence
    wlen: window length
    width: peak minimum width

    Please see the following website for more details on the parameters:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.median_filter.html
    https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html
    """

    def __init__(self, size=39, window=206, distance=265, prominence=8.0,
                 wlen=2202, width=36):
        self.size = size
        self.window = window
        self.distance = distance
        self.prominence = prominence
        self.wlen = wlen
        self.width = width

    def fit(self, X, y=None):
        """No-op fit: nothing is learned from the data.

        Parameters
        ----------
        X : TAC data collected with Skyn. X is a numpy array with three
            columns "participant_id","datetime","TAC ug/L(air)".
        y : EMA app/self-reported drinking event start times and number of
            drinks. y is a numpy array with columns for "participant_id",
            "time","drinking_timestamp","drinking_event","ema_n_drinks".

        Returns
        -------
        self
        """
        return self

    def predict(self, X):
        """Detect peaks in every participant's smoothed TAC signal.

        Parameters
        ----------
        X : array with the columns "participant_id", "datetime" and
            "TAC ug/L(air)"; the last two hold, per row, a sequence of
            timestamps and the matching sequence of TAC readings.

        Returns
        -------
        numpy array with one row per detected peak and the columns
        "participant_id", "index_test" (timestamp of the left base),
        "right_bases" (timestamp of the right base), "peak_maximum"
        (smoothed TAC value at the peak) and "peak_auc" (Simpson integral of
        the smoothed signal between the two bases, with unit sample spacing).
        """
        peak_columns = ["participant_id", "index_test", "right_bases",
                        "peak_maximum", "peak_auc"]
        X = pd.DataFrame(X, columns=["participant_id", "datetime", "TAC ug/L(air)"])
        # Make X in long format with one row per participant TAC timestamp
        records = [
            {'participant_id': pid, 'datetime': stamp, 'TAC ug/L(air)': tac}
            for pid, stamps, tacs in zip(X["participant_id"], X["datetime"],
                                         X["TAC ug/L(air)"])
            for stamp, tac in zip(stamps, tacs)
        ]
        if not records:
            # No readings at all: return an empty result with the usual columns
            # instead of failing on the column selection below.
            return pd.DataFrame(columns=peak_columns).values
        X = pd.DataFrame(records)
        X.index = X["datetime"]
        X = X[["participant_id", "TAC ug/L(air)"]]

        # One frame of peak properties per participant; concatenated once at
        # the end rather than growing a frame inside the loop.
        per_participant = []
        for participant_id in X["participant_id"].unique():
            # Get data for each participant (participant level TAC data).
            p_data = X.loc[X["participant_id"] == participant_id].copy()

            # PROCEDURE I (SIGNAL FILTERING)
            ## Median filter
            p_data["medfilt"] = median_filter(p_data["TAC ug/L(air)"], size=self.size)
            ## Moving average on median filter
            p_data["medfilt_rolling"] = p_data["medfilt"].rolling(window=self.window).mean()
            ## Remove missing values created in procedure I
            p_data = p_data.loc[p_data["medfilt_rolling"].notna()]
            smoothed = p_data["medfilt_rolling"]

            # PROCEDURE II (PEAK DETECTION)
            peaks, properties = find_peaks(smoothed.values,
                                           distance=self.distance,
                                           prominence=(self.prominence, None),
                                           wlen=self.wlen,
                                           width=(self.width, None))
            ## Left bases are the index test, model detected drinking event start times
            index_test = smoothed.index[properties["left_bases"]]
            ## get right bases of detected peak too
            rbs = smoothed.index[properties["right_bases"]]
            ## Peak heights are read by position, so they always line up with
            ## `peaks` even if a timestamp occurs more than once in the index.
            max_peak = smoothed.values[peaks]
            peaks_props = pd.DataFrame({'participant_id': participant_id,
                                        'index_test': index_test,
                                        'right_bases': rbs,
                                        'peak_maximum': max_peak})
            ## AUC: area under the smoothed curve between the two bases
            ## (label-based slice, both ends included)
            peaks_props["peak_auc"] = [
                simps(dx=1, y=smoothed.loc[it:rb])
                for it, rb in zip(peaks_props.index_test, peaks_props.right_bases)
            ]
            per_participant.append(peaks_props)

        return pd.concat(per_participant).values

    def main_analysis(self, X, y=None):
        """Compare detected peaks with the EMA reference and label every interval.

        Parameters
        ----------
        X : TAC data in the layout expected by ``predict``.
        y : array with the columns "participant_id", "time",
            "drinking_timestamp", "drinking_event" and "ema_n_drinks".

        Returns
        -------
        score : balanced accuracy of ``pred_label`` against ``true_label``.
        TP_FN_TN_FP : DataFrame with one row per reference interval carrying
            ``true_label``, ``pred_label`` and the properties of the matched
            peak (suffix ``_TP``) and of a nearby unmatched peak (suffix
            ``_FP``).
        """
        y = pd.DataFrame(y, columns=["participant_id", "time", "drinking_timestamp",
                                     "drinking_event", "ema_n_drinks"])
        y["participant_id"] = y["participant_id"].astype("int64")
        y["drinking_event"] = y["drinking_event"].astype("int64")
        y["ema_n_drinks"] = y["ema_n_drinks"].astype(float)
        # merge_asof needs the left key sorted (EMA app recorded drinking start time)
        y.sort_values(by="drinking_timestamp", inplace=True)

        y_pred = pd.DataFrame(data=self.predict(X),
                              columns=["participant_id", "index_test", "right_bases",
                                       "peak_maximum", "peak_auc"])
        # NOTE(review): the fixed UTC-04:00 offset presumably matches the offset
        # of the EMA timestamps so that both merge keys share a dtype - confirm.
        y_pred['index_test'] = pd.to_datetime(y_pred['index_test'],
                                              utc=True).dt.tz_convert(FixedOffset(-240))
        y_pred["peak_id"] = np.arange(0, y_pred.shape[0])  # make peak id in y_pred
        y_pred.sort_values(by="index_test", inplace=True)
        y_pred["participant_id"] = y_pred["participant_id"].astype("int64")

        # make a df for true pos. and false neg.: every reported drinking event
        # is paired with the nearest detected peak within five hours, if any
        TP_FN = pd.merge_asof(left=y[y["drinking_event"] == 1], right=y_pred,
                              by="participant_id", left_on="drinking_timestamp",
                              right_on="index_test",
                              allow_exact_matches=True,
                              direction="nearest",
                              tolerance=pd.Timedelta("5h"))
        TP_FN = TP_FN.reset_index(drop=True)
        TP_FN["time_difference"] = (TP_FN["drinking_timestamp"] - TP_FN["index_test"]).abs()

        # A peak may have been paired with several events. Keep, per peak, only
        # the pairing with the smallest time difference and un-match the rest.
        # Ranking within each peak_id avoids blanking pairings of unrelated
        # peaks that merely share a time difference, and also handles peaks
        # that were paired with more than two events.
        matched = TP_FN.loc[TP_FN["peak_id"].notna()].sort_values(
            "time_difference", kind="mergesort")
        surplus = matched.index[matched.duplicated(subset=["peak_id"], keep="first")]
        if len(surplus) > 0:
            print(len(surplus), "duplicates were generated while calculating true pos. false neg.", end='\r')
            # Set them to nan. As before, only the match itself is cleared;
            # right_bases / peak_maximum / peak_auc of the row are left as is.
            TP_FN.loc[surplus, ['index_test', 'peak_id']] = np.nan
        # drop time_difference
        TP_FN.drop(columns=["time_difference"], inplace=True)

        # Next, Make a df for true neg and concat it to TP_FN
        TN = y.loc[y["drinking_timestamp"].isna()]
        TP_FN_TN = pd.concat([TP_FN, TN])

        # Next, Make a df for false pos. (these are detected peaks not in TP_FN)
        FP = y_pred.loc[~y_pred["peak_id"].isin(TP_FN["peak_id"])]
        FP = FP.rename(columns={"index_test": "false_p"})

        # merge FP to TP_FN_TN: If more than 1 FP in a 5-hour interval, just one of them is counted
        TP_FN_TN.sort_values(by="time", inplace=True)
        FP.sort_values(by="false_p", inplace=True)
        TP_FN_TN_FP = pd.merge_asof(left=TP_FN_TN, right=FP, suffixes=('_TP', '_FP'),
                                    by="participant_id", left_on="time", right_on="false_p",
                                    allow_exact_matches=True, direction="forward",
                                    tolerance=pd.Timedelta("5h"))

        # Make a true_label and pred_label for scoring
        TP_FN_TN_FP.rename(columns={"drinking_event": "true_label"}, inplace=True)
        has_peak = TP_FN_TN_FP["index_test"].notnull()
        has_event = TP_FN_TN_FP["drinking_timestamp"].notnull()
        has_false_p = TP_FN_TN_FP["false_p"].notnull()
        TP_FN_TN_FP.loc[has_peak & has_event, "pred_label"] = 1   # True pos
        TP_FN_TN_FP.loc[~has_peak & has_event, "pred_label"] = 0  # False neg
        # An unmatched peak inside the interval marks the interval as positive.
        # This runs after the false-negative assignment on purpose: if for a
        # drinking interval there is more than one peak, it is counted as one
        # True positive.
        TP_FN_TN_FP.loc[has_false_p & ~has_peak, "pred_label"] = 1  # False pos
        TP_FN_TN_FP.loc[~has_peak & ~has_event & ~has_false_p, "pred_label"] = 0  # True neg.

        # Compute the balanced accuracy based on the binary labels and return it
        score = balanced_accuracy_score(TP_FN_TN_FP['true_label'], TP_FN_TN_FP['pred_label'])
        return score, TP_FN_TN_FP

    def score(self, X, y=None):
        """Return the balanced accuracy computed by ``main_analysis``."""
        score, _ = self.main_analysis(X, y)
        return score

    def outputs(self, X, y=None):
        """Return the per-interval labels and matched-peak properties.

        The result holds the columns "participant_id", "true_label",
        "pred_label", "ema_n_drinks", "peak_maximum_TP" and "peak_auc_TP" of
        the frame produced by ``main_analysis``.
        """
        _, outputs = self.main_analysis(X, y)
        outputs = outputs[["participant_id", "true_label", "pred_label",
                           "ema_n_drinks", "peak_maximum_TP", "peak_auc_TP"]]
        return outputs
# Inputs for procedures I and II, taken from train_df as independent arrays:
# X holds the TAC lists, y the EMA reference columns, and groups the
# participant ids that are passed as `groups` to the grouped cross-validation.
X = train_df[["participant_id", "datetime", "TAC ug/L(air)"]].to_numpy(copy=True)
y = train_df[
    ["participant_id", "time", "drinking_timestamp", "drinking_event", "ema_n_drinks"]
].to_numpy(copy=True)
groups = train_df["participant_id"].to_numpy(copy=True)
RandomizedSearchCV
in scikit-learn. All six hyperparameters were integer-valued and there was no conditional hyperparameter. In this search, hyperparameter values were selected randomly within a range of possible values. When possible, the range for a hyperparameter was determined based on subject matter knowledge. For instance, the range for minimal required width was based on the possible minimum number of hours that alcohol can be detected in TAC, or distance was based on the time difference between two consecutive drinking events.GridSearchCV
in scikit-learn. Finetuning was defined as changing the values of only one to two hyperparameters, while holding the values for other hyperparameter constant, to find the best value for the changing hyperparameter (this is also known as staged or sequential grid search). After identifying the most important subspace in random grid search (i.e., best estimator in random grid search), we performed finetuning to further improve the model performance. Finetuning encompassed the following steps.# Create a pipeline
# Single-step pipeline around the detector
pipeline = Pipeline(steps=[('DrinkDetector', DrinkDetector())])

# Hyperparameter distributions for the random search.
# scipy's randint draws from a discrete uniform distribution whose upper bound
# is exclusive, so the effective integer ranges are the ones noted per line.
params = dict(
    DrinkDetector__size=randint(low=1, high=542),        # 1 .. 541
    DrinkDetector__window=randint(low=1, high=541),      # 1 .. 540
    DrinkDetector__distance=randint(low=180, high=901),  # 180 .. 900
    # Beta(2, 2) stretched over the interval [0, 21]
    DrinkDetector__prominence=beta(a=2, b=2, loc=0, scale=21),
    DrinkDetector__wlen=randint(low=900, high=4321),     # 900 .. 4320
    DrinkDetector__width=randint(low=1, high=541),       # 1 .. 540
)

# Random search with participant-grouped 5-fold cross-validation
gkf = GroupKFold(n_splits=5)
rand_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=params,
    n_iter=50,
    scoring=None,  # this would use the score method from the estimator
    cv=gkf,
    verbose=1,
    n_jobs=n_jobs,
    random_state=45,
)
rand_search.fit(X=X, y=y, groups=groups)
print("Best hyperparameters:", rand_search.best_params_)
print("Best score:", rand_search.best_score_)
Fitting 5 folds for each of 50 candidates, totalling 250 fits Best hyperparameters: {'DrinkDetector__distance': 594, 'DrinkDetector__prominence': 2.3074534860188396, 'DrinkDetector__size': 378, 'DrinkDetector__width': 16, 'DrinkDetector__window': 399, 'DrinkDetector__wlen': 2233} Best score: 0.8359545390006131
# First five rows of the cross-validation results from the random search
# for procedures I and II
pd.DataFrame(rand_search.cv_results_).iloc[:5]
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_DrinkDetector__distance | param_DrinkDetector__prominence | param_DrinkDetector__size | param_DrinkDetector__width | param_DrinkDetector__window | param_DrinkDetector__wlen | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000763 | 0.000943 | 5.812212 | 1.038460 | 594 | 2.307453 | 378 | 16 | 399 | 2233 | {'DrinkDetector__distance': 594, 'DrinkDetecto... | 0.805026 | 0.818333 | 0.837618 | 0.865300 | 0.853496 | 0.835955 | 0.022085 | 1 |
1 | 0.000000 | 0.000000 | 5.619211 | 0.575981 | 749 | 13.280698 | 487 | 197 | 111 | 3586 | {'DrinkDetector__distance': 749, 'DrinkDetecto... | 0.761722 | 0.794444 | 0.822758 | 0.695019 | 0.853604 | 0.785509 | 0.054514 | 25 |
2 | 0.003207 | 0.006414 | 5.445204 | 0.607811 | 338 | 3.11376 | 190 | 208 | 402 | 2800 | {'DrinkDetector__distance': 338, 'DrinkDetecto... | 0.815442 | 0.785000 | 0.839881 | 0.838360 | 0.863464 | 0.828429 | 0.026503 | 2 |
3 | 0.000400 | 0.000490 | 4.533497 | 0.581077 | 734 | 14.609955 | 43 | 506 | 153 | 2932 | {'DrinkDetector__distance': 734, 'DrinkDetecto... | 0.599611 | 0.608889 | 0.631839 | 0.595053 | 0.620838 | 0.611246 | 0.013559 | 48 |
4 | 0.000000 | 0.000000 | 5.235737 | 0.506070 | 413 | 15.516487 | 204 | 317 | 311 | 3940 | {'DrinkDetector__distance': 413, 'DrinkDetecto... | 0.751305 | 0.794444 | 0.823889 | 0.753812 | 0.811451 | 0.786980 | 0.029630 | 24 |
best_estimator_
from random grid search.
# Take the best pipeline found by the random search; Pipeline.fit returns the
# pipeline itself, so the name refers to the same fitted object as before.
random_search_best_estimator = rand_search.best_estimator_
random_search_best_estimator.fit(X)
random_search_best_estimator
Pipeline(steps=[('DrinkDetector', DrinkDetector(distance=594, prominence=2.3074534860188396, size=378, width=16, window=399, wlen=2233))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('DrinkDetector', DrinkDetector(distance=594, prominence=2.3074534860188396, size=378, width=16, window=399, wlen=2233))])
DrinkDetector(distance=594, prominence=2.3074534860188396, size=378, width=16, window=399, wlen=2233)
# Staged (sequential) grid search: each stage varies one or two
# hyperparameters while all others keep the values of the best estimator
# found so far; the winner of a stage is the starting point of the next one.
params_stages = [
    {'DrinkDetector__size': [348,363,378], 'DrinkDetector__window': [349,374,399]},
    {'DrinkDetector__distance': [587,591,594]},
    {'DrinkDetector__prominence': [2.29,2.30,2.31,2.32]},
    {'DrinkDetector__wlen': [2223,2233,2243]},
    {'DrinkDetector__width': [12,14,16]}
]
gkf = GroupKFold(n_splits=5)
best_estimator = random_search_best_estimator
best_score = None  # stays None only when params_stages is empty
cv_results = []
total_fits = 0
# The loop variable is deliberately not called `params`, so the random-search
# distributions stored under that name are not overwritten.
for stage_grid in params_stages:
    fine_search = GridSearchCV(estimator=best_estimator,
                               param_grid=stage_grid, scoring=None,
                               cv=gkf, verbose=1, n_jobs=n_jobs, refit=True)
    total_fits += len(ParameterGrid(stage_grid)) * gkf.n_splits
    fine_search.fit(X=X, y=y, groups=groups)
    best_estimator = fine_search.best_estimator_
    best_score = fine_search.best_score_
    cv_results.append(fine_search.cv_results_)
# Report once after the last stage, however many stages are configured
# (previously this was tied to the hard-coded stage index 4).
print("All fits = ",total_fits)
print("Best hyperparameters after fine-tuning:", best_estimator)
print("Best score:", best_score)
Fitting 5 folds for each of 9 candidates, totalling 45 fits Fitting 5 folds for each of 3 candidates, totalling 15 fits Fitting 5 folds for each of 4 candidates, totalling 20 fits Fitting 5 folds for each of 3 candidates, totalling 15 fits Fitting 5 folds for each of 3 candidates, totalling 15 fits All fits = 110 Best hyperparameters after fine-tuning: Pipeline(steps=[('DrinkDetector', DrinkDetector(distance=587, prominence=2.29, size=348, width=12, window=349, wlen=2223))]) Best score: 0.8433704994653132
best_estimator
# ...and use this in procedure III.
# (The sentence fragment above was fused onto the assignment below, which made
# the line invalid Python; it is kept here as a comment.)
DrinkDetector_outputs = best_estimator.named_steps['DrinkDetector'].outputs(X, y)
DrinkDetector_outputs.iloc[198:,1:].head()
1 duplicates were generated while calculating true pos. false neg.
true_label | pred_label | ema_n_drinks | peak_maximum_TP | peak_auc_TP | |
---|---|---|---|---|---|
198 | 0 | 0.0 | NaN | NaN | NaN |
199 | 0 | 0.0 | NaN | NaN | NaN |
200 | 0 | 0.0 | NaN | NaN | NaN |
201 | 1 | 1.0 | 2.0 | 27.031862 | 12866.220081 |
202 | 1 | 0.0 | 6.0 | NaN | NaN |
# Cross-tabulate reference labels (rows) against predicted labels (columns)
pd.crosstab(index=DrinkDetector_outputs["true_label"],
            columns=DrinkDetector_outputs["pred_label"])
pred_label | 0.0 | 1.0 |
---|---|---|
true_label | ||
0 | 2064 | 170 |
1 | 47 | 148 |
# Rebuild the pipeline with the fine-tuned hyperparameters written out
# explicitly, fit it and persist it to disk.
best_pipeline = Pipeline(steps=[
    (
        'DrinkDetector',
        DrinkDetector(size=348, window=349, distance=587,
                      prominence=2.29, wlen=2223, width=12),
    ),
])
best_pipeline.fit(X, y)
joblib.dump(best_pipeline, 'procedure_InII.pkl')
['procedure_InII.pkl']
DrinkDetector_outputs
which was created in previous procedure. DrinkDetector_outputs
y
in procedure III.X
in procedure III.
# Keep only the true positives: peak properties are only available for
# intervals that are positive in both the reference and the prediction.
DrinkDetector_outputs = DrinkDetector_outputs.loc[
    lambda out: (out["true_label"] == 1) & (out["pred_label"] == 1)
]
print(f"There were a total of {DrinkDetector_outputs.shape[0]} true positice cases.")
There were a total of 148 true positice cases.
# Total number of missing cells across the whole frame (expected to be zero)
DrinkDetector_outputs.isna().to_numpy().sum()
0
# Inputs for procedure III: the two peak properties are the features, the
# EMA-reported number of drinks is the target, and participant ids are passed
# as `groups` to the grouped cross-validation.
X = DrinkDetector_outputs[["peak_maximum_TP", "peak_auc_TP"]].to_numpy(copy=True)
y = DrinkDetector_outputs["ema_n_drinks"].to_numpy(copy=True)
groups = DrinkDetector_outputs["participant_id"].to_numpy(copy=True)
# Candidate regressors for procedure III together with the hyperparameter
# space sampled for each of them. Every parameter key is prefixed with the
# pipeline step name, which is the same string as the key of the entry.
# Note: scipy's uniform(loc, scale) covers [loc, loc + scale] and randint
# excludes its upper bound.
models = {
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'LinearRegression__fit_intercept': [True, False],
            'LinearRegression__positive': [True, False]
        }
    },
    'SGDRegressor': {
        'model': SGDRegressor(),
        'params': {
            'SGDRegressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
            'SGDRegressor__penalty':["l2","l1","elasticnet"],
            'SGDRegressor__alpha':loguniform(1e-4, 1e0),
            'SGDRegressor__fit_intercept': [True, False],
            'SGDRegressor__learning_rate':["constant","optimal","invscaling","adaptive"],
            'SGDRegressor__l1_ratio':uniform(0, 1),
            'SGDRegressor__max_iter': [1000, 5000, 10000],
            'SGDRegressor__tol': [1e-3, 1e-4, 1e-5]
        }
    },
    'SVR': {
        'model': SVR(),
        'params': {
            'SVR__kernel': ['rbf', 'sigmoid', 'linear'],
            'SVR__C': reciprocal(0.1, 10),
            'SVR__gamma': ["scale","auto"],
            'SVR__epsilon':uniform(0.1,1),  # i.e. values in [0.1, 1.1]
            'SVR__shrinking':[True, False]
        }
    },
    'LGBMRegressor': {
        'model': LGBMRegressor(),
        'params': {
            'LGBMRegressor__boosting_type':["gbdt","dart","goss"],
            # LightGBM requires num_leaves > 1, so the range starts at 2; with
            # low=1 the search could draw an invalid candidate whose fits fail.
            # This changes which LightGBM candidates a fixed random_state draws.
            'LGBMRegressor__num_leaves': randint(low = 2, high=100),
            'LGBMRegressor__max_depth': randint(low=-1, high=20),
            'LGBMRegressor__learning_rate': uniform(0.01,2),  # i.e. values in [0.01, 2.01]
            'LGBMRegressor__n_estimators': randint(low=1, high=200)
        }
    }
}
def find_best_model(X, y, groups, models, n_iter=50, n_splits=5, random_state=45):
    """Run a random hyperparameter search per model and return the best one.

    Every entry of ``models`` is wrapped in a pipeline behind a
    ``StandardScaler`` and searched with ``RandomizedSearchCV`` using grouped
    k-fold cross-validation and mean absolute error as the criterion.

    Parameters
    ----------
    X, y, groups : passed unchanged to ``RandomizedSearchCV.fit``.
    models : dict mapping a model name to a dict with the keys ``'model'``
        (the estimator) and ``'params'`` (its search space, with keys prefixed
        by ``<model name>__``).
    n_iter : number of sampled candidates per model.
    n_splits : number of folds of the grouped cross-validation.
    random_state : seed of the random search.

    Returns
    -------
    (best_model, best_params, best_score, cv_res) : name of the model with the
        lowest cross-validated mean absolute error, its parameters, that error
        and the ``cv_results_`` of its search. If ``models`` is empty or no
        search yields a finite score, ``(None, None, np.inf, None)`` is
        returned instead of failing on an unassigned variable.
    """
    best_score = np.inf
    best_model = None
    best_params = None
    cv_res = None
    for model_name, model_info in models.items():
        search = RandomizedSearchCV(
            estimator=Pipeline([
                ('scaler', StandardScaler()),
                (model_name, model_info['model'])
            ]),
            param_distributions=model_info['params'],
            scoring="neg_mean_absolute_error",
            cv=GroupKFold(n_splits=n_splits),
            n_iter=n_iter,
            random_state=random_state,
            refit=False
        )
        search.fit(X=X, y=y, groups=groups)
        # The scorer returns the negated error; flip the sign to compare MAEs.
        candidate_mae = -search.best_score_
        if candidate_mae < best_score:
            best_score = candidate_mae
            best_model = model_name
            best_params = search.best_params_
            cv_res = search.cv_results_
    return best_model, best_params, best_score, cv_res
# Search every candidate regressor and report the one with the lowest
# cross-validated error.
best_model, best_params, best_score, cv_res = find_best_model(
    X=X, y=y, groups=groups, models=models
)
print(f"Best model: {best_model}")
print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")
C:\Users\nhkia\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:305: UserWarning: The total space of parameters 4 is smaller than n_iter=50. Running 4 iterations. For exhaustive searches, use GridSearchCV. warnings.warn(
Best model: SVR Best parameters: {'SVR__C': 0.6716456431507717, 'SVR__epsilon': 1.0648019005876224, 'SVR__gamma': 'scale', 'SVR__kernel': 'sigmoid', 'SVR__shrinking': False} Best score: 2.201700608754006
# First five rows of the cross-validation results of the best model's random
# search in procedure III
pd.DataFrame(cv_res).iloc[:5]
mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_SVR__C | param_SVR__epsilon | param_SVR__gamma | param_SVR__kernel | param_SVR__shrinking | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | mean_test_score | std_test_score | rank_test_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.006248 | 0.007653 | 0.000000 | 0.000000 | 9.506552 | 0.649545 | scale | sigmoid | False | {'SVR__C': 9.506551974984317, 'SVR__epsilon': ... | -33.852418 | -20.713941 | -18.279950 | -35.767277 | -18.220137 | -25.366745 | 7.786180 | 50 |
1 | 0.003123 | 0.006247 | 0.000000 | 0.000000 | 0.774353 | 0.572808 | auto | sigmoid | True | {'SVR__C': 0.7743530119612986, 'SVR__epsilon':... | -1.890978 | -2.152339 | -1.450243 | -4.037363 | -1.727136 | -2.251612 | 0.921530 | 3 |
2 | 0.003121 | 0.006243 | 0.003128 | 0.006256 | 0.537941 | 0.157238 | auto | linear | True | {'SVR__C': 0.5379407995493989, 'SVR__epsilon':... | -2.410833 | -2.436668 | -1.498158 | -3.597182 | -1.888605 | -2.366289 | 0.707654 | 9 |
3 | 0.003609 | 0.003863 | 0.000804 | 0.000402 | 1.996204 | 1.090722 | scale | rbf | False | {'SVR__C': 1.9962036354162338, 'SVR__epsilon':... | -1.935181 | -2.266980 | -1.962787 | -3.998925 | -1.896410 | -2.412057 | 0.804282 | 27 |
4 | 0.001795 | 0.001466 | 0.000604 | 0.000802 | 4.351994 | 0.34072 | scale | rbf | True | {'SVR__C': 4.351994273115294, 'SVR__epsilon': ... | -2.005662 | -2.448613 | -2.035063 | -3.826020 | -1.822290 | -2.427530 | 0.728634 | 34 |
# Pipeline seeded with the (rounded) best SVR settings from the random search
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    (
        'SVR',
        SVR(kernel='sigmoid', gamma='scale', C=0.672, epsilon=1.065,
            shrinking=False),
    ),
])

# Fine-tuning: exhaustive grid around the best estimator from the random search
reg_params = dict(
    SVR__C=[0.067, 0.672],
    SVR__epsilon=[1.065, 0.065],
    SVR__gamma=["scale", "auto"],
    SVR__kernel=['sigmoid', 'poly'],
    SVR__shrinking=[True, False],
)
gkf = GroupKFold(n_splits=5)
fine_search = GridSearchCV(
    estimator=pipeline,
    param_grid=reg_params,
    scoring="neg_mean_absolute_error",
    cv=gkf,
    verbose=1,
    n_jobs=n_jobs,
    refit=True,
)
fine_search.fit(X=X, y=y, groups=groups)

# Keep the refitted winner, its score (negated mean absolute error) and the
# full cross-validation table.
best_estimator = fine_search.best_estimator_
best_score = fine_search.best_score_
cv_res_ = fine_search.cv_results_
print("Best hyperparameters after fine-tuning:", best_estimator)
print("Best score:", best_score)
Fitting 5 folds for each of 32 candidates, totalling 160 fits Best hyperparameters after fine-tuning: Pipeline(steps=[('scaler', StandardScaler()), ('SVR', SVR(C=0.672, epsilon=0.065, kernel='sigmoid'))]) Best score: -2.1887871151690304
# Final procedure III pipeline: a scaler followed by an SVR with the
# fine-tuned hyperparameters written out explicitly; fitted on all true
# positive cases and persisted to disk.
best_model = SVR(kernel='sigmoid', gamma='scale', C=0.672, epsilon=0.065,
                 shrinking=True)
scaler = StandardScaler()
best_pipeline = Pipeline(steps=[
    ('scaler', scaler),
    ('model', best_model),
])
best_pipeline.fit(X, y)
joblib.dump(best_pipeline, 'procedure_III.pkl')
['procedure_III.pkl']
</span>
best_pipeline
Pipeline(steps=[('scaler', StandardScaler()), ('model', SVR(C=0.672, epsilon=0.065, kernel='sigmoid'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('scaler', StandardScaler()), ('model', SVR(C=0.672, epsilon=0.065, kernel='sigmoid'))])
StandardScaler()
SVR(C=0.672, epsilon=0.065, kernel='sigmoid')