# Import required packages
import random 
import pandas as pd # version 1.4.4
import numpy as np # version 1.21.5
import os
import datetime as dt 
from pytz import FixedOffset # version 2022.1
import io

from scipy.stats import randint, beta, uniform, reciprocal # version 1.9.1
from scipy.signal import find_peaks 
from scipy.integrate import simps 
from scipy.ndimage import median_filter

# Ignore pandas future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin # version 1.2.2
from sklearn.model_selection import (RandomizedSearchCV, GridSearchCV, GroupKFold, ParameterGrid)  
from sklearn.metrics import (balanced_accuracy_score, mean_absolute_error) 
from sklearn.utils.fixes import loguniform 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import (LinearRegression, SGDRegressor)
from sklearn.svm import SVR 
from sklearn.pipeline import Pipeline

from lightgbm import LGBMRegressor # version 3.3.5
import json 
import joblib # version 1.2.0

import multiprocessing  # standard library
# Number of parallel workers for the hyperparameter searches: half of the
# available cores, but never fewer than one. Halving first and clamping
# afterwards matters on a single-core machine, where clamping first
# (int(max(1, 1) / 2)) would give 0, a value joblib rejects for n_jobs.
n_jobs = max(1, multiprocessing.cpu_count() // 2)

# Set the path to the project data directory.
# NOTE: this is an unfilled placeholder -- assign a string before running; the
# loaders below read from the "04_Processed_data" sub-folder of this path.
data_path= # add data path


# Load the TAC readings: keep only the columns used downstream and order the
# rows chronologically within each participant.
TAC_df = pd.read_csv(data_path + "/04_Processed_data/Raw TAC_no negative value.csv")
TAC_df["datetime"] = pd.to_datetime(TAC_df["datetime"])
TAC_df = (
    TAC_df[["participant_id", "datetime", "TAC ug/L(air)"]]
    .sort_values(by=["participant_id", "datetime"])
)
print("TAC data shape:", TAC_df.shape)

# Load the EMA app data (reference standard test): parse both timestamp
# columns, keep the columns used downstream and order by participant and time.
ref_df = pd.read_csv(data_path + "/04_Processed_data/EMA app data ready for model development.csv")
for timestamp_column in ("time", "drinking_timestamp"):
    ref_df[timestamp_column] = pd.to_datetime(ref_df[timestamp_column])
ref_df = (
    ref_df[["participant_id", "time", "drinking_timestamp", "drinking_event", "ema_n_drinks"]]
    .sort_values(by=["participant_id", "time"])
)
print("EMA app data shape:", ref_df.shape)

TAC data shape: (1964713, 3)
EMA app data shape: (2429, 5)


# Build train_df: one row per EMA interval, carrying the lists of TAC
# timestamps and TAC values recorded during the 5 hours starting at `time`.

# Temporary column holding the (exclusive) end of each 5-hour interval
ref_df['time_upper_bound'] = ref_df['time'] + pd.Timedelta(hours=5)

# Pair every TAC reading with every EMA interval of the same participant ...
merged_df = TAC_df.merge(ref_df, on='participant_id', how='right')

# ... and keep only the readings that fall inside their interval [time, time + 5h)
merged_df = merged_df.loc[
    lambda pairs: pairs['datetime'].ge(pairs['time'])
    & pairs['datetime'].lt(pairs['time_upper_bound'])
]

# Collapse the readings of each (participant, interval) into lists
grouped_df = (
    merged_df
    .groupby(['participant_id', 'time'])
    .agg({'datetime': list, 'TAC ug/L(air)': list})
    .reset_index()
)

# Attach the lists to the EMA rows and discard the temporary bound column
train_df = (
    ref_df
    .merge(grouped_df, on=['participant_id', 'time'])
    .drop(columns=['time_upper_bound'])
)


# Preview the first rows of train_df (first column omitted)
train_df.iloc[:, 1:].head(n=5)


# Number of TAC values in each time interval
# Descriptive results for this number were almost the same for 5-hour intervals with/without a drinking event start time
train_df['TAC ug/L(air)'].apply(len).astype('int64').describe()

count    2429.000000
mean      808.856731
std       216.738438
min         1.000000
25%       900.000000
50%       900.000000
75%       900.000000
max       903.000000
Name: TAC ug/L(air), dtype: float64


class DrinkDetector(BaseEstimator, ClassifierMixin, TransformerMixin):
    """A scikit-learn compatible estimator that conducts procedures I and II of the model.

    Procedure I filters each participant's TAC signal (median filter followed
    by a rolling mean). Procedure II runs peak detection on the filtered
    signal; the left base of every detected peak is taken as a model-detected
    drinking event start time (the "index test").

    Parameters
    ----------
    size : int, default=39
        Kernel size of the median filter (``scipy.ndimage.median_filter``).
    window : int, default=206
        Window size of the rolling average applied to the median-filtered signal.
    distance : int, default=265
        Minimum distance between peaks in the peak detection algorithm.
    prominence : float, default=8.0
        Minimum peak prominence.
    wlen : int, default=2202
        Window length used by the peak detection algorithm.
    width : int, default=36
        Minimum peak width.

    Please see the following websites for more details on the parameters:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.median_filter.html
    https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.find_peaks.html
    """

    def __init__(self, size=39, window=206, distance=265, prominence=8.0,
                 wlen=2202, width=36):
        self.size = size
        self.window = window
        self.distance = distance
        self.prominence = prominence
        self.wlen = wlen
        self.width = width

    def fit(self, X, y=None):
        """Return the estimator unchanged; nothing is learned from the data.

        Parameters
        ----------
        X : TAC data collected with Skyn. X is a numpy array with three
            columns "participant_id", "datetime", "TAC ug/L(air)"; the last two
            hold, per row, the list of timestamps and the list of TAC values
            of one time interval.
        y : EMA app/self-reported drinking event start times and number of
            drinks. y is a numpy array with columns for "participant_id",
            "time", "drinking_timestamp", "drinking_event", "ema_n_drinks".

        Returns
        -------
        self
        """
        return self

    def predict(self, X):
        """Filter the TAC signal of every participant and detect its peaks.

        Parameters
        ----------
        X : array-like with the columns "participant_id", "datetime",
            "TAC ug/L(air)" (see ``fit``).

        Returns
        -------
        numpy.ndarray
            One row per detected peak with the columns "participant_id",
            "index_test" (timestamp of the peak's left base), "right_bases"
            (timestamp of the peak's right base), "peak_maximum" (filtered
            signal value at the peak) and "peak_auc" (area under the filtered
            signal between the two bases).
        """
        X = pd.DataFrame(X, columns=["participant_id", "datetime", "TAC ug/L(air)"])

        # Result template; the peaks of each participant are appended to it below.
        y_pred = pd.DataFrame(columns=["participant_id", "index_test", "right_bases",
                                       "peak_maximum", "peak_auc"])

        # Make X in long format with one row per participant TAC timestamp.
        # Columns are accessed by name rather than through the auto-generated
        # tuple field that itertuples() assigns to "TAC ug/L(air)".
        data = [{'participant_id': pid, 'datetime': stamp, 'TAC ug/L(air)': tac}
                for pid, stamps, tacs in zip(X["participant_id"], X["datetime"],
                                             X["TAC ug/L(air)"])
                for stamp, tac in zip(stamps, tacs)]
        if not data:
            # No TAC readings at all: there is nothing to filter, hence no peaks.
            return y_pred.values

        X = pd.DataFrame(data)
        X.index = X.datetime
        X = X[["participant_id", "TAC ug/L(air)"]]

        for participant_id in X["participant_id"].unique():
            # Get data for each participant.
            p_data = X.loc[X["participant_id"] == participant_id].copy()  # participant level TAC data

            # PROCEDURE I (SIGNAL FILTERING)
            ## Median filter
            p_data["medfilt"] = median_filter(p_data["TAC ug/L(air)"], size=self.size)

            ## Moving average on median filter
            p_data["medfilt_rolling"] = p_data["medfilt"].rolling(window=self.window).mean()
            ## Remove missing values created in procedure I
            p_data = p_data.loc[p_data["medfilt_rolling"].notna()]
            filtered = p_data["medfilt_rolling"]

            # PROCEDURE II (PEAK DETECTION)
            peaks, properties = find_peaks(filtered.values,
                                           distance=self.distance,
                                           prominence=(self.prominence, None),
                                           wlen=self.wlen,
                                           width=(self.width, None))

            ## store peak properties: Left bases are the index test, model detected drinking event start times
            index_test = filtered.index[properties["left_bases"]]
            ## get right bases of detected peak too
            rbs = filtered.index[properties["right_bases"]]
            ## Peak heights are read positionally: `peaks` holds positions in
            ## `filtered`, so this yields exactly one value per peak even when
            ## the timestamp index contains repeated labels.
            max_peak = filtered.values[peaks]
            dict_df = {'participant_id': participant_id, 'index_test': index_test,
                       'right_bases': rbs, 'peak_maximum': max_peak}
            peaks_props = pd.DataFrame(dict_df)
            ## AUC: we calculate area under the peak curve and add it to peaks_props
            ## NOTE: `simps` is the legacy name of `scipy.integrate.simpson`;
            ## newer SciPy releases only provide the latter.
            peaks_props["peak_auc"] = [simps(dx=1, y=p_data[it:rb].medfilt_rolling)
                                       for it, rb in zip(peaks_props.index_test, peaks_props.right_bases)]
            y_pred = pd.concat([y_pred, peaks_props])

        y_pred = y_pred.values
        return y_pred

    def main_analysis(self, X, y=None):
        """Match detected peaks to the EMA reference and score the detection.

        Every reported drinking event is matched to the nearest detected peak
        of the same participant within 5 hours (true positive) or left
        unmatched (false negative). Detected peaks that are not matched to any
        reported event are false positives and are attached to the 5-hour
        interval they fall into. One binary true/predicted label pair is
        produced per row of the resulting table.

        Parameters
        ----------
        X : array-like, see ``fit``.
        y : array-like with the columns "participant_id", "time",
            "drinking_timestamp", "drinking_event", "ema_n_drinks". Required,
            despite the ``None`` default kept for scikit-learn compatibility.

        Returns
        -------
        score : float
            Balanced accuracy of ``pred_label`` against ``true_label``.
        TP_FN_TN_FP : pandas.DataFrame
            The per-interval table with "true_label", "pred_label" and the
            properties of the matched peaks (suffix "_TP") and of the false
            positive peaks (suffix "_FP").
        """
        y=pd.DataFrame(y,columns=["participant_id","time","drinking_timestamp","drinking_event","ema_n_drinks"])
        y.participant_id=y.participant_id.astype("int64")
        y.drinking_event=y.drinking_event.astype("int64")
        y.ema_n_drinks=y.ema_n_drinks.astype(float)
        y.sort_values(by="drinking_timestamp",inplace=True) # EMA app recorded drinking start time


        y_pred = self.predict(X)
        y_pred = pd.DataFrame(data=y_pred, columns=["participant_id","index_test","right_bases","peak_maximum","peak_auc"])
        # Express the detected start times with a fixed UTC-04:00 offset
        # (FixedOffset takes minutes).
        y_pred['index_test'] = pd.to_datetime(y_pred['index_test'],
                                              utc=True).dt.tz_convert(FixedOffset(-240))

        y_pred["peak_id"]=np.arange(0,y_pred.shape[0]) # make peak id in y_pred
        y_pred.sort_values(by="index_test",inplace=True)
        y_pred.participant_id=y_pred.participant_id.astype("int64")


        # make a df for true pos. and false neg.: each reported drinking event
        # gets the nearest detected peak of the same participant within 5 hours
        TP_FN=pd.merge_asof(left=y[y.drinking_event ==1 ], right=y_pred,
                            by="participant_id", left_on="drinking_timestamp", right_on="index_test",
                            allow_exact_matches=True,
                            direction="nearest",
                            tolerance=pd.Timedelta("5h"))

        TP_FN["time_difference"]=abs(TP_FN.drinking_timestamp-TP_FN.index_test)
        # for the duplicates, keep the ones with smaller time_difference values
        # To do that, first find duplicate peak_ids with larger time difference
        code_nan=TP_FN.loc[(TP_FN.peak_id.notna())&(TP_FN.duplicated(subset=["peak_id"],keep=False))
                      ].groupby(by="index_test").max()

        if code_nan.shape[0] > 0:
            print(code_nan.shape[0],"duplicates were generated while calculating true pos. false neg.",end='\r')
            # Set them to nan:
            TP_FN.loc[(TP_FN.peak_id.isin(code_nan.peak_id))&
                      (TP_FN.time_difference.isin(code_nan.time_difference)),
                      ['index_test','peak_id']]=np.nan
        # drop time_difference
        TP_FN.drop(columns=["time_difference"],inplace=True)

        # Next, Make a df for true neg and concat it to TP_FN
        TN=y.loc[y.drinking_timestamp.isna()]
        TP_FN_TN=pd.concat([TP_FN,TN])

        # Next, Make a df for false pos. (these are detected peaks not in TP_FN)
        FP=y_pred.loc[~y_pred.peak_id.isin(TP_FN.peak_id)]
        FP=FP.rename(columns={"index_test":"false_p"})

        # merge FP to TP_FN_TN: If more than 1 FP in a 5-hour interval, just one of them is counted
        TP_FN_TN.sort_values(by="time",inplace=True)
        FP.sort_values(by="false_p",inplace=True)
        TP_FN_TN_FP=pd.merge_asof(left=TP_FN_TN, right=FP, suffixes=('_TP', '_FP'),
                                  by="participant_id", left_on="time", right_on="false_p",
                                  allow_exact_matches=True, direction="forward",
                                  tolerance=pd.Timedelta("5h"))

        # Make a true_label and pred_label for scoring
        # (from here on TP/FN/FP/TN are row indexes of TP_FN_TN_FP, not frames)
        TP_FN_TN_FP.rename(columns={"drinking_event":"true_label"},inplace=True)
        TP = TP_FN_TN_FP.query('index_test.notnull() &'
                               'drinking_timestamp.notnull()').index
        FN = TP_FN_TN_FP.query('index_test.isnull() &'
                               'drinking_timestamp.notnull()').index
        FP = TP_FN_TN_FP.query('false_p.notnull() &'
                               'index_test.isnull()').index
        TN = TP_FN_TN_FP.query('index_test.isnull() &'
                               'drinking_timestamp.isnull() &'
                               'false_p.isnull()').index

        TP_FN_TN_FP.loc[TP,"pred_label"]=1 # True pos
        TP_FN_TN_FP.loc[FN,"pred_label"]=0 # False neg
        # if for a drinking interval, there are more than one peak,
        # it is counted as one True positive
        # (the FP assignment runs after the FN one, so a row with an unmatched
        # drinking event but a peak in its interval ends up with pred_label 1)
        TP_FN_TN_FP.loc[FP,"pred_label"]=1 # False pos
        TP_FN_TN_FP.loc[TN,"pred_label"]=0 # True neg.

        # Compute the balanced accuracy from the binary labels and return it
        score = balanced_accuracy_score(TP_FN_TN_FP['true_label'], TP_FN_TN_FP['pred_label'])

        return score, TP_FN_TN_FP

    def score(self, X, y=None):
        """Return the balanced accuracy computed by ``main_analysis``.

        This is the method scikit-learn's search classes call when they are
        created with ``scoring=None``.
        """
        score, _ = self.main_analysis(X, y)
        return score

    def outputs(self, X, y=None):
        """Return the per-interval labels and matched-peak properties.

        Returns
        -------
        pandas.DataFrame
            The columns "participant_id", "true_label", "pred_label",
            "ema_n_drinks", "peak_maximum_TP" and "peak_auc_TP" of the table
            built by ``main_analysis``.
        """
        _, outputs = self.main_analysis(X, y)
        outputs = outputs[["participant_id","true_label","pred_label",
                           "ema_n_drinks","peak_maximum_TP","peak_auc_TP"]]
        return outputs


# Build the arrays handed to the hyperparameter search: TAC intervals (X),
# EMA reference (y) and the participant of each row (groups, used so that a
# participant never appears in both the training and the test fold).
X = train_df[["participant_id", "datetime", "TAC ug/L(air)"]].to_numpy()
y = train_df[
    ["participant_id", "time", "drinking_timestamp", "drinking_event", "ema_n_drinks"]
].to_numpy()
groups = train_df["participant_id"].to_numpy(copy=True)


# Single-step pipeline wrapping the detector (procedures I and II)
pipeline = Pipeline(steps=[('DrinkDetector', DrinkDetector())])

# Hyperparameter distributions for the random search.
# scipy's randint excludes its upper bound, so `size` is drawn from 1..541
# while `window` and `width` are drawn from 1..540.
params = {
    'DrinkDetector__size': randint(low=1, high=542),
    'DrinkDetector__window': randint(low=1, high=541),
    'DrinkDetector__distance': randint(low=180, high=901),
    'DrinkDetector__prominence': beta(a=2, b=2, loc=0, scale=21),
    'DrinkDetector__wlen': randint(low=900, high=4321),
    'DrinkDetector__width': randint(low=1, high=541),
}

# Random search with participant-grouped 5-fold cross-validation
gkf = GroupKFold(n_splits=5)
rand_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=params,
    n_iter=50,
    scoring=None,  # fall back to the estimator's own score method
    cv=gkf,
    verbose=1,
    n_jobs=n_jobs,
    random_state=45,
)

rand_search.fit(X=X, y=y, groups=groups)

print("Best hyperparameters:", rand_search.best_params_)
print("Best score:", rand_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters: {'DrinkDetector__distance': 594, 'DrinkDetector__prominence': 2.3074534860188396, 'DrinkDetector__size': 378, 'DrinkDetector__width': 16, 'DrinkDetector__window': 399, 'DrinkDetector__wlen': 2233}
Best score: 0.8359545390006131


# Cross-validation results of the random search for procedures I and II (first five rows)
pd.DataFrame(rand_search.cv_results_).head(n=5)


# Fit the best pipeline found by the random search on the full data set and
# keep a handle on it as the starting point for the fine-tuning stages.
random_search_best_estimator = rand_search.best_estimator_.fit(X=X)
random_search_best_estimator

Pipeline(steps=[('DrinkDetector',
                 DrinkDetector(distance=594, prominence=2.3074534860188396,
                               size=378, width=16, window=399, wlen=2233))])

Pipeline(steps=[('DrinkDetector',
                 DrinkDetector(distance=594, prominence=2.3074534860188396,
                               size=378, width=16, window=399, wlen=2233))])

DrinkDetector(distance=594, prominence=2.3074534860188396, size=378, width=16,
              window=399, wlen=2233)


# Staged fine-tuning around the random-search optimum: each stage runs a small
# grid search over one group of hyperparameters, and the winner of a stage is
# the starting estimator of the next one.
params_stages = [
    {'DrinkDetector__size': [348,363,378], 'DrinkDetector__window': [349,374,399]},
    {'DrinkDetector__distance': [587,591,594]},
    {'DrinkDetector__prominence': [2.29,2.30,2.31,2.32]},
    {'DrinkDetector__wlen': [2223,2233,2243]},
    {'DrinkDetector__width': [12,14,16]}
]

gkf = GroupKFold(n_splits=5)
best_estimator = random_search_best_estimator
cv_results = []  # cv_results_ of every stage, in stage order
total_fits = 0   # running count of candidate x fold fits over all stages
for i, params in enumerate(params_stages):
    fine_search = GridSearchCV(estimator=best_estimator,
                               param_grid=params, scoring=None,
                               cv=gkf, verbose=1, n_jobs=n_jobs, refit=True)

    num_candidates = len(list(ParameterGrid(params)))
    total_fits += (num_candidates * gkf.n_splits)

    fine_search.fit(X=X, y=y, groups=groups)
    best_estimator = fine_search.best_estimator_
    best_score = fine_search.best_score_
    cv_results.append(fine_search.cv_results_)

    # Report once, after the final stage. The stage index is derived from the
    # list so the summary is still printed if stages are added or removed.
    if i == len(params_stages) - 1:
        print("All fits = ",total_fits)
        print("Best hyperparameters after fine-tuning:", best_estimator)
        print("Best score:", best_score)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
All fits =  110
Best hyperparameters after fine-tuning: Pipeline(steps=[('DrinkDetector',
                 DrinkDetector(distance=587, prominence=2.29, size=348,
                               width=12, window=349, wlen=2223))])
Best score: 0.8433704994653132


# Per-interval labels and matched-peak properties from the fine-tuned detector
DrinkDetector_outputs = best_estimator.named_steps.DrinkDetector.outputs(X=X, y=y)
# Preview five rows starting at position 198 (first column omitted)
DrinkDetector_outputs.iloc[198:, 1:].head(n=5)

1 duplicates were generated while calculating true pos. false neg.


# Confusion table: reference labels in rows, detector labels in columns
pd.crosstab(index=DrinkDetector_outputs["true_label"],
            columns=DrinkDetector_outputs["pred_label"])


# Rebuild the pipeline with the fine-tuned hyperparameters, fit it and
# persist it to disk for procedures I and II.
best_pipeline = Pipeline(steps=[
    ('DrinkDetector', DrinkDetector(
        size=348,
        window=349,
        distance=587,
        prominence=2.29,
        wlen=2223,
        width=12,
    )),
])

best_pipeline.fit(X, y)
joblib.dump(value=best_pipeline, filename='procedure_InII.pkl')

['procedure_InII.pkl']


# Keep the true positives only: peak properties exist only for rows where a
# reported drinking event was matched to a detected peak.
DrinkDetector_outputs = DrinkDetector_outputs.loc[
    lambda rows: (rows["true_label"] == 1) & (rows["pred_label"] == 1)
]
n_true_positives = DrinkDetector_outputs.shape[0]
print(f"There were a total of {n_true_positives} true positice cases.")

There were a total of 148 true positice cases.


# Total number of missing values over all columns (expected to be zero)
DrinkDetector_outputs.isna().to_numpy().sum()

0


# Inputs for procedure III: peak properties as features (X), reported number
# of drinks as target (y), participant ids as cross-validation groups.
X = DrinkDetector_outputs[["peak_maximum_TP", "peak_auc_TP"]].to_numpy(copy=True)
y = DrinkDetector_outputs["ema_n_drinks"].to_numpy(copy=True)
groups = DrinkDetector_outputs["participant_id"].to_numpy(copy=True)


# Candidate regressors for procedure III and the hyperparameter space searched
# for each of them. Keys are prefixed with the pipeline step name, which is
# the model name itself (see find_best_model).
# Reminder: scipy's uniform(loc, scale) covers [loc, loc + scale] and
# randint(low, high) excludes `high`.
models = {
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'LinearRegression__fit_intercept': [True, False],
            'LinearRegression__positive': [True, False]
        }
    },
    'SGDRegressor': {
        'model': SGDRegressor(),
        'params': {
            'SGDRegressor__loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
            'SGDRegressor__penalty':["l2","l1","elasticnet"],
            'SGDRegressor__alpha':loguniform(1e-4, 1e0),
            'SGDRegressor__fit_intercept': [True, False],
            'SGDRegressor__learning_rate':["constant","optimal","invscaling","adaptive"],
            'SGDRegressor__l1_ratio':uniform(0, 1),
            'SGDRegressor__max_iter': [1000, 5000, 10000],
            'SGDRegressor__tol': [1e-3, 1e-4, 1e-5]
        }
    },
    'SVR': {
        'model': SVR(),
        'params': {
            'SVR__kernel': ['rbf', 'sigmoid', 'linear'],
            'SVR__C': reciprocal(0.1, 10),
            'SVR__gamma': ["scale","auto"],
            'SVR__epsilon':uniform(0.1,1),  # i.e. values in [0.1, 1.1]
            'SVR__shrinking':[True, False]
        }
    },
    'LGBMRegressor': {
        'model': LGBMRegressor(),
        'params': {
            'LGBMRegressor__boosting_type':["gbdt","dart","goss"],
            # LightGBM requires num_leaves > 1; starting the range at 2 keeps
            # every sampled candidate trainable instead of failing its fits.
            'LGBMRegressor__num_leaves': randint(low = 2, high=100),
            'LGBMRegressor__max_depth':  randint(low=-1, high=20),
            'LGBMRegressor__learning_rate': uniform(0.01,2),  # i.e. values in [0.01, 2.01]
            'LGBMRegressor__n_estimators': randint(low=1, high=200)
        }
    }
}

def find_best_model(X, y, groups, models):
    """Run a random hyperparameter search per candidate model and keep the best.

    Each candidate is wrapped in a pipeline (standard scaling followed by the
    model, the model step being named after the candidate) and tuned with a
    50-iteration randomized search under participant-grouped 5-fold
    cross-validation, scored by mean absolute error.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Regression target.
    groups : array-like
        Group label of every sample, used by the grouped cross-validation.
    models : dict
        Maps a model name to ``{'model': estimator, 'params': distributions}``;
        parameter names must be prefixed with ``"<model name>__"``.

    Returns
    -------
    best_model : str or None
        Name of the candidate with the lowest cross-validated MAE.
    best_params : dict or None
        Best hyperparameters found for that candidate.
    best_score : float
        Its mean absolute error (positive; lower is better).
    cv_res : dict or None
        ``cv_results_`` of that candidate's search.

    When ``models`` is empty, or no candidate obtains a finite score,
    ``(None, None, numpy.inf, None)`` is returned.
    """
    best_score = np.inf
    best_model = None
    best_params = None
    # Initialised up front so the return statement is valid even when no
    # candidate improves on the initial score.
    cv_res = None

    if not models:
        # Nothing to search.
        return best_model, best_params, best_score, cv_res

    gkf = GroupKFold(n_splits=5)

    for model_name, model_info in models.items():
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            (model_name, model_info['model'])
        ])

        randomized_search = RandomizedSearchCV(
            estimator = pipeline,
            param_distributions = model_info['params'],
            scoring = "neg_mean_absolute_error",
            cv = gkf,
            n_iter = 50,
            random_state = 45, refit = False
        )

        randomized_search.fit(X=X, y=y, groups=groups)

        # The scorer returns the negated MAE; flip the sign to compare errors.
        # A NaN score (every fit of the candidate failed) never compares as
        # smaller, so such a candidate is skipped.
        candidate_score = -randomized_search.best_score_
        if candidate_score < best_score:
            best_score = candidate_score
            best_model = model_name
            best_params = randomized_search.best_params_
            cv_res = randomized_search.cv_results_

    return best_model, best_params, best_score, cv_res

# Find the best model and hyperparameters
# (best_score is the cross-validated mean absolute error of the winning
# model -- find_best_model flips the sign of sklearn's negated MAE -- so
# lower is better)
best_model, best_params, best_score, cv_res = find_best_model(X, y, groups, models)
print(f"Best model: {best_model}")
print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

C:\Users\nhkia\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:305: UserWarning: The total space of parameters 4 is smaller than n_iter=50. Running 4 iterations. For exhaustive searches, use GridSearchCV.
  warnings.warn(

Best model: SVR
Best parameters: {'SVR__C': 0.6716456431507717, 'SVR__epsilon': 1.0648019005876224, 'SVR__gamma': 'scale', 'SVR__kernel': 'sigmoid', 'SVR__shrinking': False}
Best score: 2.201700608754006


# Cross-validation results of the random search for procedure III (first five rows)
pd.DataFrame(cv_res).head(n=5)


# Pipeline rebuilt from the random-search result (values rounded to three decimals)
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('SVR', SVR(
        kernel='sigmoid',
        gamma='scale',
        C=0.672,
        epsilon=1.065,
        shrinking=False,
    )),
])


# Fine-tuning around the best estimator from the random search (procedure III)
reg_params = {
    'SVR__C': [0.067, 0.672],
    'SVR__epsilon': [1.065, 0.065],
    'SVR__gamma': ["scale", "auto"],
    'SVR__kernel': ['sigmoid', 'poly'],
    'SVR__shrinking': [True, False],
}

gkf = GroupKFold(n_splits=5)

fine_search = GridSearchCV(
    estimator=pipeline,
    param_grid=reg_params,
    scoring="neg_mean_absolute_error",
    cv=gkf,
    verbose=1,
    n_jobs=n_jobs,
    refit=True,
)
fine_search.fit(X=X, y=y, groups=groups)

# best_score is the negated mean absolute error here (higher is better)
best_estimator, best_score, cv_res_ = (
    fine_search.best_estimator_,
    fine_search.best_score_,
    fine_search.cv_results_,
)

print("Best hyperparameters after fine-tuning:", best_estimator)
print("Best score:", best_score)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best hyperparameters after fine-tuning: Pipeline(steps=[('scaler', StandardScaler()),
                ('SVR', SVR(C=0.672, epsilon=0.065, kernel='sigmoid'))])
Best score: -2.1887871151690304


# Rebuild the fine-tuned procedure III pipeline, fit it on all true-positive
# cases and persist it to disk.
scaler = StandardScaler()
best_model = SVR(
    kernel='sigmoid',
    gamma='scale',
    C=0.672,
    epsilon=0.065,
    shrinking=True,
)
best_pipeline = Pipeline(steps=[('scaler', scaler), ('model', best_model)])
best_pipeline.fit(X, y)

joblib.dump(value=best_pipeline, filename='procedure_III.pkl')

['procedure_III.pkl']


# Display the fitted procedure III pipeline (notebook cell output)
best_pipeline

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', SVR(C=0.672, epsilon=0.065, kernel='sigmoid'))])

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', SVR(C=0.672, epsilon=0.065, kernel='sigmoid'))])

StandardScaler()

SVR(C=0.672, epsilon=0.065, kernel='sigmoid')

	time	drinking_timestamp	ema_n_drinks	datetime	TAC ug/L(air)
0	2021-03-25 09:00:00-04:00	NaT	NaN	[2021-03-25 11:40:29-04:00, 2021-03-25 11:40:4...	[26.35, 3.14, 0.0, 10.45, 13.17, 12.75, 9.83, ...
1	2021-03-25 14:00:00-04:00	NaT	NaN	[2021-03-25 14:00:14-04:00, 2021-03-25 14:00:3...	[2.3, 2.3, 3.97, 2.09, 3.14, 2.72, 2.09, 2.51,...
2	2021-03-25 19:00:00-04:00	NaT	NaN	[2021-03-25 19:00:14-04:00, 2021-03-25 19:00:3...	[0.84, 0.21, 0.63, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
3	2021-03-26 00:00:00-04:00	NaT	NaN	[2021-03-26 00:00:14-04:00, 2021-03-26 00:00:3...	[12.13, 11.08, 12.34, 13.8, 14.01, 12.75, 12.7...
4	2021-03-26 05:00:00-04:00	NaT	NaN	[2021-03-26 05:00:14-04:00, 2021-03-26 05:00:3...	[0.63, 0.63, 0.84, 0.63, 0.84, 0.63, 0.84, 1.2...

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_DrinkDetector__distance	param_DrinkDetector__prominence	param_DrinkDetector__size	param_DrinkDetector__width	param_DrinkDetector__window	param_DrinkDetector__wlen	params	split0_test_score	split1_test_score	split2_test_score	split3_test_score	split4_test_score	mean_test_score	std_test_score	rank_test_score
0	0.000763	0.000943	5.812212	1.038460	594	2.307453	378	16	399	2233	{'DrinkDetector__distance': 594, 'DrinkDetecto...	0.805026	0.818333	0.837618	0.865300	0.853496	0.835955	0.022085	1
1	0.000000	0.000000	5.619211	0.575981	749	13.280698	487	197	111	3586	{'DrinkDetector__distance': 749, 'DrinkDetecto...	0.761722	0.794444	0.822758	0.695019	0.853604	0.785509	0.054514	25
2	0.003207	0.006414	5.445204	0.607811	338	3.11376	190	208	402	2800	{'DrinkDetector__distance': 338, 'DrinkDetecto...	0.815442	0.785000	0.839881	0.838360	0.863464	0.828429	0.026503	2
3	0.000400	0.000490	4.533497	0.581077	734	14.609955	43	506	153	2932	{'DrinkDetector__distance': 734, 'DrinkDetecto...	0.599611	0.608889	0.631839	0.595053	0.620838	0.611246	0.013559	48
4	0.000000	0.000000	5.235737	0.506070	413	15.516487	204	317	311	3940	{'DrinkDetector__distance': 413, 'DrinkDetecto...	0.751305	0.794444	0.823889	0.753812	0.811451	0.786980	0.029630	24

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	param_SVR__C	param_SVR__epsilon	param_SVR__gamma	param_SVR__kernel	param_SVR__shrinking	params	split0_test_score	split1_test_score	split2_test_score	split3_test_score	split4_test_score	mean_test_score	std_test_score	rank_test_score
0	0.006248	0.007653	0.000000	0.000000	9.506552	0.649545	scale	sigmoid	False	{'SVR__C': 9.506551974984317, 'SVR__epsilon': ...	-33.852418	-20.713941	-18.279950	-35.767277	-18.220137	-25.366745	7.786180	50
1	0.003123	0.006247	0.000000	0.000000	0.774353	0.572808	auto	sigmoid	True	{'SVR__C': 0.7743530119612986, 'SVR__epsilon':...	-1.890978	-2.152339	-1.450243	-4.037363	-1.727136	-2.251612	0.921530	3
2	0.003121	0.006243	0.003128	0.006256	0.537941	0.157238	auto	linear	True	{'SVR__C': 0.5379407995493989, 'SVR__epsilon':...	-2.410833	-2.436668	-1.498158	-3.597182	-1.888605	-2.366289	0.707654	9
3	0.003609	0.003863	0.000804	0.000402	1.996204	1.090722	scale	rbf	False	{'SVR__C': 1.9962036354162338, 'SVR__epsilon':...	-1.935181	-2.266980	-1.962787	-3.998925	-1.896410	-2.412057	0.804282	27
4	0.001795	0.001466	0.000604	0.000802	4.351994	0.34072	scale	rbf	True	{'SVR__C': 4.351994273115294, 'SVR__epsilon': ...	-2.005662	-2.448613	-2.035063	-3.826020	-1.822290	-2.427530	0.728634	34

Model Development

Procedures I and II

Procedure III

	true_label	pred_label	ema_n_drinks	peak_maximum_TP	peak_auc_TP
198	0	0.0	NaN	NaN	NaN
199	0	0.0	NaN	NaN	NaN
200	0	0.0	NaN	NaN	NaN
201	1	1.0	2.0	27.031862	12866.220081
202	1	0.0	6.0	NaN	NaN