# Notebook for training and testing AI models

## Setup

In [3]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from attention import Attention

In [4]:
print(tf.__version__)

2.8.0


In [5]:
# CONSTANTS
RAW_SLEEP_DATA_PATH = ".data/raw_bed_sleep-state.csv"
CLEANED_SLEEP_DATA_PATH = ".data/clean_bed_sleep-state.csv"

In [6]:
## Parameters and Hyper-parameters
SLEEP_STAGES = 4

## Import Data

### Cleaning Raw Data

In [59]:
import csv
import datetime
import itertools

In [104]:
datetime.datetime.strptime("2022-04-21T10:18:00+02:00","%Y-%m-%dT%H:%M:%S%z") + datetime.timedelta(minutes=1)

datetime.datetime(2022, 4, 21, 10, 19, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)))

In [115]:
def stage_probability(stage, to_test):
    return 1.0 if stage == to_test else 0.0

In [150]:
((start_time - in_bed_time).seconds)/3600

6.833333333333333

In [201]:
cleaned_info = []
date_seen = set()
previous_duration = 60
with open(RAW_SLEEP_DATA_PATH, mode ='r') as raw_file:
    csvFile = csv.reader(raw_file)
    # max_count = 1
    # stuff = set()
    in_bed_time = None
    current_sleep_id = -1
    for index, lines in enumerate(csvFile):
        if index == 0:
            cleaned_info.append([
                "sleep_id",
                "sleep_begin",
                "stage_start",
                "time_since_begin_sec",
                "stage_duration_sec",
                "stage_end", 
                "stage_value",
                "awake_probability",
                "light_probability",
                "deep_probability",
                "rem_probability",
            ])
            continue
        start_time = datetime.datetime.strptime(lines[0],"%Y-%m-%dT%H:%M:%S%z")
        if start_time in date_seen:
            continue
        date_seen.add(start_time)
        if not in_bed_time or in_bed_time > start_time:
            current_sleep_id += 1
            in_bed_time = start_time
        # for duration, stage in enumerate(
        # for offset, (duration, stage) in enumerate(
        #     zip(
        #         # itertools.accumulate(lines[1].strip("[]").split(","), lambda x,y: int(x)+int(y)//60, initial = 0), 
        #         map(int, lines[1].strip("[]").split(","))
        #         map(int, lines[2].strip("[]").split(","))
        #     )
        #         # map(int, lines[2].strip("[]").split(","))
        # ):
        for offset, (duration, stage) in enumerate(zip(map(int, lines[1].strip("[]").split(",")), map(int, lines[2].strip("[]").split(",")))):
            # print(f"{(index, subindex) = }, {duration = }, {stage = }")
            # print(f"{(index, duration) = } {stage = }")
            current_time = start_time + datetime.timedelta(seconds=offset*previous_duration)
            cleaned_info.append([
                current_sleep_id,
                in_bed_time,
                current_time, 
                (current_time - in_bed_time).seconds,
                duration, 
                current_time + datetime.timedelta(seconds=duration), 
                stage,
                stage_probability(0, stage),
                stage_probability(1, stage),
                stage_probability(2, stage),
                stage_probability(3, stage),
            ])
            previous_duration = duration
            # print(f"{(index, subindex) = }, {val = }")
        # print(list())
        # if index >= max_count:
        #     break


In [202]:
with open(CLEANED_SLEEP_DATA_PATH, 'w') as clean_file:
    write = csv.writer(clean_file)
    write.writerows(cleaned_info)
print("Finished Writing Cleaned Data")

Finished Writing Cleaned Data


### Creating DataFrame from clean raw data

In [7]:
# Get the cleaned data
sleep_df_raw = pd.read_csv(CLEANED_SLEEP_DATA_PATH)#, parse_dates=["start", "end"], infer_datetime_format=True)

In [8]:
# Preprocess data: 
#   1. convert to datetime
sleep_df_raw["sleep_begin"] = pd.to_datetime(sleep_df_raw["sleep_begin"], utc=True)
sleep_df_raw["stage_start"] = pd.to_datetime(sleep_df_raw["stage_start"], utc=True)
sleep_df_raw["stage_end"] = pd.to_datetime(sleep_df_raw["stage_end"], utc=True)
#   2. Separate time, hour and minute
#   MAYBE 3. smaller units: int16 or int8 

In [9]:
def get_minute(row, index):
    return row[index].time().minute

def get_hour(row, index):
    return row[index].time().hour

In [10]:
sleep_df_raw["stage_start_hour"] = sleep_df_raw.apply (lambda row: get_hour(row, "stage_start"), axis=1)
sleep_df_raw["stage_start_minute"] = sleep_df_raw.apply (lambda row: get_minute(row, "stage_start"), axis=1)

In [11]:
sleep_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551042 entries, 0 to 551041
Data columns (total 13 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   sleep_id              551042 non-null  int64              
 1   sleep_begin           551042 non-null  datetime64[ns, UTC]
 2   stage_start           551042 non-null  datetime64[ns, UTC]
 3   time_since_begin_sec  551042 non-null  int64              
 4   stage_duration_sec    551042 non-null  int64              
 5   stage_end             551042 non-null  datetime64[ns, UTC]
 6   stage_value           551042 non-null  int64              
 7   awake_probability     551042 non-null  float64            
 8   light_probability     551042 non-null  float64            
 9   deep_probability      551042 non-null  float64            
 10  rem_probability       551042 non-null  float64            
 11  stage_start_hour      551042 non-null  int64        

In [12]:
sleep_df_raw

Unnamed: 0,sleep_id,sleep_begin,stage_start,time_since_begin_sec,stage_duration_sec,stage_end,stage_value,awake_probability,light_probability,deep_probability,rem_probability,stage_start_hour,stage_start_minute
0,0,2022-04-21 08:18:00+00:00,2022-04-21 08:18:00+00:00,0,60,2022-04-21 08:19:00+00:00,0,1.0,0.0,0.0,0.0,8,18
1,0,2022-04-21 08:18:00+00:00,2022-04-21 08:19:00+00:00,60,60,2022-04-21 08:20:00+00:00,0,1.0,0.0,0.0,0.0,8,19
2,0,2022-04-21 08:18:00+00:00,2022-04-21 08:20:00+00:00,120,60,2022-04-21 08:21:00+00:00,0,1.0,0.0,0.0,0.0,8,20
3,0,2022-04-21 08:18:00+00:00,2022-04-21 08:21:00+00:00,180,60,2022-04-21 08:22:00+00:00,0,1.0,0.0,0.0,0.0,8,21
4,0,2022-04-21 08:18:00+00:00,2022-04-21 08:22:00+00:00,240,60,2022-04-21 08:23:00+00:00,0,1.0,0.0,0.0,0.0,8,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
551037,1132,2019-02-11 06:11:00+00:00,2019-02-11 13:17:00+00:00,25560,60,2019-02-11 13:18:00+00:00,1,0.0,1.0,0.0,0.0,13,17
551038,1132,2019-02-11 06:11:00+00:00,2019-02-11 13:18:00+00:00,25620,60,2019-02-11 13:19:00+00:00,1,0.0,1.0,0.0,0.0,13,18
551039,1132,2019-02-11 06:11:00+00:00,2019-02-11 13:19:00+00:00,25680,60,2019-02-11 13:20:00+00:00,1,0.0,1.0,0.0,0.0,13,19
551040,1132,2019-02-11 06:11:00+00:00,2019-02-11 13:20:00+00:00,25740,60,2019-02-11 13:21:00+00:00,1,0.0,1.0,0.0,0.0,13,20


In [13]:
sleep_data = sleep_df_raw[["sleep_id", "stage_start_hour", "stage_start_minute", "awake_probability", "rem_probability","light_probability", "deep_probability"]]
sleep_data.insert(loc=1, column="minutes_since_begin" , value= sleep_df_raw["time_since_begin_sec"]//60)

In [14]:
print(sleep_data.head())
print(sleep_data.info())

   sleep_id  minutes_since_begin  stage_start_hour  stage_start_minute  \
0         0                    0                 8                  18   
1         0                    1                 8                  19   
2         0                    2                 8                  20   
3         0                    3                 8                  21   
4         0                    4                 8                  22   

   awake_probability  rem_probability  light_probability  deep_probability  
0                1.0              0.0                0.0               0.0  
1                1.0              0.0                0.0               0.0  
2                1.0              0.0                0.0               0.0  
3                1.0              0.0                0.0               0.0  
4                1.0              0.0                0.0               0.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551042 entries, 0 to 551041
Data columns (t

In [15]:
sleep_data

Unnamed: 0,sleep_id,minutes_since_begin,stage_start_hour,stage_start_minute,awake_probability,rem_probability,light_probability,deep_probability
0,0,0,8,18,1.0,0.0,0.0,0.0
1,0,1,8,19,1.0,0.0,0.0,0.0
2,0,2,8,20,1.0,0.0,0.0,0.0
3,0,3,8,21,1.0,0.0,0.0,0.0
4,0,4,8,22,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
551037,1132,426,13,17,0.0,0.0,1.0,0.0
551038,1132,427,13,18,0.0,0.0,1.0,0.0
551039,1132,428,13,19,0.0,0.0,1.0,0.0
551040,1132,429,13,20,0.0,0.0,1.0,0.0


## Model Development

### Helper functions and class

In [16]:
TEST_SIZE = 122
VALIDATION_SIZE = 183
TIME_STEP_INPUT = 10 # in minutes
BATCH_SIZE = 32

In [17]:
def training_test_split_by_unique_index(data, index: str, test_size: int = 10):
    test_ids = np.random.choice(data[index].unique(), size = test_size, replace=False)
    return data[~data[index].isin(test_ids)], data[data[index].isin(test_ids)]

In [31]:
# Adapted from https://www.tensorflow.org/tutorials/structured_data/time_series
class WindowGenerator():
    def __init__(self, data, index: str = "sleep_id", input_width: int = TIME_STEP_INPUT, validation_size: int = VALIDATION_SIZE, test_size: int = TEST_SIZE, input_feature_slice: slice = slice(1,100), label_feature_slice: slice = slice(-4,100), generate_data_now: bool = True):
        # Partition data
        self.training, self.testing = training_test_split_by_unique_index(sleep_data, index, test_size)
        self.training, self.validation = training_test_split_by_unique_index(self.training, index, validation_size)

        # Window paramters
        self.input_width = input_width
        self.label_width = 1
        self.shift = 1

        self.total_window_size = self.input_width + self.shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

        self.input_feature_slice = input_feature_slice
        self.label_feature_slice = label_feature_slice

        self.sample_ds = self.make_dataset(sleep_data[sleep_data[index] == 0])

        if generate_data_now:
            self.training_ds = self.make_dataset(self.training)
            self.validation_ds = self.make_dataset(self.validation)
            self.testing_ds = self.make_dataset(self.testing)


    def __repr__(self):
        return "WindowGenerator:\n\t" +'\n\t'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
        ])

    def split_window(self, features):
        inputs = features[:, self.input_slice, self.input_feature_slice]
        labels = features[:, self.labels_slice, self.label_feature_slice]
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])
        return inputs, labels

    def make_dataset(self, data, index_group: str = "sleep_id", sort_by: str = "minutes_since_begin"):
        ds_all = None
        for i_group in data[index_group].unique():
            subset_data = np.array(data[data[index_group] == i_group].sort_values(by=[sort_by]), dtype=np.float32)
            ds = tf.keras.utils.timeseries_dataset_from_array(
                data=subset_data,
                targets=None,
                sequence_length=self.total_window_size,
                sequence_stride=1,
                shuffle=False,
                batch_size=BATCH_SIZE,)
            ds_all = ds if ds_all is None else ds_all.concatenate(ds)
        ds_all = ds_all.map(self.split_window)

        return ds_all

    # def generate_all_datasets(self):
    #     self._training_ds = self.make_dataset(self.training)
    #     self._validation_ds = self.make_dataset(self.validation)
    #     self._testing_ds = self.make_dataset(self.testing)

    # def training_dataset(self):
    #     if self._training_ds is None:
    #         self._training_ds = self.make_dataset(self.training)
    #     return self._training_ds

    # def validation_dataset(self):
    #     if self._validation_ds is None:
    #         self._validation_ds = self.make_dataset(self.validation)
    #     return self._validation_ds

    # def test_dataset(self):
    #     if self._testing_ds is None:
    #         self._testing_ds = self.make_dataset(self.testing)
    #     return self._testing_ds

### Data Prep

All inputs follow: (batch_size, timesteps, input_dim)

In [32]:
wg = WindowGenerator(sleep_data)
wg

WindowGenerator:
	Total window size: 11
	Input indices: [0 1 2 3 4 5 6 7 8 9]
	Label indices: [10]

In [33]:
sample = wg.sample_ds.take(1)

In [34]:
sample_array = list(sample.as_numpy_iterator())

In [37]:
INDEX_TIMESTEP = 18
sample_array[0][0][INDEX_TIMESTEP], sample_array[0][1][INDEX_TIMESTEP]

(array([[18.,  8., 36.,  1.,  0.,  0.,  0.],
        [19.,  8., 37.,  1.,  0.,  0.,  0.],
        [20.,  8., 38.,  1.,  0.,  0.,  0.],
        [21.,  8., 39.,  1.,  0.,  0.,  0.],
        [22.,  8., 40.,  1.,  0.,  0.,  0.],
        [23.,  8., 41.,  1.,  0.,  0.,  0.],
        [24.,  8., 42.,  1.,  0.,  0.,  0.],
        [25.,  8., 43.,  1.,  0.,  0.,  0.],
        [26.,  8., 44.,  1.,  0.,  0.,  0.],
        [27.,  8., 45.,  1.,  0.,  0.,  0.]], dtype=float32),
 array([[0., 0., 1., 0.]], dtype=float32))

### Model 1: LSTM

In [215]:
# Hyper-parameters
LSTM_UNITS = 16
LSTM_LEARNING_RATE = 0.0001

In [278]:
# Model Definition
lstm_model = keras.Sequential()
lstm_model.add(layers.Input(shape=(TIME_STEP_INPUT, 8)))
lstm_model.add(layers.LSTM(LSTM_UNITS, stateful=False, return_sequences=True))
# lstm_model.add(layers.LSTM(LSTM_UNITS, stateful=False, return_sequences=True))
lstm_model.add(layers.Dense(SLEEP_STAGES))
lstm_model.build()
print(lstm_model.summary())

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_14 (LSTM)              (None, 10, 16)            1600      
                                                                 
 dense_9 (Dense)             (None, 10, 4)             68        
                                                                 
Total params: 1,668
Trainable params: 1,668
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Model Training
lstm_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
lstm_optm = keras.optimizers.Adam(learning_rate=LSTM_LEARNING_RATE)

### Model 2: GRU

In [None]:
# Hyper-paramters
GRU_UNITS = 16

In [None]:
gru_model = keras.Sequential([
    layers.GRU(GRU_UNITS),
    layers.Dense(SLEEP_STAGES)
])
gru_model.add(layers.Embedding(input_dim=1000, output_dim=64))
print(gru_model.summary())

### Model 3: Attention Mechanism

In [None]:
ATTENTION_UNITS = 16

In [None]:
am_model = keras.Sequential([
    Attention(ATTENTION_UNITS)
    layers.Dense(SLEEP_STAGES)
])
print(am_model.summary())

### Model Head-to-Head testing

# Scratch