-------------
1. DATE TRANSFORM
# parse the date strings into actual datetime objects
# set the date as the index; inplace=True applies the change to the DataFrame itself
# sort the index so the measurements are ordered by time
import pandas as pd

features_df['Datetime'] = pd.to_datetime(features_df['Datetime'])
features_df.set_index('Datetime', inplace=True)
features_df.sort_index(inplace=True)
features_df
-------------
2. MERGING DATASETS
# merge the two datasets into a new one by their indexes
df = pd.merge(left=features_df, right=target_df, right_index=True, left_index=True)
# merge the two datasets into a new one by a column from each
df = pd.merge(features_df, target_df, left_on='features_col', right_on='target_col')
-------------
3. INTERPOLATION
# linear interpolation fills missing values by drawing a straight line between the
# nearest valid neighbours (for a single missing value that is just their mean).
# effective for temperatures, or anything else that changes smoothly over time
for feature in features:
    df[feature] = df[feature].interpolate(method='linear')
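A quick sanity check on made-up numbers (not from the original notes) shows the straight-line fill:
s = pd.Series([1.0, None, None, 4.0])
s.interpolate(method='linear')  # -> 1.0, 2.0, 3.0, 4.0, not just a neighbour average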
--------------
4. GROUP BY 30 MIN
# the rows are 10 minutes apart; grouping into 30-minute buckets averages every 3 rows into 1
df = df.groupby(pd.Grouper(freq="30min")).mean()
df
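An equivalent, more common shorthand for the same 30-minute averaging (not from the original notes):
df = df.resample("30min").mean()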
----------------
5. CORRELATION MATRIX
# heatmap of pairwise correlations between all numeric columns
import matplotlib.pyplot as plt
import seaborn as sns

corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True)
plt.show()
---------------
6. LINEPLOT
# when the index is a date, a line plot of a column shows how its values change over time
sns.lineplot(data=df['Temperature'])
--------------
7. LAG CREATION
# for each column and each lag in the given range, create a lagged copy, then drop
# the rows with missing values. shift(lag) moves a column down by lag rows, so each
# row receives the value from lag steps earlier; the first 5 rows become NaN and are dropped.
for col in df.columns:
    for lag in range(1, 6):
        df[f'{col}_{lag}'] = df[col].shift(lag)
df.dropna(axis=0, inplace=True)
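A minimal illustration of shift on made-up numbers (not from the original notes):
s = pd.Series([10, 20, 30, 40])
s.shift(1)  # -> NaN, 10, 20, 30 : each row now holds the previous value
s.shift(2)  # -> NaN, NaN, 10, 20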
------------------
8. TRAIN TEST SPLIT
# X gets every column except the target, y gets the target column.
# in this notebook only the lag columns were kept as features; that is standard for
# forecasting, because the model may only see past values (same-timestep columns
# would leak the answer). See the sketch after this section for selecting them.
# shuffle=False preserves the time order, which matters for time series.
from sklearn.model_selection import train_test_split

X = df.drop('PowerConsumption', axis=1)
y = df['PowerConsumption']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
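A sketch of keeping only the lag columns (assuming the '_1'..'_5' suffixes created in section 7):
lag_cols = [c for c in df.columns if c.rsplit('_', 1)[-1] in {'1', '2', '3', '4', '5'}]
X = df[lag_cols]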
---------------------
9. SCALING
# always fit_transform only on the train data and just transform the test data,
# so no information from the test set leaks into the scaler.
# if the target is categorical it will already be label-encoded, so it needs no scaling.
# y_train is 1D, so it has to be reshaped into a column vector for MinMaxScaler to work.
# a separate scaler is kept for y so it can be inverse-transformed later (section 16).
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))
----------------------
10. RESHAPE BEFORE LSTM (samples, lags, features)
# reshape to: (number of rows, number of lags per feature, number of features lagged)
n_lags = 5
X_train = X_train.reshape((X_train.shape[0], n_lags, X_train.shape[1] // n_lags))
X_test = X_test.reshape((X_test.shape[0], n_lags, X_test.shape[1] // n_lags))
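A concrete example with made-up shapes (not from the original notes):
# if X_train.shape == (1000, 15) and n_lags == 5, the reshape gives (1000, 5, 3),
# i.e. 3 original features, each carrying its 5 most recent lagged values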
-------------------------
11. LSTM MODEL DEFINITION
# the Input layer only needs the shape (number of lags, number of lagged features).
# LSTM layers follow, each with a neuron count and an activation function; every LSTM
# layer except the last needs return_sequences=True so the next one receives a sequence.
# a Dense layer finishes the model: 1 unit for regression, and activation='linear'
# outputs the raw value. Classification uses a different head (see the sketch after section 12).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense

model = Sequential([
    Input((X_train.shape[1], X_train.shape[2],)),
    LSTM(64, activation="relu", return_sequences=True),
    LSTM(32, activation="relu"),
    Dense(1, activation="linear")
])
------------------
12. MODEL COMPILE
# mandatory after the model definition; adam is a solid default optimizer.
# for classification, swap the loss function and metrics accordingly.
model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["mean_squared_error"],
)
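For classification only the output layer and the compile call change; a minimal sketch (not part of the original notebook, num_classes is hypothetical):
# last layer: Dense(num_classes, activation="softmax") instead of Dense(1, activation="linear")
# compile:    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])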
---------------
13. MODEL TRAIN
# the result is conventionally named history; it stores the per-epoch losses and metrics.
# shuffle=False again, so the training batches keep their time order.
history = model.fit(X_train, y_train, validation_split=0.20, epochs=16, batch_size=64, shuffle=False)
------------------------
14. PLOT LOSS FUNCTION
# plot training vs. validation loss per epoch to spot over- or underfitting
sns.lineplot(history.history["loss"], label="loss")
sns.lineplot(history.history["val_loss"], label="val_loss")
plt.legend()
plt.show()
-------------------------
15. MODEL PREDICTION
y_pred = model.predict(X_test)
---------------------------------
16. INVERSE SCALE TRANSFORM
# the model was trained on scaled X_train and y_train, so y_pred comes out scaled
# and we want to map it back to the original units before evaluating.
y_pred = y_scaler.inverse_transform(y_pred)
--------------------------
17. EVALUATION METRIC
# for regression use MAE, MSE, R2; for classification use accuracy, recall, F1, etc.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2: {r2}")
---------------
18. XGB MODEL
# with XGBoost, don't scale the target and skip the LSTM reshape: XGBoost expects
# plain 2D input (samples, features), and tree models don't need scaled data.
from xgboost import XGBRegressor

xgb_model = XGBRegressor(n_estimators=30).fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
---------------
19. GRID SEARCH
# try every combination of the given params, with time-series-aware cross-validation
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid={
        "n_estimators": [15, 20, 25, 30, 35, 40],
        "max_depth": [2, 3, 4, 5, 6, 7]
    },
    cv=TimeSeriesSplit(n_splits=5)
)
grid_search.fit(X_train, y_train)
# reports the best combination among the params given
grid_search.best_params_
# then create a new model with the best parameters and evaluate it, as sketched below
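A minimal sketch of that refit step (reusing the variable names from the sections above):
best_model = XGBRegressor(**grid_search.best_params_).fit(X_train, y_train)
y_pred = best_model.predict(X_test)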
# plot the actual values against the predictions
sns.lineplot(x=y_test.index, y=y_test.values, color='red')
sns.lineplot(x=y_test.index, y=y_pred, color='green')