[VNP] A little bit of everything

-------------
1. DATE TRANSFORM
# transform the date string into an actual datetime
# set the date as the index; inplace=True means the change is applied to the DataFrame itself
# sort the index, so the measurements are ordered by time
features_df['Datetime'] = pd.to_datetime(features_df['Datetime'])
features_df.set_index('Datetime', inplace=True)
features_df.sort_index(inplace=True)
features_df
-------------
2. MERGING DATASETS
# merge the two datasets into a new one by their indexes
df = pd.merge(left=features_df, right=target_df, right_index=True, left_index=True)
# merge the two datasets into a new one on a column from each
df = pd.merge(features_df, target_df, left_on='features_col', right_on='target_col')
-------------
3. INTERPOLATION
# linear interpolation fills a missing value by drawing a straight line between its nearest known neighbours
# effective for temperatures, or anything else that varies smoothly over time
for feature in features:
  df[feature] = df[feature].interpolate(method='linear')
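A minimal sketch of what the linear method does, on a made-up series:
s = pd.Series([10.0, None, None, 16.0])
s.interpolate(method='linear')  # fills the gap evenly: 10.0, 12.0, 14.0, 16.0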
--------------
4. GROUP BY 30 MIN
# the rows are 10 minutes apart; in the new dataset we group every 3 consecutive rows into 1 row with their average values
df = df.groupby(pd.Grouper(freq="30min")).mean()
df
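Since the index is a DatetimeIndex, resample does the same 30-minute averaging and is arguably more idiomatic:
df = df.resample("30min").mean()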
----------------
5. CORRELATION MATRIX
corr_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True)
plt.show()
---------------
6. LINEPLOT
# when the index is a date, we can line plot a column to see how its values change over time
sns.lineplot(df['Temperature'])
--------------
7. LAG CREATION
# for each column and for the given range, create lag columns, then drop the rows with missing values
# this time every column gets lags: shift(lag) moves a column down by `lag` rows, so each row carries the values from 1 to 5 time steps earlier
for col in df.columns:
  for lag in range(1, 6):
    df[f'{col}_{lag}'] = df[col].shift(lag)
df.dropna(axis=0, inplace=True)
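A minimal demo of what shift does, on a made-up series:
s = pd.Series([1, 2, 3, 4])
s.shift(1)  # NaN, 1.0, 2.0, 3.0 -- each row now sees the previous row's value
s.shift(2)  # NaN, NaN, 1.0, 2.0 -- and the value from two rows back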
------------------
8. TRAIN TEST SPLIT
# X gets every column except the target, and y gets the target column
# here only the lag columns are kept as features: at prediction time we only know past values, so the current row's other measurements would leak information
# shuffle=False keeps the rows in time order, which matters for time series
from sklearn.model_selection import train_test_split
X = df.drop('PowerConsumption', axis=1)
y = df['PowerConsumption']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
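A hedged sketch of how the non-lag columns might be dropped first (assuming `features` still holds the original column names from step 3):
# keep only the lag columns; `features` here is an assumption, not shown in the original
X = X.drop(columns=features, errors='ignore')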
---------------------
9. SCALING
# always fit_transform only the train data, and only transform the test data
# however, if the target is categorical it will already be label encoded, so it doesn't need any scaling
# because y_train is 1D, it needs to be reshaped into a column so that MinMaxScaler can work
# note: the second scaler overwrites the first, so the name `scaler` now refers to the y scaler used by inverse_transform in step 16
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
scaler = MinMaxScaler()
y_train = scaler.fit_transform(y_train.to_numpy().reshape(-1, 1))
----------------------
10. RESHAPE BEFORE LSTM (samples, lags, features)
# reshape to: number of rows, number of lags per feature, number of features that were lagged
# `lag` is 5 here, left over from the lag-creation loop; a named constant would be safer
X_train = X_train.reshape((X_train.shape[0], 5, (X_train.shape[1] // lag)))
X_test = X_test.reshape((X_test.shape[0], 5, (X_test.shape[1] // lag)))
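A minimal demo of the reshape itself, with made-up sizes (2 features x 5 lags = 10 flat columns):
import numpy as np
flat = np.zeros((100, 10))       # 100 samples, 10 lag columns
seq = flat.reshape((100, 5, 2))  # -> (samples, lags, features)
print(seq.shape)                 # (100, 5, 2)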
-------------------------
11. LSTM MODEL DEFINITION
# in the Input we only specify the number of lags and the number of features
# LSTM layers follow with a number of neurons and an activation function; every LSTM except the last needs return_sequences=True so it passes the whole sequence on
# it finishes with a Dense layer: 1 unit for regression, and activation='linear' outputs the raw predicted value
# this is different for classification problems
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense
model = Sequential([
    Input((X_train.shape[1], X_train.shape[2],)),
    LSTM(64, activation="relu", return_sequences=True),
    LSTM(32, activation="relu"),
    Dense(1, activation="linear")
])
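A quick way to verify the layer shapes after defining the model:
model.summary()  # prints each layer's output shape and parameter count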
------------------
12. MODEL COMPILE
# mandatory after the model definition; adam is the usual default optimizer, but for classification change the loss function and metrics
model.compile(
    loss="mean_squared_error",
    optimizer="adam",
    metrics=["mean_squared_error"],
)
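A hedged sketch of the classification counterpart (assuming integer class labels and a softmax output layer instead of Dense(1)):
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)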
---------------
13. MODEL TRAIN
# the result is usually named history; shuffle=False again because the rows are ordered in time
history = model.fit(X_train, y_train, validation_split=0.20, epochs=16, batch_size=64, shuffle=False)
------------------------
14. PLOT LOSS FUNCTION
sns.lineplot(history.history["loss"], label="loss")
sns.lineplot(history.history["val_loss"], label="val_loss")
plt.show()
------------------------
15. MODEL PREDICTION
y_pred = model.predict(X_test)
-------------------------
16. INVERSE SCALE TRANSFORM
# because the model was trained on scaled X_train and y_train, y_pred comes out scaled and we want to reverse it
# `scaler` is the y scaler from step 9, so this maps predictions back to the original units
y_pred = scaler.inverse_transform(y_pred)
---------------------------------
17. EVALUATION METRIC
# for regression use MAE, MSE, R2, whereas for classification use accuracy, recall, F1 etc.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE:{mae}")
print(f"MSE:{mse}")
print(f"R2:{r2}")
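A hedged sketch of the classification counterpart (assuming label-encoded y_test and hard class predictions):
from sklearn.metrics import accuracy_score, recall_score, f1_score
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")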
--------------------------
18. XGB MODEL
# with an XGB model, don't scale y at all, and don't reshape to 3D: XGBoost expects plain 2D (samples, features) input
from xgboost import XGBRegressor
xgb_model = XGBRegressor(n_estimators=30).fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
---------------
19. GRID SEARCH
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid={
        "n_estimators": [15, 20, 25, 30, 35, 40],
        "max_depth": [2, 3, 4, 5, 6, 7]
    },
    cv=TimeSeriesSplit(n_splits=5)
)
grid_search.fit(X_train, y_train)
# finds which of the given params score best, using time-series cross-validation
grid_search.best_params_
# then create a new model with the best parameters and evaluate it, as sketched below
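A minimal sketch of refitting with the winning parameters (best_params_ unpacks straight into the constructor):
best_model = XGBRegressor(**grid_search.best_params_).fit(X_train, y_train)
y_pred = best_model.predict(X_test)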
# plot actual vs predicted values to eyeball the fit
sns.lineplot(x=y_test.index, y=y_test.values, color='red')
sns.lineplot(x=y_test.index, y=y_pred, color='green')