Advertisement
gagarin_1982

Untitled

May 21st, 2024
46
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.07 KB | None | 0 0
  1. # загрузка модулей и константы
  2. import pandas as pd
  3. import numpy as np
  4. import sklearn
  5.  
  6. from scipy.stats import shapiro
  7.  
  8. from sklearn.metrics import r2_score, f1_score, accuracy_score, make_scorer
  9. from sklearn.model_selection import cross_val_score
  10.  
  11. from sklearn.model_selection import train_test_split
  12. from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
  13. from sklearn.impute import SimpleImputer
  14.  
  15. from sklearn.linear_model import LinearRegression, LogisticRegression
  16.  
  17. from sklearn.neighbors import KNeighborsClassifier
  18. from sklearn.tree import DecisionTreeClassifier
  19. from sklearn.svm import SVC
  20. from sklearn.svm import SVR
  21.  
  22. from sklearn.compose import ColumnTransformer
  23. from sklearn.pipeline import Pipeline
  24.  
  25. from sklearn.model_selection import RandomizedSearchCV
  26. from sklearn.model_selection import GridSearchCV
  27.  
  28. import warnings
# Silence FutureWarning messages for everything that runs after this line.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Seed passed as random_state to train_test_split below.
RANDOM_STATE = 42
# Value passed as test_size to train_test_split below.
TEST_SIZE = 0.25
  33.  
  34. # пользовательская функция для SMAPE
  35. def smape_score(y_true, y_pred):
  36. smape = (np.mean(2.0 * abs(y_true - y_pred) / (abs(y_true) + abs(y_pred))) * 100)
  37. return smape
  38.  
# Wrap smape_score as a scikit-learn scorer. SMAPE is an error measure, so
# greater_is_better=False marks lower values as better; per the make_scorer
# documentation the scorer then reports the negated metric.
smape_scorer = make_scorer(smape_score, greater_is_better=False)
  40.  
  41. # подбор модели (линейная регрессия)
  42.  
  43. # перечень входящих признаков
  44. input_features = [
  45. 'employment_years',
  46. 'last_year_promo',
  47. 'last_year_violations',
  48. 'supervisor_evaluation',
  49. 'salary'
  50. ]
  51. # целевой признак
  52. output_features = ['job_satisfaction_rate']
  53.  
  54. # train_job_sat - датасет с тренировочными данными для первой модели (удовлетворенность)
  55. X = train_job_sat[input_features]
  56. y = train_job_sat[output_features]
  57.  
  58. X_train, X_test, y_train, y_test = train_test_split(
  59. X,
  60. y,
  61. test_size = TEST_SIZE,
  62. random_state = RANDOM_STATE)
  63.  
  64. # разделение признаков для предобработки
  65. # номинальные категориальные
  66. ohe_columns = []
  67.  
  68. # порядковые категориальные
  69. ord_columns = [
  70. 'last_year_promo',
  71. 'last_year_violations',
  72. ]
  73.  
  74. # количественные
  75. num_columns = [
  76. 'employment_years',
  77. 'salary',
  78. 'supervisor_evaluation'
  79. ]
  80.  
  81. # пайплайн для подготовки признаков из списка номинальных категориальных переменных
  82. ohe_pipe = Pipeline(
  83. [('simpleImputer_ohe', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
  84. ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
  85. ]
  86. )
  87.  
  88. # пайплайн для подготовки признаков из списка порядковых категориальных переменных
  89. ord_pipe = Pipeline(
  90. [('simpleImputer_before_ord', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
  91. ('ord', OrdinalEncoder(
  92. categories=[
  93. ['no', 'yes'],
  94. ['no', 'yes'],
  95. ],
  96. handle_unknown='use_encoded_value', unknown_value=np.nan
  97. )
  98. ),
  99. ('simpleImputer_after_ord', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
  100. ]
  101. )
  102.  
  103. # общий пайплайн для подготовки данных
  104. data_preprocessor = ColumnTransformer(
  105. [('ohe', ohe_pipe, ohe_columns),
  106. ('ord', ord_pipe, ord_columns),
  107. ('num', MinMaxScaler(), num_columns)
  108. ],
  109. remainder='passthrough'
  110. )
  111.  
  112. # итоговый пайплайн
  113. pipe_final = Pipeline([
  114. ('preprocessor', data_preprocessor),
  115. ('models', LinearRegression())
  116. ])
  117.  
  118. # cловари гиперпараметров для моделей
  119. param_grid = [
  120. {
  121. 'models': [LinearRegression()],
  122. 'preprocessor__num': [StandardScaler(), MinMaxScaler(), 'passthrough']
  123. }
  124. ]
  125.  
  126.  
  127. # поиск лучшей модели и ее гиперпараметров
  128. model = GridSearchCV(
  129. pipe_final,
  130. param_grid,
  131. cv=5,
  132. scoring=smape_scorer,
  133. n_jobs=-1
  134. )
  135.  
  136. model.fit(X_train, y_train)
  137. print(model.best_score_)
  138.  
  139. preprocessor = model.best_estimator_.named_steps['preprocessor']
  140. X_retest_preprocessed = pd.DataFrame(preprocessor.transform(test_job_sat[input_features]), columns=input_features)
  141. y_retest_pred = model.predict(X_retest_preprocessed)
  142.  
  143.  
  144. ---------------------------------------------------------------------------
  145. TypeError Traceback (most recent call last)
  146. /tmp/ipykernel_48/3633290659.py in <module>
  147. 4
  148. 5
  149. ----> 6 y_retest_pred = model.predict(X_retest_preprocessed)
  150.  
  151. /opt/conda/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
  152. 118
  153. 119 # lambda, but not partial, allows help() to work with update_wrapper
  154. --> 120 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  155. 121 # update the docstring of the returned function
  156. 122 update_wrapper(out, self.fn)
  157.  
  158. /opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_search.py in predict(self, X)
  159. 538 """
  160. 539 self._check_is_fitted('predict')
  161. --> 540 return self.best_estimator_.predict(X)
  162. 541
  163. 542 @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
  164.  
  165. /opt/conda/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
  166. 118
  167. 119 # lambda, but not partial, allows help() to work with update_wrapper
  168. --> 120 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
  169. 121 # update the docstring of the returned function
  170. 122 update_wrapper(out, self.fn)
  171.  
  172. /opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
  173. 416 Xt = X
  174. 417 for _, name, transform in self._iter(with_final=False):
  175. --> 418 Xt = transform.transform(Xt)
  176. 419 return self.steps[-1][-1].predict(Xt, **predict_params)
  177. 420
  178.  
  179. /opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
  180. 563 "data given during fit."
  181. 564 )
  182. --> 565 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
  183. 566 self._validate_output(Xs)
  184. 567
  185.  
  186. /opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
  187. 433 self._iter(fitted=fitted, replace_strings=True))
  188. 434 try:
  189. --> 435 return Parallel(n_jobs=self.n_jobs)(
  190. 436 delayed(func)(
  191. 437 transformer=clone(trans) if not fitted else trans,
  192.  
  193. /opt/conda/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
  194. 1041 # remaining jobs.
  195. 1042 self._iterating = False
  196. -> 1043 if self.dispatch_one_batch(iterator):
  197. 1044 self._iterating = self._original_iterator is not None
  198. 1045
  199.  
  200. /opt/conda/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
  201. 859 return False
  202. 860 else:
  203. --> 861 self._dispatch(tasks)
  204. 862 return True
  205. 863
  206.  
  207. /opt/conda/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
  208. 777 with self._lock:
  209. 778 job_idx = len(self._jobs)
  210. --> 779 job = self._backend.apply_async(batch, callback=cb)
  211. 780 # A job can complete so quickly than its callback is
  212. 781 # called before we get here, causing self._jobs to
  213.  
  214. /opt/conda/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
  215. 206 def apply_async(self, func, callback=None):
  216. 207 """Schedule a func to be run"""
  217. --> 208 result = ImmediateResult(func)
  218. 209 if callback:
  219. 210 callback(result)
  220.  
  221. /opt/conda/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
  222. 570 # Don't delay the application, to avoid keeping the input
  223. 571 # arguments in memory
  224. --> 572 self.results = batch()
  225. 573
  226. 574 def get(self):
  227.  
  228. /opt/conda/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
  229. 260 # change the default number of processes to -1
  230. 261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
  231. --> 262 return [func(*args, **kwargs)
  232. 263 for func, args, kwargs in self.items]
  233. 264
  234.  
  235. /opt/conda/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
  236. 260 # change the default number of processes to -1
  237. 261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
  238. --> 262 return [func(*args, **kwargs)
  239. 263 for func, args, kwargs in self.items]
  240. 264
  241.  
  242. /opt/conda/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
  243. 220 def __call__(self, *args, **kwargs):
  244. 221 with config_context(**self.config):
  245. --> 222 return self.function(*args, **kwargs)
  246.  
  247. /opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
  248. 731
  249. 732 def _transform_one(transformer, X, y, weight, **fit_params):
  250. --> 733 res = transformer.transform(X)
  251. 734 # if we have a weight for this transformer, multiply output
  252. 735 if weight is None:
  253.  
  254. /opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py in _transform(self, X)
  255. 558 Xt = X
  256. 559 for _, _, transform in self._iter():
  257. --> 560 Xt = transform.transform(Xt)
  258. 561 return Xt
  259. 562
  260.  
  261. /opt/conda/lib/python3.9/site-packages/sklearn/preprocessing/_encoders.py in transform(self, X)
  262. 785 Transformed input.
  263. 786 """
  264. --> 787 X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
  265. 788 X_trans = X_int.astype(self.dtype, copy=False)
  266. 789
  267.  
  268. /opt/conda/lib/python3.9/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown, force_all_finite)
  269. 127 for i in range(n_features):
  270. 128 Xi = X_list[i]
  271. --> 129 diff, valid_mask = _check_unknown(Xi, self.categories_[i],
  272. 130 return_mask=True)
  273. 131
  274.  
  275. /opt/conda/lib/python3.9/site-packages/sklearn/utils/_encode.py in _check_unknown(values, known_values, return_mask)
  276. 253
  277. 254 # check for nans in the known_values
  278. --> 255 if np.isnan(known_values).any():
  279. 256 diff_is_nan = np.isnan(diff)
  280. 257 if diff_is_nan.any():
  281.  
  282. TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement