# module imports and constants
import pandas as pd
import numpy as np
import sklearn
from scipy.stats import shapiro
from sklearn.metrics import r2_score, f1_score, accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

RANDOM_STATE = 42
TEST_SIZE = 0.25
# custom function for SMAPE
def smape_score(y_true, y_pred):
    return np.mean(2.0 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred))) * 100

smape_scorer = make_scorer(smape_score, greater_is_better=False)
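# Quick sanity check of the metric on toy numbers (illustrative values only):
# identical arrays score exactly 0, and the metric is bounded above by 200.
assert smape_score(np.array([1.0, 2.0]), np.array([1.0, 2.0])) == 0.0
print(smape_score(np.array([100.0, 200.0]), np.array([110.0, 190.0])))  # ~7.33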
# model selection (linear regression)
# list of input features
input_features = [
    'employment_years',
    'last_year_promo',
    'last_year_violations',
    'supervisor_evaluation',
    'salary'
]
# target feature
output_features = ['job_satisfaction_rate']

# train_job_sat is the training dataset for the first model (job satisfaction)
X = train_job_sat[input_features]
y = train_job_sat[output_features]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE)

# feature groups for preprocessing
# nominal categorical
ohe_columns = []
# ordinal categorical
ord_columns = [
    'last_year_promo',
    'last_year_violations',
]
# numeric
num_columns = [
    'employment_years',
    'salary',
    'supervisor_evaluation'
]
# pipeline that prepares the nominal categorical features
ohe_pipe = Pipeline(
    [('simpleImputer_ohe', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
     ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ]
)

# pipeline that prepares the ordinal categorical features
ord_pipe = Pipeline(
    [('simpleImputer_before_ord', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
     ('ord', OrdinalEncoder(
         categories=[
             ['no', 'yes'],
             ['no', 'yes'],
         ],
         handle_unknown='use_encoded_value', unknown_value=np.nan
     )),
     ('simpleImputer_after_ord', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
    ]
)
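# A minimal sketch (toy values, illustrative only) of what ord_pipe does:
# 'no'/'yes' map to 0/1, and a label unseen at fit time becomes NaN inside
# the encoder, which the trailing imputer then fills with the most frequent
# fitted value. GridSearchCV clones the pipeline later, so this throwaway
# fit has no side effects.
_demo = pd.DataFrame({'last_year_promo': ['no', 'yes', 'no'],
                      'last_year_violations': ['no', 'no', 'yes']})
print(ord_pipe.fit_transform(_demo))                      # [[0. 0.] [1. 0.] [0. 1.]]
print(ord_pipe.transform(_demo.replace('yes', 'maybe')))  # unknowns imputed to 0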
# combined data-preparation pipeline
data_preprocessor = ColumnTransformer(
    [('ohe', ohe_pipe, ohe_columns),
     ('ord', ord_pipe, ord_columns),
     ('num', MinMaxScaler(), num_columns)
    ],
    remainder='passthrough'
)

# final pipeline: preprocessing followed by the model
pipe_final = Pipeline([
    ('preprocessor', data_preprocessor),
    ('models', LinearRegression())
])
# hyperparameter grids for the models
param_grid = [
    {
        'models': [LinearRegression()],
        'preprocessor__num': [StandardScaler(), MinMaxScaler(), 'passthrough']
    }
]
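# The grid extends to other regressors in the same way; a sketch with the
# already-imported SVR (the C values below are assumptions, not tuned
# settings). Left commented out so the search stays identical to the original:
# param_grid.append({
#     'models': [SVR()],
#     'models__C': [0.1, 1.0, 10.0],
#     'preprocessor__num': [StandardScaler(), MinMaxScaler()]
# })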
# search for the best model and its hyperparameters
model = GridSearchCV(
    pipe_final,
    param_grid,
    cv=5,
    scoring=smape_scorer,
    n_jobs=-1
)
model.fit(X_train, y_train)
print(model.best_score_)
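# best_score_ is negated because the scorer was built with
# greater_is_better=False; flip the sign to read it as a SMAPE percentage.
print(-model.best_score_)   # cross-validated SMAPE, %
print(model.best_params_)   # which scaler (and model) won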
preprocessor = model.best_estimator_.named_steps['preprocessor']
# BUG: best_estimator_ is the *whole* pipeline, so predict() below runs the
# preprocessor a second time on data that is already transformed (and whose
# columns no longer line up with input_features). The OrdinalEncoder then
# receives numeric 0/1 values instead of the 'no'/'yes' strings it was fit
# on, which produces the TypeError that follows.
X_retest_preprocessed = pd.DataFrame(preprocessor.transform(test_job_sat[input_features]), columns=input_features)
y_retest_pred = model.predict(X_retest_preprocessed)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_48/3633290659.py in <module>
----> 6 y_retest_pred = model.predict(X_retest_preprocessed)

/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_search.py in predict(self, X)
--> 540         return self.best_estimator_.predict(X)

/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
--> 418             Xt = transform.transform(Xt)

/opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
--> 565         Xs = self._fit_transform(X, None, _transform_one, fitted=True)

[... joblib dispatch frames omitted ...]

/opt/conda/lib/python3.9/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown, force_all_finite)
--> 129             diff, valid_mask = _check_unknown(Xi, self.categories_[i],
    130                                               return_mask=True)

/opt/conda/lib/python3.9/site-packages/sklearn/utils/_encode.py in _check_unknown(values, known_values, return_mask)
    254     # check for nans in the known_values
--> 255     if np.isnan(known_values).any():

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
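# Fix (a minimal sketch): pass the *raw* test features to the fitted search
# object and let the pipeline do its own preprocessing. test_job_sat is the
# test dataset from the source notebook.
y_retest_pred = model.predict(test_job_sat[input_features])

# SMAPE on the held-out part of the training split, with ravel() to avoid
# (n,) vs (n,1) broadcasting:
print(smape_score(np.ravel(y_test), np.ravel(model.predict(X_test))))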