Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def train_linear_model(df, features, target='median_house_value'):
    """Train and evaluate a linear regression on an 80/20 split of ``df``.

    String-typed feature columns are indexed and one-hot encoded; all other
    feature columns are assembled into one vector and passed through a
    ``StandardScaler`` with its default settings.  Every fitted preprocessing
    step (category vocabulary, scaling statistics) is learned from the
    training split only, so the held-out rows do not influence the model
    that is evaluated on them.

    Args:
        df: DataFrame containing the feature columns and ``target``.  It must
            expose ``schema`` and ``randomSplit`` the way a Spark DataFrame
            does.
        features: Names of the columns of ``df`` to use as predictors.
        target: Name of the label column.

    Returns:
        A ``(model, metrics)`` tuple.  ``model`` is the fitted pipeline
        (preprocessing stages followed by the regression) and ``metrics`` is
        a dict with the keys ``"r2"``, ``"mse"`` and ``"rmse"`` computed on
        the test split.  The three scores are also printed.

    Raises:
        ValueError: If ``features`` is empty.
    """
    features = list(features)
    if not features:
        raise ValueError("features must contain at least one column name")

    # Imported here because Pipeline is the only name this function needs
    # that the original code did not already rely on.
    from pyspark.ml import Pipeline

    # Separate the feature names into string-typed and all other columns.
    categorical_cols = []
    numerical_cols = []
    for name in features:
        if isinstance(df.schema[name].dataType, StringType):
            categorical_cols.append(name)
        else:
            numerical_cols.append(name)

    stages = []
    assembled_inputs = []

    # Categorical branch: index -> one-hot -> single vector column.
    if categorical_cols:
        indexed_cols = [name + '_idx' for name in categorical_cols]
        encoded_cols = [name + '_ohe' for name in categorical_cols]
        stages.extend([
            # 'keep' assigns labels that never occurred in the training split
            # their own index instead of raising when the test split is
            # transformed.
            StringIndexer(
                inputCols=categorical_cols,
                outputCols=indexed_cols,
                handleInvalid='keep',
            ),
            OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols),
            VectorAssembler(inputCols=encoded_cols, outputCol='categorical_features'),
        ])
        assembled_inputs.append('categorical_features')

    # Numerical branch: single vector column -> scaled vector column.
    # NOTE(review): VectorAssembler is used with its default null handling;
    # confirm that df has no missing values in these columns.
    if numerical_cols:
        stages.extend([
            VectorAssembler(inputCols=numerical_cols, outputCol='numerical_features'),
            StandardScaler(
                inputCol='numerical_features',
                outputCol='numerical_features_scaled',
            ),
        ])
        assembled_inputs.append('numerical_features_scaled')

    # Merge both branches into the final feature vector and add the model.
    stages.append(VectorAssembler(inputCols=assembled_inputs, outputCol='features'))
    stages.append(LinearRegression(labelCol=target, featuresCol='features'))

    # Split BEFORE fitting anything so that no estimator sees test rows.
    train_data, test_data = df.randomSplit([.8, .2], seed=2022)
    model = Pipeline(stages=stages).fit(train_data)

    # The fitted pipeline applies the train-time preprocessing to the test set.
    predictions = model.transform(test_data)

    # One evaluator is enough: the metric is overridden per call.
    evaluator = RegressionEvaluator(labelCol=target, predictionCol='prediction')
    metrics = {
        metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
        for metric in ('r2', 'mse', 'rmse')
    }

    print(f"R²: {metrics['r2']}")
    print(f"MSE: {metrics['mse']}")
    print(f"RMSE: {metrics['rmse']}")

    return model, metrics
# Predictor columns handed to the training routine: one text column
# followed by the numeric columns.
features = [
    'ocean_proximity',
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# Fit and score the model, leaving the label column at its default.
train_linear_model(df, features=features)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement