спарк линрег фанк кастом

gagarin_1982

Aug 28th, 2024

Never

Add comment

Not a member of Pastebin yet? Sign Up, it unlocks many cool features!

text 2.96 KB | None | 0 0

raw download clone embed print report

def train_linear_model(df, features, target='median_house_value'):
    """Train a Spark ML linear regression and report hold-out metrics.

    String-typed feature columns are indexed and one-hot encoded; all other
    feature columns are assembled into a vector and scaled with
    ``StandardScaler``. The rows are split 80/20 with a fixed seed, the model
    is fitted on the 80% part, and R², MSE and RMSE measured on the 20% part
    are printed.

    Args:
        df: Spark DataFrame holding the feature columns and ``target``.
        features: Iterable of column names to use as predictors.
        target: Name of the label column.

    Returns:
        dict: The test-split metrics under the keys ``"r2"``, ``"mse"`` and
        ``"rmse"``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    # Materialise once: the names are walked several times below, which would
    # silently exhaust a generator argument.
    features = list(features)
    if not features:
        raise ValueError("features must contain at least one column name")

    # Separate string (categorical) columns from everything else (numerical).
    categorical_cols, numerical_cols = [], []
    for name in features:
        is_string = isinstance(df.schema[name].dataType, StringType)
        (categorical_cols if is_string else numerical_cols).append(name)

    assembled_inputs = []

    # Categorical features: index -> one-hot -> single vector column.
    # The category vocabulary is deliberately learned from the full data: it
    # only fixes the label-to-index mapping, and with StringIndexer's default
    # settings a level seen only in the test split would make transform fail.
    if categorical_cols:
        idx_cols = [f"{name}_idx" for name in categorical_cols]
        ohe_cols = [f"{name}_ohe" for name in categorical_cols]
        indexer = StringIndexer(inputCols=categorical_cols, outputCols=idx_cols)
        df = indexer.fit(df).transform(df)
        encoder = OneHotEncoder(inputCols=idx_cols, outputCols=ohe_cols)
        df = encoder.fit(df).transform(df)
        categorical_assembler = VectorAssembler(
            inputCols=ohe_cols, outputCol="categorical_features"
        )
        df = categorical_assembler.transform(df)
        assembled_inputs.append('categorical_features')

    # Numerical features: assembling is stateless, so it is safe before the split.
    if numerical_cols:
        numerical_assembler = VectorAssembler(
            inputCols=numerical_cols, outputCol="numerical_features"
        )
        df = numerical_assembler.transform(df)

    # Train/test split.
    train_data, test_data = df.randomSplit([.8, .2], seed=2022)

    # Fit the scaler on the training rows only, so that no statistics of the
    # held-out rows leak into preprocessing, then apply it to both splits.
    if numerical_cols:
        scaler = StandardScaler(
            inputCol='numerical_features', outputCol="numerical_features_scaled"
        )
        scaler_model = scaler.fit(train_data)
        train_data = scaler_model.transform(train_data)
        test_data = scaler_model.transform(test_data)
        assembled_inputs.append('numerical_features_scaled')

    # Combine every feature group into the final "features" vector.
    final_assembler = VectorAssembler(inputCols=assembled_inputs, outputCol="features")
    train_data = final_assembler.transform(train_data)
    test_data = final_assembler.transform(test_data)

    # Fit the model.
    lr = LinearRegression(labelCol=target, featuresCol='features')
    model = lr.fit(train_data)

    # Predict on the test split. Each metric below runs its own Spark job, so
    # cache the predictions instead of recomputing the whole lineage each time.
    predictions = model.transform(test_data).cache()
    try:
        evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction")
        metrics = {
            metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
            for metric in ("r2", "mse", "rmse")
        }
    finally:
        predictions.unpersist()

    print(f"R²: {metrics['r2']}")
    print(f"MSE: {metrics['mse']}")
    print(f"RMSE: {metrics['rmse']}")
    return metrics
# Columns passed to the model as predictors; the target column is left at
# the default of train_linear_model.
features = [
    'ocean_proximity',
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
train_linear_model(df, features=features)

Add Comment

Please, Sign In to add comment