Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer

# Titanic preprocessing script: impute missing values, one-hot encode the
# categorical columns, bin the numeric columns, and assemble the result
# in `df_final`.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Show the per-column count of missing values before imputation.
print(df.isnull().sum())

# Assign the filled column back instead of calling
# `df[col].fillna(..., inplace=True)`: the chained in-place form operates on
# a temporary object and does not update `df` under pandas Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# One-hot encode the categorical columns, dropping the first level of each.
categorical_features = ['Sex', 'Embarked']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = encoder.fit_transform(df[categorical_features])
# Reuse df's index so the axis=1 concat below aligns row-for-row even if
# df's index is not the default 0..n-1 range.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

# Replace Age and Fare with ordinal bin codes from 5 equal-width bins.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
df[['Age_binned', 'Fare_binned']] = discretizer.fit_transform(df[['Age', 'Fare']])

# Drop the raw columns that have been replaced by encoded/binned versions.
df.drop(columns=categorical_features + ['Age', 'Fare'], inplace=True)
df_final = pd.concat([df, encoded_df], axis=1)

# `display` is injected by IPython/Jupyter only; fall back to `print` so the
# script also runs under a plain Python interpreter.
try:
    show = display  # noqa: F821
except NameError:
    show = print
show(df_final.head())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement