Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Cross-validate a bag-of-words logistic-regression spam classifier.

Reads a CSV whose ``v1`` column holds the label (``'ham'`` or ``'spam'``) and
whose ``v2`` column holds the message text. For every cross-validation fold a
``CountVectorizer`` + ``LogisticRegression`` model is trained and a
classification report is printed; the mean accuracy over all folds is printed
at the end.
"""
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold

file_path = 'C:\\Users\\Test\\Desktop\\spam_data.csv'
N_SPLITS = 10
RANDOM_STATE = 42  # fixed seed so the shuffled folds are reproducible

data = pd.read_csv(file_path, encoding='latin1')

# Encode labels as integers. `.map` turns every value that is not a key of the
# dict into NaN, so check explicitly instead of failing later inside `fit`.
data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})
if data['v1'].isna().any():
    raise ValueError(
        "Column 'v1' contains missing values or labels other than 'ham'/'spam'"
    )

X = data['v2']
y = data['v1']

# Shuffle before splitting: without it the folds follow the file order, so the
# class balance of each fold depends on how the CSV happens to be sorted.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

accuracies = []
for train_index, test_index in kf.split(X):
    # Build the vocabulary from the training fold only, so the held-out
    # messages do not influence the feature space used for training.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X.iloc[train_index])
    X_test = vectorizer.transform(X.iloc[test_index])
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))

    # `labels` pins both classes so the report still works (and lines up with
    # `target_names`) if a fold happens to contain only one of them.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=[0, 1],
            target_names=['ham', 'spam'],
        )
    )

print(f'Average accuracy over {N_SPLITS} folds: {sum(accuracies) / len(accuracies):.4f}')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement