Practise

GamerBhai02

Jan 14th, 2025

Never

Add comment

Not a member of Pastebin yet? Sign Up, it unlocks many cool features!

Python 7.63 KB | Source Code | 0 0

raw download clone embed print report

# Solutions to Questions from Experiments
## Q1: NumPy Array Operations and Pandas DataFrame Manipulation
```python
import numpy as np
import pandas as pd
# Q1 walkthrough: arrange four sales figures as a 2x2 grid, summarise the
# grid, then round-trip the per-product figures through a CSV file.

# Step 1: build the flat array and fold it into two rows of two.
sales = np.reshape(np.array([120, 200, 150, 180]), (2, 2))
print("Array:", sales)

# Step 2: total and average taken over every cell of the grid.
print("Sum of sales:", sales.sum())
print("Mean of sales:", sales.mean())

# Step 3: tabulate the figures per product and write them to disk;
# index=False keeps the row index out of the file.
data = {
    "Product": ["A", "B", "C", "D"],
    "Sales": [120, 200, 150, 180],
}
df = pd.DataFrame(data)
df.to_csv("sales.csv", index=False)

# Step 4: read the file back and show what was stored.
loaded_df = pd.read_csv("sales.csv")
print("Loaded DataFrame:")
print(loaded_df)
```
## Q2: Missing Values and Bar Chart
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Q2 walkthrough: fill the gaps in a score table with each subject's mean,
# average every student's subjects, then summarise and chart those averages.

# Step 1: Handle missing values
data = {
    "Student": ["Alex", "Ben", "Cara", "Dana"],
    "Math": [85, 74, 90, 88],
    "Science": [78, np.nan, 87, 75],
    "English": [92, 81, np.nan, 83],
}
df = pd.DataFrame(data)
# Restrict every aggregate to the subject columns: taking the mean of the
# whole frame would also try to average the "Student" names, which
# pandas >= 2.0 rejects with a TypeError instead of silently skipping.
subjects = ["Math", "Science", "English"]
df[subjects] = df[subjects].fillna(df[subjects].mean())

# Step 2: Create dictionary with average scores (one entry per student)
average_scores = df[subjects].mean(axis=1)
student_avg = dict(zip(df["Student"], average_scores))

# Step 3: Convert to NumPy, calculate stats, and plot
scores = np.array(list(student_avg.values()))
print("Mean:", np.mean(scores), "Std Dev:", np.std(scores))
# Hand matplotlib plain sequences rather than dict views.
plt.bar(list(student_avg.keys()), scores)
plt.title("Student Average Scores")
plt.show()
```
## Q3: Data Normalization and Statistical Analysis
```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Q3 walkthrough: summarise the raw car prices, then min-max scale the two
# numeric columns in place.

# Build the table of car models.
data = {
    "Car Model": ["Sedan", "SUV", "Truck", "Coupe"],
    "Price": [20000, 30000, 25000, 27000],
    "Mileage": [35, 45, 60, 40],
}
df = pd.DataFrame(data)

# Summary statistics are taken from the raw prices, before any rescaling.
price = df["Price"]
print("Price Summary:")
print("Sum:", price.sum(), "Mean:", price.mean(), "Std Dev:", price.std())

# Fit the scaler on both numeric columns and overwrite them with the
# scaled values.
numeric_cols = ["Price", "Mileage"]
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print(df)
```
## Q4: Data Cleaning, Standardization, and Visualization
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Q4 walkthrough: fill missing scores with each subject's median, add
# per-student totals and averages, standardise the subject scores, and plot
# one line per student.

# Create DataFrame
data = {
    "Student": ["Tom", "Lucy", "Max", "Zoe"],
    "Math": [78, 92, 85, 70],
    "Science": [85, 88, np.nan, 78],
    "English": [np.nan, 80, 75, 82],
}
df = pd.DataFrame(data)
subjects = ["Math", "Science", "English"]

# Step 1: Replace missing values
# The median is computed over the subject columns only; including the string
# "Student" column makes DataFrame.median() raise on pandas >= 2.0.
df[subjects] = df[subjects].fillna(df[subjects].median())

# Step 2: Calculate total and average
print("Total and Average Scores:")
df["Total"] = df[subjects].sum(axis=1)
df["Average"] = df[subjects].mean(axis=1)
print(df)

# Step 3: Standardize
scaler = StandardScaler()
standardized_data = pd.DataFrame(
    scaler.fit_transform(df[subjects]),
    columns=subjects,
)

# Step 4: Plot standardized scores
# Rows of standardized_data are in the same order as df, so position i in
# one frame corresponds to student i in the other.
for position, name in enumerate(df["Student"]):
    plt.plot(subjects, standardized_data.iloc[position], label=name)
plt.legend()
plt.title("Standardized Scores")
plt.show()
```
## Q5: House Listings and Statistical Analysis
```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Q5 walkthrough: report column-wise statistics for house price and area,
# then standardise both columns in place.

# Build the table of listings.
data = {
    "House Type": ["Apartment", "Villa", "Townhouse", "Cottage"],
    "Price": [150, 300, 200, 180],
    "Area": [900, 2200, 1500, 1200],
}
df = pd.DataFrame(data)

# Snapshot of the raw numeric columns; every statistic below is per column.
numeric_cols = ["Price", "Area"]
raw = df[numeric_cols]
print("Price and Area Stats:")
for label, stat in (("Sum:", raw.sum), ("Mean:", raw.mean), ("Std Dev:", raw.std)):
    print(label, stat())

# Fit the scaler on the raw values and overwrite the columns with the result.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(raw)
print(df)
```
## Employee Performance and Salary Dataset Questions
### Question 1: Basic Data Preprocessing
```python
import pandas as pd
import numpy as np
# Employee Question 1: build the employee table, report missing values, and
# flag IQR outliers in the main numeric columns.

# Create DataFrame
data = {
    "Employee_ID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "Age": [25, 28, 35, 40, 29, 50, 32, 42, 23, 37],
    "Department": ["HR", "Finance", "IT", "HR", "Marketing", "IT", "Finance", "Marketing", "HR", "IT"],
    "Years_of_Experience": [2, 5, 10, 15, 6, 25, 8, 18, 1, 12],
    "Salary": [45000, 65000, 90000, 75000, 72000, 120000, 84000, 97000, 40000, 88000],
    "Performance_Score": [75, 82, 88, 70, 85, 92, 78, 90, 65, 83],
    "Promotions_Last_5_Years": [0, 1, 1, 0, 1, 1, 1, 0, 0, 1],
    "Education_Level": ["Bachelors", "Masters", "PhD", "Masters", "Bachelors", "PhD", "Masters", "PhD", "Bachelors", "Masters"],
}
df = pd.DataFrame(data)

# Step 1: Check missing values
print("Missing Values:")
print(df.isnull().sum())  # No missing values in this dataset

# Step 2: Detect outliers using the 1.5 * IQR rule.
# Rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are reported and kept in
# `outliers` (column name -> offending rows); df itself is not modified.
outliers = {}
for col in ["Age", "Salary", "Performance_Score"]:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers[col] = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"{col} Outliers:", outliers[col])
```
### Question 2: Skewness, Kurtosis, and Box Plot
```python
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
# Employee Question 2: shape statistics for the main numeric columns, then a
# horizontal box plot of Salary. Uses the `df` built in Question 1.

# Calculate skewness and kurtosis, one pair of lines per column
for col in ["Age", "Salary", "Performance_Score"]:
    print(f"{col} Skewness:", skew(df[col]))
    print(f"{col} Kurtosis:", kurtosis(df[col]))

# Plot box plot for Salary (drawn once, after the loop)
plt.boxplot(df["Salary"], vert=False)
plt.title("Salary Distribution")
plt.show()
```
### Question 3: Feature Selection using ANOVA
```python
from scipy.stats import f_oneway
# Employee Question 3: score each numeric feature against Performance_Score
# with a univariate F-test. Uses the `df` built in Question 1.
#
# NOTE: f_oneway(df[feature], target) treats the feature column and the
# target column as two groups and tests whether their means differ (e.g.
# "is mean Salary equal to mean Performance_Score?"), which says nothing
# about how informative the feature is. f_regression instead tests the
# linear relationship between each feature and the target.
from sklearn.feature_selection import f_regression

features = ["Age", "Years_of_Experience", "Salary"]
target = df["Performance_Score"]
# f_regression returns one F-statistic and one p-value per feature column,
# in the same order as `features`.
f_values, p_values = f_regression(df[features], target)
for feature, f_val, p_val in zip(features, f_values, p_values):
    print(f"{feature}: F-value = {f_val}, P-value = {p_val}")
```
### Question 4: Heatmap for Correlation
```python
import seaborn as sns
# Employee Question 4: correlation matrix of the numeric columns, a heatmap
# of it, and the strongest positive / negative pair. Uses the `df` built in
# Question 1 and the `plt` imported in an earlier snippet.

# Calculate correlation matrix
# Department and Education_Level are strings; DataFrame.corr() raises on them
# in pandas >= 2.0, so only the numeric columns are correlated.
numeric_df = df.select_dtypes(include="number")
correlation_matrix = numeric_df.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Plot heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Identify highest positive and negative correlations
# Flatten to a Series keyed by (row, column) and drop each column's
# correlation with itself by label. Filtering on "value < 1" instead would
# depend on floating-point equality and would also hide a genuinely perfect
# correlation between two different columns.
pairs = correlation_matrix.unstack()
row_labels = pairs.index.get_level_values(0)
col_labels = pairs.index.get_level_values(1)
sorted_corr = pairs[row_labels != col_labels].sort_values(ascending=False)
print("Highest Positive Correlation:", sorted_corr.idxmax())
print("Highest Negative Correlation:", sorted_corr.idxmin())
```
### Question 5: Regression and Classification Models
```python
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
# Employee Question 5: a linear regression for Salary and a logistic
# regression for Promotions_Last_5_Years. Uses the `df` built in Question 1.
import pandas as pd

# Regression Task: predict Salary from three numeric features.
X_reg = df[["Age", "Years_of_Experience", "Performance_Score"]]
y_reg = df["Salary"]
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
print("Regression Coefficients:", reg_model.coef_)

# Classification Task: predict whether the employee was promoted.
# Department and Education_Level are strings, which LogisticRegression cannot
# fit on directly (ValueError: could not convert string to float), so they
# are one-hot encoded. drop_first avoids a redundant indicator per category.
# Employee_ID is a row identifier, not a predictor, so it is excluded.
X_clf = pd.get_dummies(
    df.drop(columns=["Promotions_Last_5_Years", "Employee_ID"]),
    columns=["Department", "Education_Level"],
    drop_first=True,
    dtype=int,
)
y_clf = df["Promotions_Last_5_Years"]
# With only ten rows, stratifying keeps both classes present in the training
# split regardless of the random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=0, stratify=y_clf
)
clf_model = LogisticRegression(max_iter=1000)
clf_model.fit(X_train, y_train)
y_pred = clf_model.predict(X_test)
print("Classification Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
# Fix the label order so the matrix is always 2x2, even if the tiny test
# split or the predictions contain a single class.
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
```
---
Let me know if you need further clarification or enhancements for any part of these solutions!

Add Comment

Please, Sign In to add comment