Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Solutions to Questions from Experiments
- ## Q1: NumPy Array Operations and Pandas DataFrame Manipulation
- ```python
import numpy as np
import pandas as pd

# Q1 walkthrough: reshape a small sales array, summarise it, then round-trip
# the same figures through a CSV file in the current working directory.

# Step 1: Create and reshape array
sales = np.array([120, 200, 150, 180]).reshape(2, 2)
print("Array:", sales)

# Step 2: Calculate sum and mean
print("Sum of sales:", np.sum(sales))
print("Mean of sales:", np.mean(sales))

# Step 3: Convert to DataFrame and export to CSV
# The Sales column is taken from the array itself (flattened row by row)
# so the DataFrame cannot drift from the figures summarised above.
data = {"Product": ["A", "B", "C", "D"], "Sales": sales.ravel().tolist()}
df = pd.DataFrame(data)
df.to_csv("sales.csv", index=False)

# Step 4: Load the CSV and print DataFrame
loaded_df = pd.read_csv("sales.csv")
print("Loaded DataFrame:")
print(loaded_df)
- ```
- ## Q2: Missing Values and Bar Chart
- ```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Q2 walkthrough: fill missing subject scores with the column mean, compute
# each student's average, then report summary statistics and draw a bar chart.

# Step 1: Handle missing values
data = {
    "Student": ["Alex", "Ben", "Cara", "Dana"],
    "Math": [85, 74, 90, 88],
    "Science": [78, np.nan, 87, 75],
    "English": [92, 81, np.nan, 83],
}
df = pd.DataFrame(data)
# numeric_only=True skips the text "Student" column; without it pandas >= 2.0
# raises TypeError instead of silently ignoring non-numeric columns.
df = df.fillna(df.mean(numeric_only=True))

# Step 2: Create dictionary with average scores
# Same reason for numeric_only here: the row-wise mean must not touch names.
average_scores = df.mean(axis=1, numeric_only=True)
student_avg = dict(zip(df["Student"], average_scores))

# Step 3: Convert to NumPy, calculate stats, and plot
scores = np.array(list(student_avg.values()))
print("Mean:", np.mean(scores), "Std Dev:", np.std(scores))
# Pass concrete lists rather than dict views to the plotting call.
plt.bar(list(student_avg.keys()), list(student_avg.values()))
plt.title("Student Average Scores")
plt.show()
- ```
- ## Q3: Data Normalization and Statistical Analysis
- ```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Q3 walkthrough: summarise car prices, then rescale the numeric columns
# with MinMaxScaler and print the resulting table.

# Create DataFrame
data = {
    "Car Model": ["Sedan", "SUV", "Truck", "Coupe"],
    "Price": [20000, 30000, 25000, 27000],
    "Mileage": [35, 45, 60, 40],
}
df = pd.DataFrame(data)

# Calculate sum, mean, std (taken on the raw prices, before scaling)
price = df["Price"]
print("Price Summary:")
print("Sum:", price.sum(), "Mean:", price.mean(), "Std Dev:", price.std())

# Normalize columns
numeric_cols = ["Price", "Mileage"]
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print(df)
- ```
- ## Q4: Data Cleaning, Standardization, and Visualization
- ```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Q4 walkthrough: fill missing scores with the column median, add per-student
# totals and averages, standardize the subject scores, and plot one line per
# student across the three subjects.

# Create DataFrame
data = {
    "Student": ["Tom", "Lucy", "Max", "Zoe"],
    "Math": [78, 92, 85, 70],
    "Science": [85, 88, np.nan, 78],
    "English": [np.nan, 80, 75, 82],
}
df = pd.DataFrame(data)
subjects = ["Math", "Science", "English"]

# Step 1: Replace missing values
# numeric_only=True keeps the text "Student" column out of the median;
# without it pandas >= 2.0 raises TypeError.
df = df.fillna(df.median(numeric_only=True))

# Step 2: Calculate total and average
print("Total and Average Scores:")
df["Total"] = df[subjects].sum(axis=1)
df["Average"] = df[subjects].mean(axis=1)
print(df)

# Step 3: Standardize
scaler = StandardScaler()
standardized_data = pd.DataFrame(
    scaler.fit_transform(df[subjects]),
    columns=subjects,
)

# Step 4: Plot standardized scores
# standardized_data has a fresh 0..n-1 index, so rows are matched to student
# names by position.
for position, student_name in enumerate(df["Student"]):
    plt.plot(subjects, standardized_data.iloc[position], label=student_name)
plt.legend()
plt.title("Standardized Scores")
plt.show()
- ```
- ## Q5: House Listings and Statistical Analysis
- ```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Q5 walkthrough: print sum, mean and standard deviation for the numeric
# house-listing columns, then standardize those columns in place.

# Create DataFrame
data = {
    "House Type": ["Apartment", "Villa", "Townhouse", "Cottage"],
    "Price": [150, 300, 200, 180],
    "Area": [900, 2200, 1500, 1200],
}
df = pd.DataFrame(data)

# Calculate stats (on the raw values, before standardizing)
numeric_cols = ["Price", "Area"]
raw_values = df[numeric_cols]
print("Price and Area Stats:")
print("Sum:", raw_values.sum())
print("Mean:", raw_values.mean())
print("Std Dev:", raw_values.std())

# Standardize
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print(df)
- ```
- ## Employee Performance and Salary Dataset Questions
- ### Question 1: Basic Data Preprocessing
- ```python
import pandas as pd
import numpy as np

# Employee Question 1: build the employee table, report missing values per
# column, and list rows falling outside the 1.5 * IQR fences for three
# numeric columns.

# Create DataFrame
data = {
    "Employee_ID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "Age": [25, 28, 35, 40, 29, 50, 32, 42, 23, 37],
    "Department": ["HR", "Finance", "IT", "HR", "Marketing", "IT", "Finance", "Marketing", "HR", "IT"],
    "Years_of_Experience": [2, 5, 10, 15, 6, 25, 8, 18, 1, 12],
    "Salary": [45000, 65000, 90000, 75000, 72000, 120000, 84000, 97000, 40000, 88000],
    "Performance_Score": [75, 82, 88, 70, 85, 92, 78, 90, 65, 83],
    "Promotions_Last_5_Years": [0, 1, 1, 0, 1, 1, 1, 0, 0, 1],
    "Education_Level": ["Bachelors", "Masters", "PhD", "Masters", "Bachelors", "PhD", "Masters", "PhD", "Bachelors", "Masters"],
}
df = pd.DataFrame(data)

# Step 1: Check and handle missing values
print("Missing Values:")
print(df.isnull().sum())  # No missing values in this dataset

# Step 2: Detect and handle outliers using IQR
for col in ["Age", "Salary", "Performance_Score"]:
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    # Fence distance: 1.5 times the interquartile range on either side.
    fence = 1.5 * (third_quartile - first_quartile)
    is_outlier = values.lt(first_quartile - fence) | values.gt(third_quartile + fence)
    print(f"{col} Outliers:", df[is_outlier])
- ```
- ### Question 2: Skewness, Kurtosis, and Box Plot
- ```python
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt

# Employee Question 2: print skewness and kurtosis for three numeric columns
# of the employee DataFrame `df` (built in the Question 1 snippet), then draw
# a horizontal box plot of Salary.

# Calculate skewness and kurtosis
for col in ("Age", "Salary", "Performance_Score"):
    column_values = df[col]
    print(f"{col} Skewness:", skew(column_values))
    print(f"{col} Kurtosis:", kurtosis(column_values))

# Plot box plot for Salary
plt.boxplot(df["Salary"], vert=False)
plt.title("Salary Distribution")
plt.show()
- ```
- ### Question 3: Feature Selection using ANOVA
- ```python
from scipy.stats import f_oneway

# Employee Question 3: run a one-way ANOVA between each candidate feature
# column and the Performance_Score column of `df` (built in the Question 1
# snippet) and print the F statistic and p-value.
# NOTE(review): f_oneway treats its arguments as sample groups and tests
# whether their means are equal, so this compares each feature's values
# against the score values directly — confirm this is the intended test.

# Perform ANOVA
features = ["Age", "Years_of_Experience", "Salary"]
target = df["Performance_Score"]
for feature in features:
    anova = f_oneway(df[feature], target)
    print(f"{feature}: F-value = {anova.statistic}, P-value = {anova.pvalue}")
- ```
- ### Question 4: Heatmap for Correlation
- ```python
import seaborn as sns

# Employee Question 4: compute the correlation matrix of the numeric columns
# of `df` (built in the Question 1 snippet), draw it as a heatmap, and report
# the most positively and most negatively correlated column pairs.
# Relies on `plt` imported by the Question 2 snippet.

# Calculate correlation matrix
# numeric_only=True leaves out the text columns (Department, Education_Level);
# without it pandas >= 2.0 raises ValueError when it meets a string.
correlation_matrix = df.corr(numeric_only=True)
print("Correlation Matrix:")
print(correlation_matrix)

# Plot heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Identify highest positive and negative correlations
# Drop each column's correlation with itself by comparing the two labels of
# every pair, rather than filtering on "value < 1": that filter depends on
# the diagonal being exactly 1.0 and would also hide a genuine perfect
# correlation between two different columns.
pairs = correlation_matrix.unstack()
first_label = pairs.index.get_level_values(0)
second_label = pairs.index.get_level_values(1)
sorted_corr = pairs[first_label != second_label].sort_values(ascending=False)
print("Highest Positive Correlation:", sorted_corr.idxmax())
print("Highest Negative Correlation:", sorted_corr.idxmin())
- ```
- ### Question 5: Regression and Classification Models
- ```python
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Employee Question 5: fit a linear regression predicting Salary and a
# logistic regression predicting Promotions_Last_5_Years from the employee
# DataFrame `df` (built in the Question 1 snippet; `pd` is imported there).

# Regression Task
X_reg = df[["Age", "Years_of_Experience", "Performance_Score"]]
y_reg = df["Salary"]
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
print("Regression Coefficients:", reg_model.coef_)

# Classification Task
# LogisticRegression needs numeric input, so the text columns are one-hot
# encoded; passing them raw makes fit() raise ValueError. Employee_ID is a
# row identifier rather than a feature, so it is dropped with the target.
X_clf = pd.get_dummies(
    df.drop(columns=["Promotions_Last_5_Years", "Employee_ID"]),
    columns=["Department", "Education_Level"],
    dtype=int,
)
y_clf = df["Promotions_Last_5_Years"]
X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=0)
clf_model = LogisticRegression(max_iter=1000)
clf_model.fit(X_train, y_train)
y_pred = clf_model.predict(X_test)
print("Classification Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
# labels=[0, 1] keeps the matrix 2x2 even if the small test split happens to
# contain only one class.
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
- ```
- ---
- Let me know if you need further clarification or enhancements for any part of these solutions!
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement