GamerBhai02

Practise

Jan 14th, 2025
# Solutions to Questions from Experiments

## Q1: NumPy Array Operations and Pandas DataFrame Manipulation
```python
import numpy as np
import pandas as pd

# Step 1: Create and reshape the array into a 2x2 matrix
sales = np.array([120, 200, 150, 180]).reshape(2, 2)
print("Array:", sales)

# Step 2: Calculate sum and mean
print("Sum of sales:", np.sum(sales))
print("Mean of sales:", np.mean(sales))

# Step 3: Convert to DataFrame and export to CSV
data = {"Product": ["A", "B", "C", "D"], "Sales": [120, 200, 150, 180]}
df = pd.DataFrame(data)
df.to_csv("sales.csv", index=False)

# Step 4: Load the CSV and print DataFrame
loaded_df = pd.read_csv("sales.csv")
print("Loaded DataFrame:")
print(loaded_df)
```

## Q2: Missing Values and Bar Chart
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Handle missing values by filling with each subject's column mean
data = {"Student": ["Alex", "Ben", "Cara", "Dana"],
        "Math": [85, 74, 90, 88],
        "Science": [78, np.nan, 87, 75],
        "English": [92, 81, np.nan, 83]}
df = pd.DataFrame(data)
df = df.fillna(df.mean(numeric_only=True))

# Step 2: Create dictionary with average scores (numeric columns only)
average_scores = df[["Math", "Science", "English"]].mean(axis=1)
student_avg = dict(zip(df["Student"], average_scores))

# Step 3: Convert to NumPy, calculate stats, and plot
scores = np.array(list(student_avg.values()))
print("Mean:", np.mean(scores), "Std Dev:", np.std(scores))
plt.bar(student_avg.keys(), student_avg.values())
plt.title("Student Average Scores")
plt.show()
```

## Q3: Data Normalization and Statistical Analysis
```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Create DataFrame
data = {"Car Model": ["Sedan", "SUV", "Truck", "Coupe"],
        "Price": [20000, 30000, 25000, 27000],
        "Mileage": [35, 45, 60, 40]}
df = pd.DataFrame(data)

# Calculate sum, mean, std
print("Price Summary:")
print("Sum:", df["Price"].sum(), "Mean:", df["Price"].mean(), "Std Dev:", df["Price"].std())

# Normalize the numeric columns to the 0-1 range
scaler = MinMaxScaler()
df[["Price", "Mileage"]] = scaler.fit_transform(df[["Price", "Mileage"]])
print(df)
```
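
For reference, `MinMaxScaler` applies x' = (x - min) / (max - min) with its default 0-1 feature range; a quick manual check on a fresh copy of the raw values (a sketch reusing the `data` dict defined above) reproduces the scaler's output:
```python
# Manual min-max normalization of the raw columns (default 0-1 range assumed)
raw = pd.DataFrame(data)[["Price", "Mileage"]]
manual = (raw - raw.min()) / (raw.max() - raw.min())
print(manual)
```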

## Q4: Data Cleaning, Standardization, and Visualization
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Create DataFrame
data = {"Student": ["Tom", "Lucy", "Max", "Zoe"],
        "Math": [78, 92, 85, 70],
        "Science": [85, 88, np.nan, 78],
        "English": [np.nan, 80, 75, 82]}
df = pd.DataFrame(data)

# Step 1: Replace missing values with each subject's column median
df.fillna(df.median(numeric_only=True), inplace=True)

# Step 2: Calculate total and average
print("Total and Average Scores:")
df["Total"] = df[["Math", "Science", "English"]].sum(axis=1)
df["Average"] = df[["Math", "Science", "English"]].mean(axis=1)
print(df)

# Step 3: Standardize the subject scores (zero mean, unit variance)
scaler = StandardScaler()
standardized_data = pd.DataFrame(scaler.fit_transform(df[["Math", "Science", "English"]]),
                                 columns=["Math", "Science", "English"])

# Step 4: Plot standardized scores, one line per student
for student in standardized_data.index:
    plt.plot(["Math", "Science", "English"], standardized_data.iloc[student], label=df["Student"].iloc[student])
plt.legend()
plt.title("Standardized Scores")
plt.show()
```

## Q5: House Listings and Statistical Analysis
```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Create DataFrame
data = {"House Type": ["Apartment", "Villa", "Townhouse", "Cottage"],
        "Price": [150, 300, 200, 180],
        "Area": [900, 2200, 1500, 1200]}
df = pd.DataFrame(data)

# Calculate stats
print("Price and Area Stats:")
print("Sum:", df[["Price", "Area"]].sum())
print("Mean:", df[["Price", "Area"]].mean())
print("Std Dev:", df[["Price", "Area"]].std())

# Standardize the numeric columns
scaler = StandardScaler()
df[["Price", "Area"]] = scaler.fit_transform(df[["Price", "Area"]])
print(df)
```

## Employee Performance and Salary Dataset Questions

### Question 1: Basic Data Preprocessing
```python
import pandas as pd
import numpy as np

# Create DataFrame
data = {
    "Employee_ID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "Age": [25, 28, 35, 40, 29, 50, 32, 42, 23, 37],
    "Department": ["HR", "Finance", "IT", "HR", "Marketing", "IT", "Finance", "Marketing", "HR", "IT"],
    "Years_of_Experience": [2, 5, 10, 15, 6, 25, 8, 18, 1, 12],
    "Salary": [45000, 65000, 90000, 75000, 72000, 120000, 84000, 97000, 40000, 88000],
    "Performance_Score": [75, 82, 88, 70, 85, 92, 78, 90, 65, 83],
    "Promotions_Last_5_Years": [0, 1, 1, 0, 1, 1, 1, 0, 0, 1],
    "Education_Level": ["Bachelors", "Masters", "PhD", "Masters", "Bachelors", "PhD", "Masters", "PhD", "Bachelors", "Masters"]
}
df = pd.DataFrame(data)

# Step 1: Check and handle missing values
print("Missing Values:")
print(df.isnull().sum())  # No missing values in this dataset

# Step 2: Detect outliers using the IQR rule (one way to handle them is sketched below)
for col in ["Age", "Salary", "Performance_Score"]:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    print(f"{col} Outliers:", df[(df[col] < lower_bound) | (df[col] > upper_bound)])
```
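
The loop above only detects outliers. If the exercise also expects them to be handled, one common option, sketched here on a copy so the later questions still use the original values, is to clip each column to its IQR bounds:
```python
# Cap outliers by clipping to the IQR bounds (applied to a copy of the data)
df_capped = df.copy()
for col in ["Age", "Salary", "Performance_Score"]:
    Q1, Q3 = df_capped[col].quantile(0.25), df_capped[col].quantile(0.75)
    IQR = Q3 - Q1
    df_capped[col] = df_capped[col].clip(lower=Q1 - 1.5 * IQR, upper=Q3 + 1.5 * IQR)
print(df_capped.describe())
```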

### Question 2: Skewness, Kurtosis, and Box Plot
```python
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt

# Calculate skewness and kurtosis for the numeric columns
for col in ["Age", "Salary", "Performance_Score"]:
    print(f"{col} Skewness:", skew(df[col]))
    print(f"{col} Kurtosis:", kurtosis(df[col]))

# Plot a box plot for Salary
plt.boxplot(df["Salary"], vert=False)
plt.title("Salary Distribution")
plt.show()
```

### Question 3: Feature Selection using ANOVA
```python
from scipy.stats import f_oneway

# Perform ANOVA-style F-tests
# Note: f_oneway here treats each feature column and the target column as two
# independent groups and tests whether their means differ (see the alternative below).
features = ["Age", "Years_of_Experience", "Salary"]
target = df["Performance_Score"]
for feature in features:
    f_val, p_val = f_oneway(df[feature], target)
    print(f"{feature}: F-value = {f_val}, P-value = {p_val}")
```
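
As written, `f_oneway` compares each feature column with the target column as if they were two independent groups, so the printed F-values are not the usual feature-selection ANOVA scores. A possible alternative, sketched below assuming scikit-learn is available, is the univariate F-test from `sklearn.feature_selection.f_regression`, which scores each feature against the continuous Performance_Score target:
```python
from sklearn.feature_selection import f_regression

# Univariate F-test of each feature against Performance_Score (sketch)
X = df[["Age", "Years_of_Experience", "Salary"]]
F_vals, p_vals = f_regression(X, df["Performance_Score"])
for feature, f_val, p_val in zip(X.columns, F_vals, p_vals):
    print(f"{feature}: F-value = {f_val:.3f}, P-value = {p_val:.3f}")
```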

### Question 4: Heatmap for Correlation
```python
import seaborn as sns

# Calculate the correlation matrix over the numeric columns only
correlation_matrix = df.corr(numeric_only=True)
print("Correlation Matrix:")
print(correlation_matrix)

# Plot heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# Identify highest positive and negative correlations (excluding self-correlations)
sorted_corr = correlation_matrix.unstack().sort_values(ascending=False)
print("Highest Positive Correlation:", sorted_corr[sorted_corr < 1].idxmax())
print("Highest Negative Correlation:", sorted_corr.idxmin())
```

### Question 5: Regression and Classification Models
```python
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Regression task: predict Salary from the numeric features
X_reg = df[["Age", "Years_of_Experience", "Performance_Score"]]
y_reg = df["Salary"]
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
print("Regression Coefficients:", reg_model.coef_)

# Classification task: predict Promotions_Last_5_Years
# Drop the ID column and the string-valued columns, which LogisticRegression
# cannot consume without encoding.
X_clf = df.drop(columns=["Promotions_Last_5_Years", "Employee_ID", "Department", "Education_Level"])
y_clf = df["Promotions_Last_5_Years"]
X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=0)
clf_model = LogisticRegression(max_iter=1000)
clf_model.fit(X_train, y_train)
y_pred = clf_model.predict(X_test)
print("Classification Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
```
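
The block above reports only the regression coefficients and the classification metrics. If a test-set evaluation of the regression model is also wanted, a minimal sketch (re-creating the regression split, since `X_test`/`y_test` were reused for classification; with only two test rows the numbers are purely illustrative) could be:
```python
from sklearn.metrics import mean_absolute_error, r2_score

# Re-create the regression split with the same random_state used above
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=0)
y_reg_pred = reg_model.predict(Xr_test)
print("Regression R^2:", r2_score(yr_test, y_reg_pred))
print("Regression MAE:", mean_absolute_error(yr_test, y_reg_pred))
```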