Advertisement
Coda_D

Untitled

Jun 11th, 2025
1,329
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
SQL 15.18 KB | Help | 0 0
  1. /*
  2. Remember to replace placeholder names like `your_schema_name`, `source_table`, `target_table` with actual names. Also, specific syntax for date functions or system catalog queries can vary slightly between SQL databases (e.g., `GETDATE()` in SQL Server vs. `NOW()` in PostgreSQL/MySQL for current timestamp).
  3. */
  4.  
  5. **Hypothetical TABLE Structures:**
  6.  
  7. --`employees` table:**
  8.     * `employee_id` (INT, PRIMARY KEY)
  9.     * `first_name` (VARCHAR)
  10.     * `last_name` (VARCHAR)
  11.     * `email` (VARCHAR)
  12.     * `phone_number` (VARCHAR)
  13.     * `hire_date` (DATE)
  14.     * `job_id` (VARCHAR)
  15.     * `salary` (DECIMAL)
  16.     * `commission_pct` (DECIMAL, NULLABLE)
  17.     * `manager_id` (INT, NULLABLE, FOREIGN KEY TO `employee_id`)
  18.     * `department_id` (INT, NULLABLE, FOREIGN KEY TO `departments.department_id`)
  19.     * `last_updated` (TIMESTAMP)
  20.  
  21. --`departments` table:**
  22.     * `department_id` (INT, PRIMARY KEY)
  23.     * `department_name` (VARCHAR)
  24.     * `location_id` (INT)
  25.  
  26. Let's get started:
  27.  
  28. ---
  29.  
  30. ### I. Data Cleaning Scripts
  31.  
  32. Data cleaning involves identifying and correcting or removing errors, inconsistencies, and inaccuracies in data.
  33.  
  34. **1. Finding and Handling NULL Values**
  35.  
  36. --Count NULLs in a specific column:**
  37. --sql
  38.    SELECT COUNT(*)
  39.    FROM employees
  40.    WHERE commission_pct IS NULL;
  41. --
  42.    --*Explanation: Counts how many employees have no commission percentage recorded.*
  43.  
  44. --Select rows where a column is NULL:**
  45. --sql
  46.    SELECT *
  47.    FROM employees
  48.    WHERE department_id IS NULL;
  49. --
  50.    --*Explanation: Retrieves all employees not assigned to a department.*
  51.  
  52. --Update NULL values to a default (e.g., 0 for commission_pct):**
  53. --sql
  54.    UPDATE employees
  55.    SET commission_pct = 0
  56.    WHERE commission_pct IS NULL;
  57. --
  58.    --*Explanation: Sets the commission percentage to 0 for all employees where it was previously NULL.*
  59.  
  60. --Using COALESCE to provide a default value in a query (doesn't CHANGE the underlying DATA):**
  61. --sql
  62.     SELECT
  63.         employee_id,
  64.         first_name,
  65.         last_name,
  66.         COALESCE(commission_pct, 0) AS commission_rate
  67.     FROM employees;
  68. --
  69.     --*Explanation: If `commission_pct` is NULL, it displays 0 in the `commission_rate` column for that row.*
  70.  
  71. **2. Removing Duplicate ROWS**
  72.  
  73. --Identify duplicate rows based on certain columns (e.g., email):**
  74. --sql
  75.     SELECT
  76.         email,
  77.         COUNT(*)
  78.     FROM employees
  79.     GROUP BY email
  80.     HAVING COUNT(*) > 1;
  81. --
  82.     --*Explanation: Shows emails that appear more than once in the `employees` table.*
  83.  
  84. --Delete duplicate rows (keeping one instance - method varies by SQL dialect):**
  85.     --Using a Common Table Expression (CTE) and ROW_NUMBER() (supported by many modern SQL databases):**
  86.     --sql
  87.         WITH RowNumCTE AS (
  88.             SELECT
  89.                 employee_id,
  90.                 email,
  91.                 ROW_NUMBER() OVER(PARTITION BY email ORDER BY hire_date DESC) AS rn
  92.             FROM employees
  93.         )
  94.         DELETE FROM RowNumCTE
  95.         WHERE rn > 1;
  96.     --
  97.         --*Explanation: Assigns a row number to each employee within groups of identical emails, ordered by hire date (keeping the most recent). Then deletes rows where the row number is greater than 1 (i.e., the duplicates).*
  98.  
  99. **3. Standardizing DATA Formats**
  100.  
  101. --Convert text to uppercase or lowercase:**
  102. --sql
  103.     UPDATE employees
  104.     SET email = LOWER(email);
  105. --
  106.     --*Explanation: Converts all email addresses to lowercase to ensure consistency.*
  107.  
  108. --Trim leading/trailing whitespace:**
  109. --sql
  110.     UPDATE employees
  111.     SET first_name = TRIM(first_name),
  112.         last_name = TRIM(last_name);
  113. --
  114.     --*Explanation: Removes any accidental spaces before or after first and last names.*
  115.  
  116. --Format dates (example: displaying hire_date in 'YYYY-MM-DD' format - specific function may vary):**
  117. --sql
  118.     SELECT
  119.         employee_id,
  120.         TO_CHAR(hire_date, 'YYYY-MM-DD') AS formatted_hire_date -- PostgreSQL/Oracle
  121.         -- FORMAT(hire_date, 'yyyy-MM-dd') -- SQL Server
  122.         -- DATE_FORMAT(hire_date, '%Y-%m-%d') -- MySQL
  123.     FROM employees;
  124. --
  125.     --*Explanation: Shows the hire date in a specific string format.*
  126.  
  127. **4. Correcting Inconsistent VALUES**
  128.  
  129. --Using CASE statements to standardize values:**
  130. --sql
  131.     UPDATE employees
  132.     SET job_id = CASE
  133.                     WHEN job_id = 'SALES_REP' THEN 'SA_REP'
  134.                     WHEN job_id = 'SALES_MAN' THEN 'SA_MAN'
  135.                     ELSE job_id
  136.                  END
  137.     WHERE job_id IN ('SALES_REP', 'SALES_MAN');
  138. --
  139.     --*Explanation: Standardizes different codes for similar job roles.*
  140.  
  141. ---
  142.  
  143. ### II. DATA Manipulation Scripts
  144.  
  145. DATA manipulation involves adding, deleting, OR modifying DATA IN the DATABASE.
  146.  
  147. **1. Inserting DATA**
  148.  
  149. --Insert a new row:**
  150. --sql
  151.     INSERT INTO departments (department_id, department_name, location_id)
  152.     VALUES (280, 'Public Relations', 1700);
  153. --
  154.     --*Explanation: Adds a new department to the `departments` table.*
  155.  
  156. --Insert data from another table:**
  157. --sql
  158.     INSERT INTO employees_archive (employee_id, first_name, last_name, hire_date)
  159.     SELECT employee_id, first_name, last_name, hire_date
  160.     FROM employees
  161.     WHERE hire_date < '2010-01-01';
  162. --
  163.     --*Explanation: Copies employees hired before 2010 into an `employees_archive` table.*
  164.  
  165. **2. Updating DATA**
  166.  
  167. --Update specific records:**
  168. --sql
  169.     UPDATE employees
  170.     SET salary = salary * 1.05, last_updated = CURRENT_TIMESTAMP
  171.     WHERE department_id = 90; -- Assuming department_id 90 is 'Executive'
  172. --
  173.     --*Explanation: Gives a 5% salary raise to all employees in department 90 and updates their `last_updated` timestamp.*
  174.  
  175. --Conditional update:**
  176. --sql
  177.     UPDATE employees
  178.     SET commission_pct = 0.10
  179.     WHERE department_id IN (SELECT department_id FROM departments WHERE department_name = 'Sales')
  180.       AND commission_pct IS NULL;
  181. --
  182.     --*Explanation: Sets a default commission of 10% for salespeople who don't have one.*
  183.  
  184. **3. Deleting DATA**
  185.  
  186. --Delete specific records:**
  187. --sql
  188.     DELETE FROM employees
  189.     WHERE employee_id = 207;
  190. --
  191.     --*Explanation: Removes the employee with `employee_id` 207.*
  192.  
  193. --Delete records based on a condition:**
  194. --sql
  195.     DELETE FROM employees
  196.     WHERE department_id IS NULL AND hire_date < '2024-01-01';
  197. --
  198.     --*Explanation: Deletes employees who are not assigned to a department and were hired before 2024.*
  199.  
  200. ---
  201.  
  202. ### III. DATA Analysis Scripts
  203.  
  204. These scripts are used TO query AND retrieve meaningful insights FROM the DATA.
  205.  
  206. **1. Basic SELECT AND Filtering**
  207.  
  208. --Select all columns from a table:**
  209. --sql
  210.     SELECT * FROM employees;
  211. --
  212.  
  213. --Select specific columns:**
  214. --sql
  215.     SELECT first_name, last_name, email, salary
  216.     FROM employees;
  217. --
  218.  
  219. --Filtering with WHERE:**
  220. --sql
  221.     SELECT first_name, last_name, salary
  222.     FROM employees
  223.     WHERE salary > 10000
  224.     AND department_id = 80; -- Sales department
  225. --
  226.     --*Explanation: Finds salespeople earning more than $10,000.*
  227.  
  228. --Using LIKE for pattern matching:**
  229. --sql
  230.     SELECT first_name, last_name
  231.     FROM employees
  232.     WHERE last_name LIKE 'S%';
  233. --
  234.     --*Explanation: Finds employees whose last name starts with 'S'.*
  235.  
  236. --Using IN to specify multiple possible values:**
  237. --sql
  238.     SELECT *
  239.     FROM employees
  240.     WHERE department_id IN (30, 50, 80);
  241. --
  242.     --*Explanation: Retrieves employees from departments 30, 50, or 80.*
  243.  
  244. **2. Aggregation AND GROUPING**
  245.  
  246. --Count total employees:**
  247. --sql
  248.     SELECT COUNT(*) AS total_employees FROM employees;
  249. --
  250.  
  251. --Calculate average salary:**
  252. --sql
  253.     SELECT AVG(salary) AS average_salary FROM employees;
  254. --
  255.  
  256. --Find min/max salary:**
  257. --sql
  258.     SELECT MIN(salary) AS min_salary, MAX(salary) AS max_salary FROM employees;
  259. --
  260.  
  261. --Count employees and average salary per department:**
  262. --sql
  263.     SELECT
  264.         d.department_name,
  265.         COUNT(e.employee_id) AS number_of_employees,
  266.         AVG(e.salary) AS average_salary
  267.     FROM employees e
  268.     JOIN departments d ON e.department_id = d.department_id
  269.     GROUP BY d.department_name
  270.     ORDER BY average_salary DESC;
  271. --
  272.     --*Explanation: Shows the number of employees and average salary for each department, ordered by the highest average salary.*
  273.  
  274. --Filtering groups with HAVING:**
  275. --sql
  276.     SELECT
  277.         department_id,
  278.         COUNT(*) AS num_employees,
  279.         AVG(salary) AS avg_salary
  280.     FROM employees
  281.     GROUP BY department_id
  282.     HAVING COUNT(*) > 10
  283.     ORDER BY avg_salary DESC;
  284. --
  285.     --*Explanation: Shows departments with more than 10 employees, ordered by average salary.*
  286.  
  287. **3. Joins**
  288.  
  289. --INNER JOIN (get employees and their department names):**
  290. --sql
  291.     SELECT
  292.         e.first_name,
  293.         e.last_name,
  294.         d.department_name
  295.     FROM employees e
  296.     INNER JOIN departments d ON e.department_id = d.department_id;
  297. --
  298.     --*Explanation: Retrieves employees who have a matching department in the `departments` table.*
  299.  
  300. --LEFT JOIN (get all employees, and their department names if they have one):**
  301. --sql
  302.     SELECT
  303.         e.first_name,
  304.         e.last_name,
  305.         COALESCE(d.department_name, 'No Department') AS department_name
  306.     FROM employees e
  307.     LEFT JOIN departments d ON e.department_id = d.department_id;
  308. --
  309.     --*Explanation: Retrieves all employees; if an employee has no department, 'No Department' is shown.*
  310.  
  311. --RIGHT JOIN (get all departments, and their employees if any):**
  312. --sql
  313.     SELECT
  314.         d.department_name,
  315.         e.first_name,
  316.         e.last_name
  317.     FROM employees e
  318.     RIGHT JOIN departments d ON e.department_id = d.department_id;
  319. --
  320.     --*Explanation: Retrieves all departments; if a department has no employees, employee fields will be NULL.*
  321.  
  322. --FULL OUTER JOIN (get all employees and all departments):**
  323. --sql
  324.     SELECT
  325.         e.first_name,
  326.         e.last_name,
  327.         d.department_name
  328.     FROM employees e
  329.     FULL OUTER JOIN departments d ON e.department_id = d.department_id;
  330. --
  331.     --*Explanation: Retrieves all rows from both tables, matching where possible, and filling with NULLs otherwise.*
  332.  
  333. **4. Subqueries**
  334.  
  335. --Subquery in the WHERE clause (employees earning more than average):**
  336. --sql
  337.     SELECT first_name, last_name, salary
  338.     FROM employees
  339.     WHERE salary > (SELECT AVG(salary) FROM employees);
  340. --
  341.     --*Explanation: Finds employees whose salary is above the company average.*
  342.  
  343. --Subquery in the FROM clause (derived table):**
  344. --sql
  345.     SELECT
  346.         dept_summary.department_name,
  347.         dept_summary.avg_dept_salary
  348.     FROM (
  349.         SELECT
  350.             d.department_name,
  351.             AVG(e.salary) AS avg_dept_salary
  352.         FROM employees e
  353.         JOIN departments d ON e.department_id = d.department_id
  354.         GROUP BY d.department_name
  355.     ) AS dept_summary
  356.     WHERE dept_summary.avg_dept_salary > 7000;
  357. --
  358.     --*Explanation: First calculates average salaries per department, then filters those departments where the average is above $7000.*
  359.  
  360. **5. Window Functions (FOR more advanced analysis)**
  361.  
  362. --Rank employees by salary within each department:**
  363. --sql
  364.     SELECT
  365.         first_name,
  366.         last_name,
  367.         salary,
  368.         department_id,
  369.         RANK() OVER (PARTITION BY department_id ORDER BY salary DESC) AS salary_rank_in_dept
  370.     FROM employees;
  371. --
  372.     --*Explanation: Assigns a rank to each employee based on their salary within their department.*
  373.  
  374. --Calculate running total of salaries:**
  375. --sql
  376.     SELECT
  377.         employee_id,
  378.         hire_date,
  379.         salary,
  380.         SUM(salary) OVER (ORDER BY hire_date, employee_id) AS running_total_salary
  381.     FROM employees;
  382. --
  383.     --*Explanation: Shows a cumulative sum of salaries as you go through employees ordered by their hire date.*
  384.  
  385. --Compare employee salary to average salary of their department:**
  386. --sql
  387.     SELECT
  388.         e.first_name,
  389.         e.last_name,
  390.         e.salary,
  391.         d.department_name,
  392.         AVG(e.salary) OVER (PARTITION BY e.department_id) AS avg_dept_salary,
  393.         e.salary - AVG(e.salary) OVER (PARTITION BY e.department_id) AS diff_from_avg
  394.     FROM employees e
  395.     JOIN departments d ON e.department_id = d.department_id;
  396. --
  397.     --*Explanation: For each employee, shows their salary, their department's average salary, and the difference.*
  398.  
  399. ---
  400.  
  401. ### IV. Smoke Testing Scripts
  402.  
  403. Smoke tests are quick checks TO ensure the DATA AND DATABASE STRUCTURE are fundamentally sound.
  404.  
  405. **1. TABLE Existence AND STRUCTURE**
  406.  
  407. --Check if a table exists (method varies by SQL dialect):**
  408.     --SQL Server:**
  409.     --sql
  410.         IF OBJECT_ID('employees', 'U') IS NOT NULL
  411.             SELECT 'Table employees exists.'
  412.         ELSE
  413.             SELECT 'Table employees does NOT exist.';
  414.     --
  415.     --PostgreSQL/MySQL (Information Schema):**
  416.     --sql
  417.         SELECT TABLE_NAME
  418.         FROM information_schema.TABLES
  419.         WHERE table_schema = 'your_schema_name' -- or current_database() for PG, database() for MySQL
  420.           AND TABLE_NAME = 'employees';
  421.     --
  422.         --*Explanation: Queries the system catalog to verify table presence.*
  423.  
  424. **2. ROW Counts**
  425.  
  426. --Get total row count for a table:**
  427. --sql
  428.     SELECT COUNT(*) FROM employees;
  429. --
  430.     --*Explanation: Confirms the table is not empty and gives a sense of data volume.*
  431.  
  432. --Compare row counts between related tables (e.g., after an ETL process):**
  433. --sql
  434.     SELECT
  435.         (SELECT COUNT(*) FROM source_table) AS source_count,
  436.         (SELECT COUNT(*) FROM target_table) AS target_count;
  437. --
  438.     --*Explanation: Basic check for data transfer completeness.*
  439.  
  440. **3. Basic DATA Integrity Checks**
  441.  
  442. --Check for unexpected NULLs in critical columns:**
  443. --sql
  444.     SELECT COUNT(*)
  445.     FROM employees
  446.     WHERE employee_id IS NULL OR first_name IS NULL OR hire_date IS NULL;
  447. --
  448.     --*Explanation: Primary keys or essential attributes should generally not be NULL.*
  449.  
  450. --Check for foreign key integrity (e.g., all `department_id` in `employees` exist in `departments`):**
  451. --sql
  452.     SELECT e.employee_id, e.department_id
  453.     FROM employees e
  454.     LEFT JOIN departments d ON e.department_id = d.department_id
  455.     WHERE d.department_id IS NULL AND e.department_id IS NOT NULL;
  456. --
  457.     --*Explanation: Identifies "orphan" records in the `employees` table that reference non-existent departments.*
  458.  
  459. --Check for duplicate primary keys:**
  460. --sql
  461.     SELECT employee_id, COUNT(*)
  462.     FROM employees
  463.     GROUP BY employee_id
  464.     HAVING COUNT(*) > 1;
  465. --
  466.     --*Explanation: Primary keys must be unique; this query finds any violations.*
  467.  
  468. --Check for valid date ranges:**
  469. --sql
  470.     SELECT MIN(hire_date), MAX(hire_date)
  471.     FROM employees;
  472.     -- Also check for future dates if not expected:
  473.     SELECT COUNT(*) FROM employees WHERE hire_date > CURRENT_DATE;
  474. --
  475.     --*Explanation: Ensures dates are within expected boundaries.*
  476.  
  477. --Check for valid values in categorical columns:**
  478. --sql
  479.     SELECT DISTINCT job_id
  480.     FROM employees
  481.     ORDER BY job_id;
  482.     -- Then manually review if the distinct values are all expected.
  483.     -- Or check for values NOT IN a predefined list:
  484.     SELECT job_id, COUNT(*)
  485.     FROM employees
  486.     WHERE job_id NOT IN ('SA_REP', 'IT_PROG', 'MK_MAN', 'HR_REP', 'AD_PRES') -- List of known valid job_ids
  487.     GROUP BY job_id;
  488. --
  489.     --*Explanation: Helps identify typos or unexpected categories.*
  490.  
  491. ---
  492.  
  493.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement