Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Builds a small, deterministic sales fixture covering two shops.
 *
 *  - Shop 1 (the "good" shop) has the same 20 patients / 20 transactions in each
 *    of the months 202101..202104.
 *  - Shop 2 (the "bad" shop) swings from 30 patients (202101) to 5 patients with
 *    no prescriber (202102) to 25 patients (202103) and has no rows at all for
 *    202104, which is intended to violate the LOW_TRANSACTIONS_NUMBER and
 *    LOW_PATIENTS_NUMBER rules.
 *
 * Rows are built as tuples rather than instances of a method-local case class:
 * Spark derives encoders from a `TypeTag`, which is generally not available for
 * classes declared inside a method body. The nullable `prescriber_id` is modelled
 * as `Option[Int]`, which Spark maps to a nullable integer column.
 *
 * Side effect: prints the row count, which triggers a Spark job (`count()`).
 *
 * @return DataFrame with columns `shop_id`, `month_id`, `patient_id`, `trn_id`,
 *         `pack_id`, `day_id` (all non-nullable int) and `prescriber_id`
 *         (nullable int); 140 rows in total.
 */
def generateSalesData(): org.apache.spark.sql.DataFrame = {
  import spark.implicits._

  // A "slot" describes one patient's purchase within a month, independent of the
  // month itself: (pack_id, day of month, prescriber_id).

  /** Consecutive slots sharing a pack and day, one per given prescriber id. */
  def prescribed(packId: Int, day: Int)(prescriberIds: Int*): Seq[(Int, Int, Option[Int])] =
    prescriberIds.map(id => (packId, day, Option(id)))

  /** `count` consecutive slots sharing a pack and day, with no prescriber. */
  def unprescribed(packId: Int, day: Int, count: Int): Seq[(Int, Int, Option[Int])] =
    Seq.fill[(Int, Int, Option[Int])](count)((packId, day, None))

  /**
   * Expands slots into full rows for one shop and month (1-based, year 2021).
   * Patient ids count up from `firstPatientId`; transaction ids count up from
   * `trnBase + 1`; `day_id` is encoded as yyyyMMdd.
   */
  def monthRows(
      shopId: Int,
      month: Int,
      firstPatientId: Int,
      trnBase: Int,
      slots: Seq[(Int, Int, Option[Int])]
  ): Seq[(Int, Int, Int, Int, Int, Int, Option[Int])] = {
    val monthId = 202100 + month
    slots.zipWithIndex.map { case ((packId, day, prescriberId), i) =>
      (shopId, monthId, firstPatientId + i, trnBase + i + 1, packId, monthId * 100 + day, prescriberId)
    }
  }

  // Good shop: 20 patients per month, mostly prescribed, identical every month.
  val goodShopSlots =
    prescribed(1, 15)(1, 1, 1, 2, 2) ++
      prescribed(2, 15)(2, 3, 3, 3, 1) ++
      unprescribed(3, 16, 3) ++
      prescribed(1, 16)(2, 2, 3, 3) ++
      prescribed(2, 16)(1, 1, 2)

  // Data for the good shop (shop_id=1): patients 1..20, trn_id = month * 100 + patient.
  val goodShopData = (1 to 4).flatMap { month =>
    monthRows(shopId = 1, month = month, firstPatientId = 1, trnBase = month * 100, slots = goodShopSlots)
  }

  // Bad shop, busy month: 30 patients, mostly prescribed.
  val badShopBusySlots =
    prescribed(1, 15)(1, 1, 1, 2, 2, 3, 1, 2, 3, 1) ++
      prescribed(1, 16)(2, 3, 1, 2, 3) ++
      prescribed(2, 16)(1, 2, 3, 1, 2) ++
      unprescribed(3, 17, 5) ++
      prescribed(2, 17)(3, 1, 2, 3, 1)

  // Data for the bad shop (shop_id=2): patients start at 101, trn_id = month * 1000 + n.
  val badShopData =
    // Month 1 - 202101 - lots of RX (30 patients)
    monthRows(shopId = 2, month = 1, firstPatientId = 101, trnBase = 1000, slots = badShopBusySlots) ++
      // Month 2 - 202102 - few patients (5) and no RX (to create high RX trend)
      monthRows(shopId = 2, month = 2, firstPatientId = 101, trnBase = 2000, slots = unprescribed(3, 15, 5)) ++
      // Month 3 - 202103 - many patients (25) and lots of RX again: the first 25 busy-month slots
      monthRows(shopId = 2, month = 3, firstPatientId = 101, trnBase = 3000, slots = badShopBusySlots.take(25))
      // Month 4 - 202104 - no data intentionally

  // Combine data and create the DataFrame with the original column names and order.
  val allData = goodShopData ++ badShopData
  val salesDF = allData.toDF(
    "shop_id",
    "month_id",
    "patient_id",
    "trn_id",
    "pack_id",
    "day_id",
    "prescriber_id"
  )
  println(s"Wygenerowano dane sprzedaży z ${salesDF.count()} rekordami")
  salesDF
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement