Advertisement
tills

testdata2

May 2nd, 2025
211
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Scala 8.67 KB | None | 0 0
  /**
   * Builds a fixed, in-memory sales data set for two shops and returns it as a DataFrame.
   *
   * Columns, in order: `shop_id`, `month_id`, `patient_id`, `trn_id`, `pack_id`, `day_id`,
   * `prescriber_id`. All columns are integers; `prescriber_id` is the only nullable one.
   *
   *  - Shop 1 (the "good" shop): the same 20 patients/transactions in each of the months
   *    202101 to 202104, so its monthly figures are stable.
   *  - Shop 2 (the "bad" shop): 30 rows in 202101, 5 rows in 202102, 25 rows in 202103 and
   *    intentionally no rows in 202104.
   *
   * Requires a `spark` session in scope (its implicits are imported here). As a side effect,
   * prints the number of generated records to standard output.
   *
   * @return a DataFrame of 140 rows, built from the shop 1 rows followed by the shop 2 rows
   */
  def generateSalesData(): org.apache.spark.sql.DataFrame = {
    import spark.implicits._

    // Rows are modelled as plain tuples (with Option[Int] for the nullable prescriber) rather
    // than a case class declared inside this method: Spark derives encoders from TypeTags,
    // which are generally unavailable for method-local classes, whereas tuple encoders are
    // always available through spark.implicits.

    // One entry per patient, in patient order: (pack_id, day of month, prescriber_id).
    // A prescriber of None becomes a null prescriber_id in the DataFrame.
    val goodShopVisits = Seq[(Int, Int, Option[Int])](
      (1, 15, Some(1)), (1, 15, Some(1)), (1, 15, Some(1)), (1, 15, Some(2)), (1, 15, Some(2)),
      (2, 15, Some(2)), (2, 15, Some(3)), (2, 15, Some(3)), (2, 15, Some(3)), (2, 15, Some(1)),
      (3, 16, None),    (3, 16, None),    (3, 16, None),    (1, 16, Some(2)), (1, 16, Some(2)),
      (1, 16, Some(3)), (1, 16, Some(3)), (2, 16, Some(1)), (2, 16, Some(1)), (2, 16, Some(2))
    )

    // Same layout for the bad shop's busiest month (30 patients).
    val badShopVisits = Seq[(Int, Int, Option[Int])](
      (1, 15, Some(1)), (1, 15, Some(1)), (1, 15, Some(1)), (1, 15, Some(2)), (1, 15, Some(2)),
      (1, 15, Some(3)), (1, 15, Some(1)), (1, 15, Some(2)), (1, 15, Some(3)), (1, 15, Some(1)),
      (1, 16, Some(2)), (1, 16, Some(3)), (1, 16, Some(1)), (1, 16, Some(2)), (1, 16, Some(3)),
      (2, 16, Some(1)), (2, 16, Some(2)), (2, 16, Some(3)), (2, 16, Some(1)), (2, 16, Some(2)),
      (3, 17, None),    (3, 17, None),    (3, 17, None),    (3, 17, None),    (3, 17, None),
      (2, 17, Some(3)), (2, 17, Some(1)), (2, 17, Some(2)), (2, 17, Some(3)), (2, 17, Some(1))
    )

    // Expands a visit template into full rows for one shop and month. Patient and transaction
    // ids are numbered consecutively from the given starting values; day_id is yyyymmdd.
    def monthRows(
        shopId: Int,
        monthId: Int,
        firstPatientId: Int,
        firstTrnId: Int,
        visits: Seq[(Int, Int, Option[Int])]
    ): Seq[(Int, Int, Int, Int, Int, Int, Option[Int])] =
      visits.zipWithIndex.map { case ((packId, dayOfMonth, prescriberId), offset) =>
        (
          shopId,
          monthId,
          firstPatientId + offset,
          firstTrnId + offset,
          packId,
          monthId * 100 + dayOfMonth,
          prescriberId
        )
      }

    // Data for the good shop (shop_id=1): identical patients and visit pattern in every month;
    // transaction ids start at 101, 201, 301 and 401 respectively.
    val goodShopData = Seq(202101, 202102, 202103, 202104).flatMap { monthId =>
      monthRows(
        shopId = 1,
        monthId = monthId,
        firstPatientId = 1,
        firstTrnId = (monthId % 100) * 100 + 1,
        visits = goodShopVisits
      )
    }

    // Data for the bad shop (shop_id=2): large month-to-month variations.
    val badShopData =
      // Month 1 - 202101 - 30 patients, mostly with a prescriber
      monthRows(
        shopId = 2,
        monthId = 202101,
        firstPatientId = 101,
        firstTrnId = 1001,
        visits = badShopVisits
      ) ++
      // Month 2 - 202102 - only 5 patients, none with a prescriber
      monthRows(
        shopId = 2,
        monthId = 202102,
        firstPatientId = 101,
        firstTrnId = 2001,
        visits = Seq.fill(5)((3, 15, Option.empty[Int]))
      ) ++
      // Month 3 - 202103 - 25 patients, same pattern as the first 25 of month 1
      monthRows(
        shopId = 2,
        monthId = 202103,
        firstPatientId = 101,
        firstTrnId = 3001,
        visits = badShopVisits.take(25)
      )
      // Month 4 - 202104 - no data intentionally (meant to violate LOW_TRANSACTIONS_NUMBER
      // and LOW_PATIENTS_NUMBER)

    // Combine data and create the DataFrame with explicit column names.
    val allData = goodShopData ++ badShopData
    val salesDF = allData.toDF(
      "shop_id", "month_id", "patient_id", "trn_id", "pack_id", "day_id", "prescriber_id"
    )

    // Report the record count from the local collection; this avoids running a Spark job
    // just to produce the log line.
    println(s"Wygenerowano dane sprzedaży z ${allData.size} rekordami")

    salesDF
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement