Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Builds the test fixture for the `lrx_panel` table and registers it as a
 * temporary view.
 *
 * The panel contains exactly two shops (ids 1 and 2): one intended as the
 * well-behaved shop and one as the badly-behaved shop.
 *
 * @param spark session used to build the DataFrame and to register the view
 */
def generatePanelData(spark: SparkSession): Unit = {
  import spark.implicits._

  val shopIds = Seq(1, 2)
  // Single-column DataFrame published under the name the queries expect.
  shopIds.toDF("shop_id").createOrReplaceTempView("lrx_panel")
  println("Tabela lrx_panel utworzona z 2 sklepami")
}
/**
 * Builds the test fixture for the `dim_pack` table and registers it as a
 * temporary view.
 *
 * Three packs are created: two in the "RX" basket and one in the "OTC"
 * basket, all with the same validity range (20210101 to 20211231).
 * `pack_id` is an integer column.
 *
 * @param spark session used to build the DataFrame and to register the view
 */
def generateDimPackData(spark: SparkSession): Unit = {
  import spark.implicits._

  val columnNames = Seq("pack_id", "basket_4_code", "day_id_from", "day_id_to")
  val packs = Seq(
    (1, "RX", 20210101, 20211231),
    (2, "RX", 20210101, 20211231),
    (3, "OTC", 20210101, 20211231)
  )

  // Publish the fixture under the name the queries expect.
  packs.toDF(columnNames: _*).createOrReplaceTempView("dim_pack")
  println("Tabela dim_pack utworzona z 3 produktami")
}
/**
 * Generates the sales test fixture and returns it as a DataFrame.
 *
 * Each row is `(shop_id, month_id, patient_id, trn_id, pack_id, day_id, prescriber_id)`.
 * Shop 1 has the same 20 transactions in each of the months 202101-202104.
 * Shop 2 swings between 30, 5 and 25 transactions over 202101-202103 and has
 * no rows at all for 202104.
 *
 * `prescriber_id` is modelled as `Option[Int]`: every row for pack 3 has no
 * prescriber (`None`). Mixing bare `Int` values with `null` in that tuple slot
 * would widen the element type to `Any`, for which Spark cannot derive a
 * schema; `Option[Int]` yields a nullable integer column instead.
 *
 * @param spark session used to create the DataFrame
 * @return DataFrame with columns `shop_id`, `month_id`, `patient_id`, `trn_id`,
 *         `pack_id`, `day_id` and a nullable `prescriber_id`
 */
def generateSalesData(spark: SparkSession): DataFrame = {
  // Data for the good shop (shop_id = 1)
  val goodShopData = Seq[(Int, Int, Int, Int, Int, Int, Option[Int])](
    // month 1 - 202101 - steady values (20 patients, mostly RX)
    (1, 202101, 1, 101, 1, 20210115, Some(1)),
    (1, 202101, 2, 102, 1, 20210115, Some(1)),
    (1, 202101, 3, 103, 1, 20210115, Some(1)),
    (1, 202101, 4, 104, 1, 20210115, Some(2)),
    (1, 202101, 5, 105, 1, 20210115, Some(2)),
    (1, 202101, 6, 106, 2, 20210115, Some(2)),
    (1, 202101, 7, 107, 2, 20210115, Some(3)),
    (1, 202101, 8, 108, 2, 20210115, Some(3)),
    (1, 202101, 9, 109, 2, 20210115, Some(3)),
    (1, 202101, 10, 110, 2, 20210115, Some(1)),
    (1, 202101, 11, 111, 3, 20210116, None),
    (1, 202101, 12, 112, 3, 20210116, None),
    (1, 202101, 13, 113, 3, 20210116, None),
    (1, 202101, 14, 114, 1, 20210116, Some(2)),
    (1, 202101, 15, 115, 1, 20210116, Some(2)),
    (1, 202101, 16, 116, 1, 20210116, Some(3)),
    (1, 202101, 17, 117, 1, 20210116, Some(3)),
    (1, 202101, 18, 118, 2, 20210116, Some(1)),
    (1, 202101, 19, 119, 2, 20210116, Some(1)),
    (1, 202101, 20, 120, 2, 20210116, Some(2)),
    // month 2 - 202102 - similar steady values
    (1, 202102, 1, 201, 1, 20210215, Some(1)),
    (1, 202102, 2, 202, 1, 20210215, Some(1)),
    (1, 202102, 3, 203, 1, 20210215, Some(1)),
    (1, 202102, 4, 204, 1, 20210215, Some(2)),
    (1, 202102, 5, 205, 1, 20210215, Some(2)),
    (1, 202102, 6, 206, 2, 20210215, Some(2)),
    (1, 202102, 7, 207, 2, 20210215, Some(3)),
    (1, 202102, 8, 208, 2, 20210215, Some(3)),
    (1, 202102, 9, 209, 2, 20210215, Some(3)),
    (1, 202102, 10, 210, 2, 20210215, Some(1)),
    (1, 202102, 11, 211, 3, 20210216, None),
    (1, 202102, 12, 212, 3, 20210216, None),
    (1, 202102, 13, 213, 3, 20210216, None),
    (1, 202102, 14, 214, 1, 20210216, Some(2)),
    (1, 202102, 15, 215, 1, 20210216, Some(2)),
    (1, 202102, 16, 216, 1, 20210216, Some(3)),
    (1, 202102, 17, 217, 1, 20210216, Some(3)),
    (1, 202102, 18, 218, 2, 20210216, Some(1)),
    (1, 202102, 19, 219, 2, 20210216, Some(1)),
    (1, 202102, 20, 220, 2, 20210216, Some(2)),
    // month 3 - 202103 - similar steady values
    (1, 202103, 1, 301, 1, 20210315, Some(1)),
    (1, 202103, 2, 302, 1, 20210315, Some(1)),
    (1, 202103, 3, 303, 1, 20210315, Some(1)),
    (1, 202103, 4, 304, 1, 20210315, Some(2)),
    (1, 202103, 5, 305, 1, 20210315, Some(2)),
    (1, 202103, 6, 306, 2, 20210315, Some(2)),
    (1, 202103, 7, 307, 2, 20210315, Some(3)),
    (1, 202103, 8, 308, 2, 20210315, Some(3)),
    (1, 202103, 9, 309, 2, 20210315, Some(3)),
    (1, 202103, 10, 310, 2, 20210315, Some(1)),
    (1, 202103, 11, 311, 3, 20210316, None),
    (1, 202103, 12, 312, 3, 20210316, None),
    (1, 202103, 13, 313, 3, 20210316, None),
    (1, 202103, 14, 314, 1, 20210316, Some(2)),
    (1, 202103, 15, 315, 1, 20210316, Some(2)),
    (1, 202103, 16, 316, 1, 20210316, Some(3)),
    (1, 202103, 17, 317, 1, 20210316, Some(3)),
    (1, 202103, 18, 318, 2, 20210316, Some(1)),
    (1, 202103, 19, 319, 2, 20210316, Some(1)),
    (1, 202103, 20, 320, 2, 20210316, Some(2)),
    // month 4 - 202104 - similar steady values
    (1, 202104, 1, 401, 1, 20210415, Some(1)),
    (1, 202104, 2, 402, 1, 20210415, Some(1)),
    (1, 202104, 3, 403, 1, 20210415, Some(1)),
    (1, 202104, 4, 404, 1, 20210415, Some(2)),
    (1, 202104, 5, 405, 1, 20210415, Some(2)),
    (1, 202104, 6, 406, 2, 20210415, Some(2)),
    (1, 202104, 7, 407, 2, 20210415, Some(3)),
    (1, 202104, 8, 408, 2, 20210415, Some(3)),
    (1, 202104, 9, 409, 2, 20210415, Some(3)),
    (1, 202104, 10, 410, 2, 20210415, Some(1)),
    (1, 202104, 11, 411, 3, 20210416, None),
    (1, 202104, 12, 412, 3, 20210416, None),
    (1, 202104, 13, 413, 3, 20210416, None),
    (1, 202104, 14, 414, 1, 20210416, Some(2)),
    (1, 202104, 15, 415, 1, 20210416, Some(2)),
    (1, 202104, 16, 416, 1, 20210416, Some(3)),
    (1, 202104, 17, 417, 1, 20210416, Some(3)),
    (1, 202104, 18, 418, 2, 20210416, Some(1)),
    (1, 202104, 19, 419, 2, 20210416, Some(1)),
    (1, 202104, 20, 420, 2, 20210416, Some(2))
  )

  // Data for the bad shop (shop_id = 2) - large swings and no data in the last month
  val badShopData = Seq[(Int, Int, Int, Int, Int, Int, Option[Int])](
    // month 1 - 202101 - lots of RX (30 patients)
    (2, 202101, 101, 1001, 1, 20210115, Some(1)),
    (2, 202101, 102, 1002, 1, 20210115, Some(1)),
    (2, 202101, 103, 1003, 1, 20210115, Some(1)),
    (2, 202101, 104, 1004, 1, 20210115, Some(2)),
    (2, 202101, 105, 1005, 1, 20210115, Some(2)),
    (2, 202101, 106, 1006, 1, 20210115, Some(3)),
    (2, 202101, 107, 1007, 1, 20210115, Some(1)),
    (2, 202101, 108, 1008, 1, 20210115, Some(2)),
    (2, 202101, 109, 1009, 1, 20210115, Some(3)),
    (2, 202101, 110, 1010, 1, 20210115, Some(1)),
    (2, 202101, 111, 1011, 1, 20210116, Some(2)),
    (2, 202101, 112, 1012, 1, 20210116, Some(3)),
    (2, 202101, 113, 1013, 1, 20210116, Some(1)),
    (2, 202101, 114, 1014, 1, 20210116, Some(2)),
    (2, 202101, 115, 1015, 1, 20210116, Some(3)),
    (2, 202101, 116, 1016, 2, 20210116, Some(1)),
    (2, 202101, 117, 1017, 2, 20210116, Some(2)),
    (2, 202101, 118, 1018, 2, 20210116, Some(3)),
    (2, 202101, 119, 1019, 2, 20210116, Some(1)),
    (2, 202101, 120, 1020, 2, 20210116, Some(2)),
    (2, 202101, 121, 1021, 3, 20210117, None),
    (2, 202101, 122, 1022, 3, 20210117, None),
    (2, 202101, 123, 1023, 3, 20210117, None),
    (2, 202101, 124, 1024, 3, 20210117, None),
    (2, 202101, 125, 1025, 3, 20210117, None),
    (2, 202101, 126, 1026, 2, 20210117, Some(3)),
    (2, 202101, 127, 1027, 2, 20210117, Some(1)),
    (2, 202101, 128, 1028, 2, 20210117, Some(2)),
    (2, 202101, 129, 1029, 2, 20210117, Some(3)),
    (2, 202101, 130, 1030, 2, 20210117, Some(1)),
    // month 2 - 202102 - few patients (5) and no RX
    (2, 202102, 101, 2001, 3, 20210215, None),
    (2, 202102, 102, 2002, 3, 20210215, None),
    (2, 202102, 103, 2003, 3, 20210215, None),
    (2, 202102, 104, 2004, 3, 20210215, None),
    (2, 202102, 105, 2005, 3, 20210215, None),
    // month 3 - 202103 - many patients again (25) and lots of RX
    (2, 202103, 101, 3001, 1, 20210315, Some(1)),
    (2, 202103, 102, 3002, 1, 20210315, Some(1)),
    (2, 202103, 103, 3003, 1, 20210315, Some(1)),
    (2, 202103, 104, 3004, 1, 20210315, Some(2)),
    (2, 202103, 105, 3005, 1, 20210315, Some(2)),
    (2, 202103, 106, 3006, 1, 20210315, Some(3)),
    (2, 202103, 107, 3007, 1, 20210315, Some(1)),
    (2, 202103, 108, 3008, 1, 20210315, Some(2)),
    (2, 202103, 109, 3009, 1, 20210315, Some(3)),
    (2, 202103, 110, 3010, 1, 20210315, Some(1)),
    (2, 202103, 111, 3011, 1, 20210316, Some(2)),
    (2, 202103, 112, 3012, 1, 20210316, Some(3)),
    (2, 202103, 113, 3013, 1, 20210316, Some(1)),
    (2, 202103, 114, 3014, 1, 20210316, Some(2)),
    (2, 202103, 115, 3015, 1, 20210316, Some(3)),
    (2, 202103, 116, 3016, 2, 20210316, Some(1)),
    (2, 202103, 117, 3017, 2, 20210316, Some(2)),
    (2, 202103, 118, 3018, 2, 20210316, Some(3)),
    (2, 202103, 119, 3019, 2, 20210316, Some(1)),
    (2, 202103, 120, 3020, 2, 20210316, Some(2)),
    (2, 202103, 121, 3021, 3, 20210317, None),
    (2, 202103, 122, 3022, 3, 20210317, None),
    (2, 202103, 123, 3023, 3, 20210317, None),
    (2, 202103, 124, 3024, 3, 20210317, None),
    (2, 202103, 125, 3025, 3, 20210317, None)
    // month 4 - 202104 - deliberately no data (rule violation)
  )

  // Combine both shops and build the DataFrame with the expected column names.
  val allData = goodShopData ++ badShopData
  val salesDF = spark.createDataFrame(allData).toDF(
    "shop_id", "month_id", "patient_id", "trn_id", "pack_id", "day_id", "prescriber_id"
  )
  // count() runs a Spark job, so this also confirms the fixture materializes.
  println(s"Wygenerowano dane sprzedaży z ${salesDF.count()} rekordami")
  salesDF
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement