Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json, random, math
def load_data(path):
    """Load and return the JSON document stored at *path*.

    Args:
        path: Location of the JSON file; it is opened as UTF-8 text.

    Returns:
        The deserialized content exactly as ``json.load`` produces it.
    """
    with open(path, encoding="utf-8") as file:
        return json.load(file)
def train_test_split(data, test_ratio=0.2):
    """Randomly split *data* into a training part and a test part.

    The input is copied before shuffling, so the caller's sequence keeps
    its original order. The sizes of both parts are printed.

    Args:
        data: Iterable of records to split.
        test_ratio: Fraction of the records assigned to the test part.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    # Shuffle a private copy: random.shuffle works in place and would
    # otherwise silently reorder the list owned by the caller.
    shuffled = list(data)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * (1 - test_ratio))
    train, test = shuffled[:cut], shuffled[cut:]
    print(f"Uczący zbiór: {len(train)}")
    print(f"Testowy zbiór: {len(test)}")
    return train, test
def build_dictionary(train):
    """Collect the distinct tags that occur in the training records.

    The resulting set is also printed.

    Args:
        train: Iterable of records; each one is indexed with ``"tags"``
            and that value is iterated to obtain the individual tags.

    Returns:
        A set containing every tag seen in *train*.
    """
    vocabulary = set()
    for record in train:
        vocabulary.update(record["tags"])
    print(vocabulary)
    return vocabulary
def train_nb(train, vocabulary, alpha=1.0):
    """Gather per-label document and tag counts from the training records.

    The resulting model dictionary is also printed.

    Args:
        train: Sized collection of records; each one is indexed with
            ``"label"`` and ``"tags"``.
        vocabulary: Stored unchanged under the model's ``"vocabulary"`` key.
        alpha: Stored under the model's ``"alpha"`` key. Presumably the
            additive-smoothing constant used at prediction time; the
            consumer is not visible here, so confirm against it.

    Returns:
        A dict with the keys ``"class_counts"`` (records per label),
        ``"word_counts"`` (per label, occurrences of each tag),
        ``"total_words"`` (per label, total tag occurrences),
        ``"vocabulary"``, ``"alpha"`` and ``"total_docs"``.
    """
    class_counts = {}
    word_counts = {}
    total_words = {}

    for record in train:
        label = record["label"]
        class_counts[label] = class_counts.get(label, 0) + 1
        # Register the label even when the record carries no tags, so
        # every seen label has an entry in both tag-level tables.
        tag_counts = word_counts.setdefault(label, {})
        total_words.setdefault(label, 0)
        for tag in record["tags"]:
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
            total_words[label] += 1

    model = {
        "class_counts": class_counts,
        "word_counts": word_counts,
        "total_words": total_words,
        "vocabulary": vocabulary,
        "alpha": alpha,
        "total_docs": len(train),
    }
    print(model)
    return model
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement