Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import time
- import logging
- import re
- import sys
# Configure the root logger once for the whole script: INFO and above,
# each record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def count_title_opener(file_path):
    """Return the number of times the byte marker ``b"title"`` occurs in a file.

    The file is opened in binary mode and read in full before counting.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The count of non-overlapping ``b"title"`` occurrences.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then
            re-raised).
        Exception: Any other failure is logged and re-raised unchanged.
    """
    marker = b"title"
    try:
        with open(file_path, 'rb') as handle:
            # Read and count in one step; the whole file is held in memory.
            return handle.read().count(marker)
    except FileNotFoundError:
        logging.error(f"The file at {file_path} was not found.")
        raise
    except Exception as exc:
        logging.error(f"An unexpected error occurred: {exc}")
        raise
# Byte marker that introduces each title record.
_TITLE_OPENER = b"title"
# Byte sequence that terminates a title.
_END_PATTERN = b'\x00\x00\x00'
# Number of bytes skipped between the opener and the title text.
# NOTE(review): presumably a small record header -- confirm against the format.
_HEADER_SKIP = 3
# Everything outside printable ASCII is dropped from decoded titles.
_NON_PRINTABLE = re.compile(r'[^\x20-\x7E]')
# Only this many scan errors are listed in the report file.
_MAX_REPORTED_ERRORS = 5


def _scan_titles(content):
    """Scan raw bytes for title records and return ``(titles, errors)``."""
    titles = []
    errors = []
    start_index = 0
    while start_index < len(content):
        start_index = content.find(_TITLE_OPENER, start_index)
        if start_index == -1:
            break
        start_index += len(_TITLE_OPENER)
        if start_index + _HEADER_SKIP > len(content):
            errors.append(f"Not enough data after index {start_index}. Skipping this occurrence. Bytes: {content[start_index:start_index+30].hex()}")
            start_index += 1
            continue
        start_index += _HEADER_SKIP
        end_index = content.find(_END_PATTERN, start_index)
        if end_index == -1:
            errors.append(f"End pattern not found after index {start_index}. Skipping this occurrence. Bytes: {content[start_index:end_index if end_index != -1 else start_index+30].hex()}")
            start_index += 1
            continue
        raw_title = content[start_index:end_index].strip()
        # errors='replace' never raises, so no UnicodeDecodeError handling is
        # needed; undecodable bytes become U+FFFD and are removed by the
        # printable-ASCII filter below.
        decoded = raw_title.decode('utf-8', errors='replace')
        titles.append(_NON_PRINTABLE.sub('', decoded).strip())
        # Resume scanning right after the terminator of this title.
        start_index = end_index + len(_END_PATTERN)
    return titles, errors


def extract_titles(file_path, output_file=None):
    """Extract titles from a binary file and write them to a text report.

    A title is the text found after a ``b"title"`` opener plus three skipped
    bytes, up to the next ``b"\\x00\\x00\\x00"``. Each title is decoded as
    UTF-8 (invalid bytes replaced), reduced to printable ASCII and stripped.

    Args:
        file_path: Path of the binary file to scan.
        output_file: Path of the report to write. When ``None`` the report is
            written next to ``file_path`` as ``<stem>_titles_<timestamp>.txt``.

    Returns:
        The list of extracted titles, in file order. An empty list is
        returned (and no report is written) when the file contains no
        ``b"title"`` opener.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (logged, then
            re-raised).
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        with open(file_path, 'rb') as file:
            content = file.read()

        title_count = content.count(_TITLE_OPENER)
        if title_count == 0:
            logging.warning("No 'title' openers found. Check the file format.")
            return []
        logging.info("Found %d occurrences of 'title' in the file.", title_count)

        titles, errors = _scan_titles(content)
        if not titles:
            logging.warning("No valid titles found after scraping.")

        if output_file is None:
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            output_file = os.path.splitext(file_path)[0] + f"_titles_{timestamp}.txt"

        with open(output_file, 'w', encoding='utf-8') as output:
            output.write(f"Script run at: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            if errors:
                output.write("Errors:\n")
                for error in errors[:_MAX_REPORTED_ERRORS]:
                    output.write(f"- {error}\n")
                hidden = len(errors) - _MAX_REPORTED_ERRORS
                if hidden > 0:
                    # Make the truncation visible instead of silently
                    # dropping the remaining errors.
                    output.write(f"- ... and {hidden} more error(s) not shown\n")
                output.write("\n")
            output.write("Titles:\n")
            output.writelines(title + '\n' for title in titles)

        logging.info("Titles successfully written to %s.", output_file)
        logging.info("Number of 'title' openers found: %d", title_count)
        logging.info("Number of titles extracted: %d", len(titles))
        return titles
    except FileNotFoundError:
        logging.error("The file at %s was not found.", file_path)
        raise
    except Exception as e:
        logging.error("An unexpected error occurred: %s", e)
        raise
if __name__ == "__main__":
    # Exactly one positional argument is expected: the path of the data file.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        logging.error("Usage: python title_extractor.py <path_to_file.dat>")
    elif not os.path.isfile(cli_args[0]):
        logging.error("The specified file does not exist.")
    else:
        data_path = cli_args[0]
        try:
            # Check for openers first so a file without any gets a warning
            # instead of an extraction attempt.
            if count_title_opener(data_path) == 0:
                logging.warning("No 'title' openers found. Check if the file format is correct or the data is not as expected.")
            else:
                extract_titles(data_path)
        except Exception as exc:
            logging.error(f"An error occurred during processing: {exc}")
    # Wait for the user before exiting so the log output can be read.
    input("Press Enter to exit...")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement