opcrawler.py
Jackspade9624 | Jun 2nd, 2025
#!/usr/bin/env python3

import argparse
import requests
import logging
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


class WebCrawler:
    def __init__(self, start_url, max_depth, output_file, verbose):
        self.start_url = start_url
        self.max_depth = max_depth
        self.output_file = output_file
        self.verbose = verbose
        self.visited_urls = set()
        self.logger = self._setup_logger()

    def _setup_logger(self):
        # Log to stderr; DEBUG level when verbose, INFO otherwise.
        logger = logging.getLogger('web_crawler')
        logger.setLevel(logging.DEBUG if self.verbose else logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        ch = logging.StreamHandler()
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        return logger

    def _get_page(self, url):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            return response.text
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _extract_links(self, html_content, base_url):
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link_tag in soup.find_all('a', href=True):
            absolute_url = urljoin(base_url, link_tag['href'])  # Handles relative URLs
            if self._is_valid_url(absolute_url):
                links.append(absolute_url)
        return links

    def _is_valid_url(self, url):
        # Only follow absolute http/https URLs.
        parsed_url = urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc]) and parsed_url.scheme in ('http', 'https')

    def _crawl(self, url, depth=0):
        if depth > self.max_depth or url in self.visited_urls:
            return

        self.visited_urls.add(url)
        self.logger.info(f"Crawling {url}, Depth: {depth}")

        html_content = self._get_page(url)
        if html_content:
            # Example: Extract data (customize to your needs)
            soup = BeautifulSoup(html_content, 'html.parser')
            title = soup.title.string if soup.title else "No Title Found"
            self.logger.debug(f"Title: {title}")

            if self.output_file:
                with open(self.output_file, 'a', encoding='utf-8') as f:
                    f.write(f"URL: {url}\nTitle: {title}\n\n")

            # Recurse into every valid link found on the page.
            links = self._extract_links(html_content, url)
            for link in links:
                self._crawl(link, depth + 1)

    def run(self):
        self._crawl(self.start_url)
        self.logger.info("Crawling Complete.")


def main():
    parser = argparse.ArgumentParser(description="Advanced Web Crawler")

    # Positional argument
    parser.add_argument("url", help="The URL to start crawling from")

    # Optional arguments
    parser.add_argument("-d", "--max_depth", type=int, default=3, help="Maximum crawl depth (default: 3)")
    parser.add_argument("-o", "--output_file", help="Output file to save results")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    crawler = WebCrawler(args.url, args.max_depth, args.output_file, args.verbose)
    crawler.run()


if __name__ == "__main__":
    main()
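
A possible invocation, assuming the script is saved as opcrawler.py and the requests and beautifulsoup4 packages are installed; the target URL and output file name below are placeholders, not part of the original paste:

    pip install requests beautifulsoup4
    python3 opcrawler.py https://example.com -d 2 -o results.txt -v

With -v the page titles are also logged at DEBUG level; without -o results are only logged, not written to a file.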