Jackspade9624

webcrawler.py

Jun 2nd, 2025 (edited)
import argparse
import requests
import logging
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

class WebCrawler:
    def __init__(self, start_url, max_depth, output_file, verbose):
        self.start_url = start_url
        self.max_depth = max_depth
        self.output_file = output_file
        self.verbose = verbose
        self.visited_urls = set()
        self.logger = self._setup_logger()

    def _setup_logger(self):
        logger = logging.getLogger('web_crawler')
        logger.setLevel(logging.DEBUG if self.verbose else logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        ch = logging.StreamHandler()
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        return logger

    def _get_page(self, url):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            return response.text
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None

    def _extract_links(self, html_content, base_url):
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []
        for link_tag in soup.find_all('a', href=True):
            absolute_url = urljoin(base_url, link_tag['href'])  # Handles relative URLs
            if self._is_valid_url(absolute_url):
                links.append(absolute_url)

        return links

    def _is_valid_url(self, url):
        parsed_url = urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc]) and parsed_url.scheme in ('http', 'https')

    def _crawl(self, url, depth=0):
        if depth > self.max_depth or url in self.visited_urls:
            return

        self.visited_urls.add(url)

        self.logger.info(f"Crawling {url}, Depth: {depth}")  # Log each page and its depth

        html_content = self._get_page(url)
        if html_content:
            # Example: Extract data (customize to your needs)
            soup = BeautifulSoup(html_content, 'html.parser')
            title = soup.title.string if soup.title else "No Title Found"
            self.logger.debug(f"Title: {title}")

            if self.output_file:
                with open(self.output_file, 'a', encoding='utf-8') as f:
                    f.write(f"URL: {url}\nTitle: {title}\n\n")

            links = self._extract_links(html_content, url)
            for link in links:
                self._crawl(link, depth + 1)

    def run(self):
        self._crawl(self.start_url)
        self.logger.info("Crawling Complete.")

def main():
    parser = argparse.ArgumentParser(description="Advanced Web Crawler")

    # Positional argument
    parser.add_argument("start_url", help="The URL to start crawling from")

    # Optional arguments
    parser.add_argument("-d", "--max_depth", type=int, default=3, help="Maximum crawl depth (default: 3)")
    parser.add_argument("-o", "--output_file", help="Output file to save results")
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    crawler = WebCrawler(args.start_url, args.max_depth, args.output_file, args.verbose)
    crawler.run()


if __name__ == "__main__":
    main()
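
A quick way to exercise the script from the command line, assuming it is saved as webcrawler.py (the example URL and output filename below are placeholders, not part of the original paste):

    python webcrawler.py https://example.com -d 2 -o results.txt -v

This starts crawling at https://example.com, follows links up to two levels deep, appends each page's URL and title to results.txt, and enables debug-level logging via -v.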
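The class can also be driven directly from Python instead of through argparse; a minimal sketch, again assuming the module is saved as webcrawler.py and using a placeholder start URL:

    from webcrawler import WebCrawler

    crawler = WebCrawler("https://example.com", max_depth=1, output_file=None, verbose=True)
    crawler.run()
    print(sorted(crawler.visited_urls))  # pages reached within one hop of the start URL

With output_file set to None the per-page write is skipped, so results only accumulate in the visited_urls set.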