Advertisement
Sushill

AmazonMe

Jan 18th, 2023 (edited)
1,249
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.07 KB | Spirit | 0 0
  1. import re
  2. import sys
  3. import random
  4. import itertools
  5. import pandas as pd
  6. from playwright.sync_api import sync_playwright
  7.  
  8.  
  9. # Random time interval between each request made to the server:
  10. def randomTime(val):
  11.     ranges = [i for i in range(3, val+1)]
  12.     return random.choice(ranges)
  13.  
  14.  
  15. # Hundreds of thousands of user agents for server:
  16. def userAgents():
  17.     with open('user-agents.txt') as f:
  18.         agents = f.read().split("\n")
  19.         return random.choice(agents)
  20.  
  21.  
  22. # Using itertools to flatten a multi-dimensional list:
  23. def flat(d_lists):
  24.     return list(itertools.chain(*d_lists))
  25.  
  26.  
  27. # Try/except helpers that return a fallback value when an element is missing on the page.
  28. class TryExcept:
  29.     def text(self, element):
  30.         try:
  31.             return element.inner_text().strip()
  32.         except AttributeError:
  33.             return "N/A"    
  34.  
  35.     def attributes(self, element, attr):
  36.         try:
  37.             return element.get_attribute(attr)
  38.         except AttributeError:
  39.             return "Not available"
  40.  
  41.  
  42. def amazonMe(head):
  43.     print(f"Initiating the Amazon automation | Powered by Playwright.")
  44.     amazon_dicts = []
  45.     catchClause = TryExcept()    
  46.  
  47.     user_input = str(input("Enter a URL:> "))  
  48.     # user_input = """https://www.amazon.com/s?k=creatine&crid=1O52ZAP5GFBTQ&sprefix=creatine%2Caps%2C538&ref=nb_sb_noss_1"""
  49.     amazon_link_pattern = re.search("^https://www.amazon.com/s?k=*", user_input)
  50.     if amazon_link_pattern != None:
  51.         print(f"Invalid link. Please try proper Amazon link.")
  52.         sys.exit()
  53.    
  54.     with sync_playwright() as play:    
  55.         browser = play.chromium.launch(headless=head, slow_mo=3*1000)
  56.         page = browser.new_page(user_agent=userAgents())
  57.         page.goto(user_input)
  58.    
  59.         page.wait_for_timeout(timeout=randomTime(4)*1000)
  60.  
  61.         ##################### XPATH selectors ###########################################################################################################    
  62.         search_name = "//span[@class='a-color-state a-text-bold']"
  63.         total_page_number_first = "//span[@class='s-pagination-item s-pagination-disabled']"
  64.         total_page_number_second = "//span[@class='s-pagination-strip']/a"
  65.         next_button = "//a[@class='s-pagination-item s-pagination-next s-pagination-button s-pagination-separator']"
  66.        
  67.         main_content = "//div[@data-component-type='s-search-result']"        
  68.        
  69.         hyperlink = "//a[@class='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal']"        
  70.         price = "//span[@data-a-color='base']/span[@class='a-offscreen']"
  71.         old_price = "//span[@data-a-color='secondary']/span[@class='a-offscreen']"
  72.         review = "//span[@class='a-declarative']/a/i/span[@class='a-icon-alt']"
  73.         review_count = "//a[@class='a-link-normal s-underline-text s-underline-link-text s-link-style']/span[@class='a-size-base s-underline-text']"
  74.         image = "//img[@class='s-image']"
  75.         ###################################################################################################################################################
  76.  
  77.         product_name = re.sub(r"[^a-zA-Z0-9]", "", catchClause.text(page.query_selector(search_name))).capitalize()                
  78.         page.wait_for_selector(main_content, timeout=10*1000)
  79.  
  80.         try:
  81.             last_page = page.query_selector(total_page_number_first).inner_text().strip()
  82.         except AttributeError:
  83.             last_page = page.query_selector_all(total_page_number_second)[-2].get_attribute('aria-label').split()[-1]
  84.        
  85.         print(f"Number of pages | {last_page}.")
  86.         print(f"Scraping | {product_name}.")
  87.  
  88.         for click in range(int(last_page)):
  89.             print(f"Scraping page | {click+1}")
  90.             for content in page.query_selector_all(main_content):
  91.                 data = {
  92.                     "Product": catchClause.text(content.query_selector(hyperlink)),
  93.                     "ASIN": catchClause.attributes(content, 'data-asin'),
  94.                     "Price": catchClause.text(content.query_selector(price)),
  95.                     "Original price": catchClause.text(content.query_selector(old_price)),
  96.                     "Review": catchClause.text(content.query_selector(review)),
  97.                     "Review count": re.sub(r"[()]", "", catchClause.text(content.query_selector(review_count))),
  98.                     "Hyperlinnk": f"""http://www.amazon.com{catchClause.attributes(content.query_selector(hyperlink), 'href')}""",
  99.                     "Image": f"""{catchClause.attributes(content.query_selector(image), 'src')}""",
  100.                 }
  101.            
  102.                 amazon_dicts.append(data)        
  103.        
  104.             try:
  105.                 page.query_selector(next_button).click()
  106.             except AttributeError:
  107.                 break    
  108.  
  109.         browser.close()
  110.  
  111.     print(f"Scraping done. Now exporting to excel database.")
  112.    
  113.     df = pd.DataFrame(amazon_dicts)
  114.     df.to_excel(f"HP {product_name}-Amazon database.xlsx", index=False)
  115.     print(f"{product_name} Database is saved.")    
  116.  
  117.    
  118.    
Tags: AMAZON
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement