Advertisement
Sushill

AmazonMe

Jan 18th, 2023 (edited)
1,249
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.07 KB | Spirit | 0 0
  1. import re
  2. import sys
  3. import random
  4. import itertools
  5. import pandas as pd
  6. from playwright.sync_api import sync_playwright
  7.  
  8.  
  9. # Random time interval between each request made to the server:
  10. def randomTime(val):
  11.     ranges = [i for i in range(3, val+1)]
  12.     return random.choice(ranges)
  13.  
  14.  
  15. # Hundreds of thousands of user agents for server:
  16. def userAgents():
  17.     with open('user-agents.txt') as f:
  18.         agents = f.read().split("\n")
  19.         return random.choice(agents)
  20.  
  21.  
  22. # Using itertools to flatten a multi-dimensional list:
  23. def flat(d_lists):
  24.     return list(itertools.chain(*d_lists))
  25.  
  26.  
  27. # Try/except helpers that return a fallback value when an element is missing on the page.
  28. class TryExcept:
  29.     def text(self, element):
  30.         try:
  31.             return element.inner_text().strip()
  32.         except AttributeError:
  33.             return "N/A"    
  34.  
  35.     def attributes(self, element, attr):
  36.         try:
  37.             return element.get_attribute(attr)
  38.         except AttributeError:
  39.             return "Not available"
  40.  
  41.  
  42. def amazonMe(head):
  43.     print(f"Initiating the Amazon automation | Powered by Playwright.")
  44.     amazon_dicts = []
  45.     catchClause = TryExcept()    
  46.  
  47.     user_input = str(input("Enter a URL:> "))  
  48.     # user_input = """https://www.amazon.com/s?k=creatine&crid=1O52ZAP5GFBTQ&sprefix=creatine%2Caps%2C538&ref=nb_sb_noss_1"""
  49.     amazon_link_pattern = re.search("^https://www.amazon.com/s?k=*", user_input)
  50.     if amazon_link_pattern != None:
  51.         print(f"Invalid link. Please try proper Amazon link.")
  52.         sys.exit()
  53.    
  54.     with sync_playwright() as play:    
  55.         browser = play.chromium.launch(headless=head, slow_mo=3*1000)
  56.         page = browser.new_page(user_agent=userAgents())
  57.         page.goto(user_input)
  58.    
  59.         page.wait_for_timeout(timeout=randomTime(4)*1000)
  60.  
  61.         ##################### XPATH selectors ###########################################################################################################    
  62.         search_name = "//span[@class='a-color-state a-text-bold']"
  63.         total_page_number_first = "//span[@class='s-pagination-item s-pagination-disabled']"
  64.         total_page_number_second = "//span[@class='s-pagination-strip']/a"
  65.         next_button = "//a[@class='s-pagination-item s-pagination-next s-pagination-button s-pagination-separator']"
  66.        
  67.         main_content = "//div[@data-component-type='s-search-result']"        
  68.        
  69.         hyperlink = "//a[@class='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal']"        
  70.         price = "//span[@data-a-color='base']/span[@class='a-offscreen']"
  71.         old_price = "//span[@data-a-color='secondary']/span[@class='a-offscreen']"
  72.         review = "//span[@class='a-declarative']/a/i/span[@class='a-icon-alt']"
  73.         review_count = "//a[@class='a-link-normal s-underline-text s-underline-link-text s-link-style']/span[@class='a-size-base s-underline-text']"
  74.         image = "//img[@class='s-image']"
  75.         ###################################################################################################################################################
  76.  
  77.         product_name = re.sub(r"[^a-zA-Z0-9]", "", catchClause.text(page.query_selector(search_name))).capitalize()                
  78.         page.wait_for_selector(main_content, timeout=10*1000)
  79.  
  80.         try:
  81.             last_page = page.query_selector(total_page_number_first).inner_text().strip()
  82.         except AttributeError:
  83.             last_page = page.query_selector_all(total_page_number_second)[-2].get_attribute('aria-label').split()[-1]
  84.        
  85.         print(f"Number of pages | {last_page}.")
  86.         print(f"Scraping | {product_name}.")
  87.  
  88.         for click in range(int(last_page)):
  89.             print(f"Scraping page | {click+1}")
  90.             for content in page.query_selector_all(main_content):
  91.                 data = {
  92.                     "Product": catchClause.text(content.query_selector(hyperlink)),
  93.                     "ASIN": catchClause.attributes(content, 'data-asin'),
  94.                     "Price": catchClause.text(content.query_selector(price)),
  95.                     "Original price": catchClause.text(content.query_selector(old_price)),
  96.                     "Review": catchClause.text(content.query_selector(review)),
  97.                     "Review count": re.sub(r"[()]", "", catchClause.text(content.query_selector(review_count))),
  98.                     "Hyperlinnk": f"""http://www.amazon.com{catchClause.attributes(content.query_selector(hyperlink), 'href')}""",
  99.                     "Image": f"""{catchClause.attributes(content.query_selector(image), 'src')}""",
  100.                 }
  101.            
  102.                 amazon_dicts.append(data)        
  103.        
  104.             try:
  105.                 page.query_selector(next_button).click()
  106.             except AttributeError:
  107.                 break    
  108.  
  109.         browser.close()
  110.  
  111.     print(f"Scraping done. Now exporting to excel database.")
  112.    
  113.     df = pd.DataFrame(amazon_dicts)
  114.     df.to_excel(f"HP {product_name}-Amazon database.xlsx", index=False)
  115.     print(f"{product_name} Database is saved.")    
  116.  
  117.    
  118.    
Tags: AMAZON
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement