Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import concurrent.futures
import itertools
import json
import random
import threading
import winsound
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Pool of browser User-Agent strings that get_ua() draws from. Adjacent
# string literals are concatenated, so each wrapped pair is a single entry.
_USER_AGENTS = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 "
    "Safari/600.1.25",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 "
    "Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 "
    "Safari/537.85.10",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
)


def get_ua():
    """Return one User-Agent string chosen at random from the built-in pool.

    Returns:
        str: one of the entries in ``_USER_AGENTS``.
    """
    return random.choice(_USER_AGENTS)
# Request headers carrying a randomly chosen User-Agent.
headers = {'User-Agent': get_ua()}
interval = 1
# When True the browser runs without a visible window.
headless = True
path = Service('c:\\users\\chromedriver.exe')
opt = Options()
# Chrome command-line switches. The first entry is the headless switch and
# is skipped when a visible browser is requested. The user-agent value is
# written without a space after '=' so the browser does not receive a
# User-Agent that starts with a blank.
arguments = [
    '--headless',
    '--disable-dev-shm-usage',
    f'--user-agent={get_ua()}',
    '--disable-notifications',
]
for arg in (arguments if headless else arguments[1:]):
    opt.add_argument(arg)
if not headless:
    # Keep the visible browser window open after the script finishes.
    opt.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=path, options=opt)
base_url = "https://ihgfdelhifair.in/mis/Exhibitors"
def check_response(urls, timeout=30):
    """Send an HTTP GET to *urls* and return the response object.

    Args:
        urls: the URL to request (passed straight to ``requests.get``).
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection. Pass ``None`` to wait forever.

    Returns:
        The ``requests.Response`` returned by ``requests.get``.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout elapses.
    """
    return requests.get(urls, timeout=timeout)
# Convert a list of lists into a single flat list.
def flattened_list(lists):
    """Return the items of every iterable in *lists* as one flat list.

    Only one level of nesting is removed. ``chain.from_iterable`` walks
    *lists* directly instead of unpacking it into call arguments.

    Args:
        lists: an iterable whose elements are themselves iterable.

    Returns:
        list: all inner items, in their original order.
    """
    return list(itertools.chain.from_iterable(lists))
# Use a thread pool so that several inputs are processed concurrently.
def lightning_scraping(functions, lists, max_workers=None):
    """Apply *functions* to every item of *lists* on a thread pool.

    The pool is shut down (waiting for all submitted calls to finish)
    before this function returns.

    Args:
        functions: a callable taking one item of *lists*.
        lists: an iterable of inputs, one call per item.
        max_workers: maximum number of worker threads; ``None`` keeps the
            ``ThreadPoolExecutor`` default.

    Returns:
        An iterator over the call results, in input order. An exception
        raised inside a call is re-raised only when its result is pulled
        from this iterator, so callers must iterate it to notice failures.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        executions = executor.map(functions, lists)
    return executions
# One listing URL per index value 0, 50, ..., 1500 (31 pages in total).
url_collections = [
    f"https://ihgfdelhifair.in/mis/Exhibitors/index/{offset}"
    for offset in range(0, 1550, 50)
]

# Module-level accumulators filled by main(); one list per exported field.
exhibitors_names = []
exhibitors_emails = []
exhibitors_contact_names = []
exhibitors_states = []
exhibitors_cities = []
exhibitors_hall_numbers = []
exhibitors_stand_numbers = []
# main() is executed from a thread pool while sharing one browser session
# and one set of result lists. This lock keeps each page load together
# with its page-source read, and keeps the appends for one page together
# so the columns stay row-aligned.
_SCRAPE_LOCK = threading.Lock()

# Label text removed from the first six <p> elements of an exhibitor
# block, in the order the fields are read.
_FIELD_LABELS = (
    "Email : ",
    "Contact Person Name :",
    "State :",
    "City : ",
    "Hall No. : ",
    "Stand No. :",
)


def main(url):
    """Scrape one listing page and append its exhibitors to the module lists.

    The page is loaded through the shared ``driver``, parsed with
    BeautifulSoup, and the extracted values are appended to the
    ``exhibitors_*`` lists. Blocks with fewer than six ``<p>`` elements
    are skipped entirely, so the six detail lists always grow together.

    Args:
        url: address of the listing page to load.

    Returns:
        None. Results are delivered through the module-level lists.
    """
    # The driver is a single shared session: load and read under the lock
    # so another thread cannot navigate away between the two steps.
    with _SCRAPE_LOCK:
        driver.maximize_window()
        driver.get(url)
        driver.implicitly_wait(55)
        content = driver.page_source

    soup = BeautifulSoup(content, 'html.parser')
    container = soup.find_all('div', class_='container')
    # The first five 'container' divs are not read for details, presumably
    # because they are page chrome rather than exhibitor blocks.
    # TODO confirm against the page markup.
    main_container = container[5:]

    # Names are taken from every container that has an <h4> heading.
    names = []
    for cont in container:
        heading = cont.find('h4')
        if heading is not None:
            names.append(heading.text.strip())

    rows = []
    for cont in main_container:
        paragraphs = cont.find_all('p')
        if len(paragraphs) < len(_FIELD_LABELS):
            continue
        # The final strip() removes whitespace left behind when the label
        # has no trailing space (e.g. "State :").
        rows.append([
            paragraph.text.strip().replace(label, "").strip()
            for paragraph, label in zip(paragraphs, _FIELD_LABELS)
        ])

    columns = (
        exhibitors_emails,
        exhibitors_contact_names,
        exhibitors_states,
        exhibitors_cities,
        exhibitors_hall_numbers,
        exhibitors_stand_numbers,
    )
    # Publish the whole page at once so rows from concurrent pages cannot
    # interleave differently in different columns.
    with _SCRAPE_LOCK:
        exhibitors_names.extend(names)
        for row in rows:
            for column, value in zip(columns, row):
                column.append(value)
# Scrape every listing page. The result iterator is drained because a
# failure inside a worker is only re-raised when its result is retrieved;
# otherwise failed pages would go unnoticed.
try:
    for _ in lightning_scraping(main, url_collections):
        pass
except Exception as exc:
    # All pages have already been attempted by this point; report the
    # failure and still export whatever was collected.
    print(f"Warning: at least one page failed to scrape: {exc!r}")
finally:
    # A headless browser has no window for the user to close, so end the
    # session explicitly instead of leaving the process running.
    if headless:
        driver.quit()

# NOTE(review): exhibitors_names is collected by main() but is not exported
# here. Its length is not guaranteed to match the other columns, so adding
# it as-is could make DataFrame construction fail. Confirm the intent.
exhibitors_dicts = {
    "Emails": exhibitors_emails,
    "Contact Person Names": exhibitors_contact_names,
    "States": exhibitors_states,
    "Cities": exhibitors_cities,
    "Hall Nos.": exhibitors_hall_numbers,
    "Stand Nos:": exhibitors_stand_numbers,
}
df = pd.DataFrame(exhibitors_dicts)
df.to_excel("Exhibitors database.xlsx", index=False)
# NOTE(review): winsound's SND_FILENAME expects the name of a WAV file;
# 'notification.mp3' may not play as intended. Confirm the file format.
winsound.PlaySound('notification.mp3', winsound.SND_FILENAME)
Add Comment
Please, Sign In to add comment