Sushill

Exhibitor's list

Apr 13th, 2022 (edited)
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.86 KB | None | 0 0
  1. import json
  2. from time import sleep
  3. from bs4 import BeautifulSoup
  4. import requests
  5. from selenium import webdriver
  6. from selenium.webdriver.chrome.options import Options
  7. from selenium.webdriver.common.keys import Keys
  8. from selenium.webdriver.chrome.service import Service
  9. from selenium.webdriver.common.by import By
  10. from selenium.common.exceptions import NoSuchElementException
  11. import itertools
  12. import pandas as pd
  13. import concurrent.futures
  14. import winsound
  15.  
  16.  
  def get_ua():
      """Return a randomly chosen browser User-Agent string.

      Returns:
          str: One entry picked uniformly at random from the built-in list
          of desktop browser User-Agent strings below.
      """
      # Imported locally: the file's import block does not bring ``random``
      # into scope, so the original call raised NameError.
      import random

      uastrings = [
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
          # Adjacent literals are concatenated into a single entry.
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 "
          "Safari/600.1.25",
          "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
          "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 "
          "Safari/537.36",
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 "
          "Safari/537.85.10",
          "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
          "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
          "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
      ]

      return random.choice(uastrings)
  35.  
  36.  
  # Request headers carrying a randomly selected User-Agent.
  headers = {'User-Agent': get_ua()}
  interval = 1
  headless = True

  # Chrome driver service and launch options.
  path = Service('c:\\users\\chromedriver.exe')
  opt = Options()
  arguments = [
      '--headless',
      '--disable-dev-shm-usage',
      f'user-agent= {get_ua()}',
      'disable-notifications',
  ]

  # '--headless' is the first entry, so slicing it off gives the
  # windowed configuration when ``headless`` is False.
  for arg in (arguments if headless else arguments[1:]):
      opt.add_argument(arg)

  opt.add_experimental_option("detach", True)
  driver = webdriver.Chrome(service=path, options=opt)

  base_url = "https://ihgfdelhifair.in/mis/Exhibitors"
  56.  
  57.  
  def check_response(urls, timeout=30):
      """Send a GET request and return the response.

      Args:
          urls: The URL to fetch; it is passed straight to ``requests.get``.
          timeout: Seconds to wait for the server before giving up.
              Defaults to 30.

      Returns:
          The response object returned by ``requests.get``.

      Raises:
          requests.exceptions.Timeout: If the server does not answer within
              ``timeout`` seconds.
      """
      # Without an explicit timeout, requests waits indefinitely on a
      # stalled connection.
      return requests.get(urls, timeout=timeout)
  60.  
  61.  
  def flattened_list(lists):
      """Flatten one level of nesting.

      Args:
          lists: An iterable whose items are themselves iterables.

      Returns:
          list: The items of every inner iterable, in order, as one list.
      """
      return list(itertools.chain.from_iterable(lists))
  66.  
  67.  
  def lightning_scraping(functions, lists):
      """Run ``functions`` over every item of ``lists`` on a thread pool.

      Args:
          functions: A callable taking one item of ``lists``.
          lists: An iterable of arguments, one call per item.

      Returns:
          The iterator produced by ``Executor.map``. The pool is shut down
          (waiting for all submitted calls) before this function returns;
          results are yielded in input order when the iterator is consumed.
      """
      pool = concurrent.futures.ThreadPoolExecutor()
      try:
          return pool.map(functions, lists)
      finally:
          # Block until every submitted call has finished, as leaving a
          # ``with`` block on the executor would.
          pool.shutdown(wait=True)
  73.  
  74.  
  # One listing URL per page; the index in the URL advances in steps of 50
  # from 0 up to 1500.
  url_collections = [
      f"https://ihgfdelhifair.in/mis/Exhibitors/index/{offset}"
      for offset in range(0, 1550, 50)
  ]


  # Shared accumulators that main() appends scraped fields to.
  exhibitors_names = []
  exhibitors_emails = []
  exhibitors_contact_names = []
  exhibitors_states = []
  exhibitors_cities = []
  exhibitors_hall_numbers = []
  exhibitors_stand_numbers = []
  89.  
  90.  
  import threading

  # main() is run from several pool threads at once. The single shared
  # ``driver`` must load a page and hand back its source without another
  # thread navigating in between.
  _driver_lock = threading.Lock()
  # Keeps the per-exhibitor appends together so the result lists stay
  # aligned row-for-row.
  _results_lock = threading.Lock()


  def _parse_exhibitors(content):
      """Extract exhibitor names and detail rows from one page of HTML.

      Args:
          content: HTML source of a listing page.

      Returns:
          tuple: ``(names, rows)`` where ``names`` is a list of stripped
          ``<h4>`` texts and ``rows`` is a list of 6-tuples
          ``(email, contact_name, state, city, hall_number, stand_number)``.
      """
      soup = BeautifulSoup(content, 'html.parser')
      containers = soup.find_all('div', class_='container')

      # Names are taken from every container that has an <h4> heading.
      headings = (cont.find('h4') for cont in containers)
      names = [heading.text.strip() for heading in headings if heading is not None]

      rows = []
      # The first five containers are skipped for the detail fields
      # (presumably page chrome rather than exhibitor cards — TODO confirm).
      for cont in containers[5:]:
          paras = cont.find_all('p')
          if len(paras) < 6:
              # Not a complete exhibitor card; skip it.
              continue
          rows.append((
              paras[0].text.strip().replace("Email : ", ""),
              paras[1].text.strip().replace("Contact Person Name :", ""),
              paras[2].text.strip().replace("State :", ""),
              paras[3].text.strip().replace("City : ", ""),
              paras[4].text.strip().replace("Hall No. : ", ""),
              paras[5].text.strip().replace("Stand No. :", ""),
          ))

      return names, rows


  def main(url):
      """Scrape one listing page and add its exhibitors to the shared lists.

      Args:
          url: Address of the listing page to load in the shared ``driver``.

      Returns:
          None. Results are appended to the module-level ``exhibitors_*``
          lists.
      """
      # Navigate and read the source as one uninterrupted step; otherwise
      # another thread could load a different URL before page_source is read.
      with _driver_lock:
          driver.maximize_window()
          driver.get(url)
          driver.implicitly_wait(55)
          content = driver.page_source

      names, rows = _parse_exhibitors(content)

      # Publish the whole page at once so fields from different threads
      # cannot interleave and shift columns against each other.
      with _results_lock:
          exhibitors_names.extend(names)
          for emails, contact_name, state, city, hall_numbers, stand_numbers in rows:
              exhibitors_emails.append(emails)
              exhibitors_contact_names.append(contact_name)
              exhibitors_states.append(state)
              exhibitors_cities.append(city)
              exhibitors_hall_numbers.append(hall_numbers)
              exhibitors_stand_numbers.append(stand_numbers)
  123.  
  124.  
  # Scrape every listing page. An exception raised inside a worker is only
  # re-raised when the returned iterator is consumed, so walk it here;
  # otherwise a failed page would go unnoticed.
  try:
      for _ in lightning_scraping(main, url_collections):
          pass
  except Exception as exc:
      # Top-level boundary: report the failure but still save what was scraped.
      print(f"Scraping failed on at least one page: {exc!r}. Saving the rows collected so far.")

  # Column header -> column values for the spreadsheet.
  exhibitors_dicts = {
      "Emails": exhibitors_emails,
      "Contact Person Names": exhibitors_contact_names,
      "States": exhibitors_states,
      "Cities": exhibitors_cities,
      "Hall Nos.": exhibitors_hall_numbers,
      "Stand Nos:": exhibitors_stand_numbers,
  }

  df = pd.DataFrame(exhibitors_dicts)
  df.to_excel("Exhibitors database.xlsx", index=False)

  # Audible signal that the export has finished.
  # NOTE(review): winsound.PlaySound is documented for WAV data — confirm
  # that 'notification.mp3' actually plays.
  winsound.PlaySound('notification.mp3', winsound.SND_FILENAME)
  141.  
Add Comment
Please, Sign In to add comment