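"""otomoto.pl scraper.

Collects car listings (model, version/wersja, price/cena, year, mileage,
engine capacity, fuel type, and the listing URL) from otomoto.pl search
result pages, saves them to JSON, then converts the JSON to CSV.

Windows-only as written, since winsound is used for the failure alert.
"""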
import json

import pandas as pd
from bs4 import BeautifulSoup
import requests
import itertools
from time import sleep
import concurrent.futures
import random
import winsound  # Windows-only; used for the audible error notification


def get_ua():
    """Return a random desktop User-Agent string."""
    uastrings = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 "
        "Safari/600.1.25",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 "
        "Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 "
        "Safari/537.85.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
    ]
    return random.choice(uastrings)


# One User-Agent string is chosen once per run; calling get_ua() for every
# request would rotate it (see the sketch below).
headers = {"User-Agent": get_ua()}
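
# A minimal sketch of per-request rotation (fresh_headers is a hypothetical
# helper, not used anywhere below):
#
#     def fresh_headers():
#         return {"User-Agent": get_ua()}
#
#     r = requests.get(url, headers=fresh_headers())

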
def flattened_list(lists):
    """Flatten a list of lists into a single list."""
    return list(itertools.chain(*lists))
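# (usage: flattened_list([[1, 2], [3]]) -> [1, 2, 3])

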
def skip_every_second(data_lists):
    """Keep every other entry; the page markup appears to yield each
    listing's parameter list twice, so the duplicates are dropped here."""
    return data_lists[::2]
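# (usage: skip_every_second(['a', 'a', 'b', 'b']) -> ['a', 'b'])

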
# Use a thread pool so pages are scraped concurrently.
def lightning_scraping(function, lists):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map over the given function and URL list (the original mapped the
        # globals instead of its parameters). Materialising the results waits
        # for every page and re-raises any exception from a worker thread.
        return list(executor.map(function, lists))
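
# ThreadPoolExecutor() picks its own default worker count (min(32,
# os.cpu_count() + 4) on Python 3.8+); pass max_workers to throttle, e.g.:
#
#     with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
#         executor.map(extract, url_list)

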
url_list = []

model_data = []
wersja_data = []
cena_data = []
years_data = []
distance_data = []
cc_data = []
fuel_data = []
otomoto_url = []

first_page_number = 1
last_page_number = 50
time_interval = 5  # seconds to sleep around each request


def extract(url):
    r = requests.get(url, headers=headers)
    sleep(time_interval)
    soup = BeautifulSoup(r.content, 'lxml')
    stuffs = soup.find_all('main', class_='ooa-p2z5vl e19uumca5')  # contains the whole table of listings
    for_otomoto_link = soup.find_all('div', class_='ooa-1nvnpye e1b25f6f5')
    for oto in for_otomoto_link:
        link = oto.find('a').get('href')  # `link`, not `url`, so the parameter isn't shadowed
        otomoto_url.append(link)
    # model = flattened_list([[car.text for car in i.find_all('h2')] for i in stuffs])
    # (the same model scrape written as a list comprehension)
    for data in stuffs:
        cars = data.find_all('h2')
        for car_text in cars:
            model_data.append(car_text.text)
        cenas = data.find_all('span', class_='ooa-epvm6 e1b25f6f8')
        for mrp in cenas:
            cena_data.append(mrp.text)

    wersjas = soup.find_all('div', class_='ooa-1nvnpye')
    for item in wersjas:
        wersja = item.find('p', class_='e1b25f6f7')
        try:
            wersja_data.append(wersja.text.strip())
        except AttributeError:  # find() returned None: the listing has no version <p>
            wersja_data.append("N/A")

    # cena = flattened_list([[mrp.text for mrp in i.find_all('span', class_='ooa-epvm6 e1b25f6f8')] for i in stuffs])
    # (duplicates the price loop above, so it is left commented out)

    # Extracting the <li> items by index, as they carry no class or id attributes.
    li_items = soup.find_all('ul', class_='e1b25f6f7')
    for li in li_items:
        ordered_lists = li.find_all('li')
        years_data.append(ordered_lists[0].text.strip())
        distance_data.append(ordered_lists[1].text.strip())
        cc_data.append(ordered_lists[2].text.strip())
        fuel_data.append(ordered_lists[-1].text.strip())
    sleep(time_interval)

    '''Earlier index-skipping approach, kept for reference:
    for k in range(0, len(cena)):
        a = [j.text for j in li_items[k]]
        years_data.append(a[0])
        if a[-1] == "Elektryczny":
            a[2] = "N/A"  # electric cars list no engine capacity

        cc_data.append(a[2])
        fuel_data.append(a[-1])
        for index in a:
            if 'km' in index.split(" "):
                distance_data.append(index)

        # skipping indexes to grab each field of the newline-separated list
        years_data.append(a[0::4])
        distance_data.append(a[1::4])
        cc_data.append(a[2::4])
        fuel_data.append(a[3::4])'''

    '''return model, wersja_data, cena, flattened_list(years), flattened_list(fuel), \
       flattened_list(distance), flattened_list(cc)'''


for m in range(first_page_number, last_page_number + 1):
    base_url = f'https://www.otomoto.pl/osobowe/od-2006?search%5Bfilter_float_year%3Ato%5D=2022&search%5Border%5D=' \
               f'created_at_first%3Adesc&page={m}&search%5Badvanced_search_expanded%5D=true'
    url_list.append(base_url)

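# The query string asks for cars registered from 2006 (od-2006) through 2022
# (filter_float_year:to=2022), sorted newest-first (created_at_first:desc).
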
# Scrape every page concurrently, filling the module-level lists above.
lightning_scraping(extract, url_list)
# extract(url_list[0])  # single-page call, useful for debugging

dict_otomoto = {'Model': model_data,
                'Wersja': wersja_data,
                'Cena': cena_data,
                'Year': skip_every_second(years_data),
                'Fuel': skip_every_second(fuel_data),
                'Distance': skip_every_second(distance_data),
                'Cubic cm': skip_every_second(cc_data),
                'Otomoto link': otomoto_url}

with open('otomoto9.json', 'w', encoding='utf-8') as f:
    json.dump(dict_otomoto, f, ensure_ascii=False, indent=4)
    print("Done!")

try:
    df_json = pd.read_json('otomoto9.json')
    df_json.to_csv('Otomoto9.csv', index=False)
except ValueError:
    # pd.read_json raises ValueError when the scraped lists have unequal
    # lengths (i.e. some listings were missing fields), so only the JSON is kept.
    print("Array error! File is saved in json format.")
    # winsound.PlaySound with SND_FILENAME supports WAV only, so the alert
    # file must be a .wav (an .mp3 would fall back to the default beep).
    winsound.PlaySound('notification.wav', winsound.SND_FILENAME)
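
# A quick sanity check once the CSV exists, e.g.:
#
#     df = pd.read_csv('Otomoto9.csv')
#     print(df.head())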