import json
import pandas as pd
from bs4 import BeautifulSoup
import requests
import itertools
from time import sleep
import concurrent.futures
import random
import winsound  # Windows-only module, used for the end-of-run notification sound
def get_ua():
    """Return a random desktop browser User-Agent string."""
    uastrings = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36",
    ]
    return random.choice(uastrings)
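# requests would otherwise send "python-requests/<version>" as its User-Agent,
# which many sites block outright; rotating real browser strings sidesteps that.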
# The User-Agent header is built per request inside extract(), so each request
# gets a fresh random browser string.

def flattened_list(lists):
    # Flatten a list of lists into one flat list.
    flat = list(itertools.chain(*lists))
    return flat
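# e.g. flattened_list([['a', 'b'], ['c']]) -> ['a', 'b', 'c']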
def skip_every_second(data_lists):
    # Keep every other element; used below to drop duplicated scrape results.
    var = data_lists[::2]
    return var
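# e.g. skip_every_second(['2015', '2015', '2018', '2018']) -> ['2015', '2018']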
# Using concurrency for faster scraping: fetch several result pages in parallel.
def lightning_scraping(func, urls):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Forcing the lazy map iterator into a list surfaces worker exceptions here.
        executions = list(executor.map(func, urls))
    return executions
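# Note: executor.map preserves input order, and with no max_workers argument
# Python chooses a default based on the CPU count. A hedged sketch if you want
# to cap concurrency (the limit of 5 is an arbitrary example, not a tuned value):
#
#     with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
#         executions = list(executor.map(func, urls))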
# Shared accumulators -- every worker thread appends into these lists.
url_list = []
model_data = []
wersja_data = []     # trim/version ("wersja" is Polish for "version")
cena_data = []       # price ("cena" is Polish for "price")
years_data = []
distance_data = []   # mileage
cc_data = []         # engine displacement
fuel_data = []
otomoto_url = []

# Scrape result pages first_page_number..last_page_number, sleeping
# time_interval seconds around each request to stay polite.
first_page_number = 1
last_page_number = 50
time_interval = 5
def extract(url):
    # Fresh random User-Agent on every request.
    headers = {"User-Agent": get_ua()}
    r = requests.get(url, headers=headers)
    sleep(time_interval)
    soup = BeautifulSoup(r.content, 'lxml')
    stuffs = soup.find_all('main', class_='ooa-p2z5vl e19uumca5')  # this contains the whole table data
    # Direct link to each individual listing.
    for_otomoto_link = soup.find_all('div', class_='ooa-1nvnpye e1b25f6f5')
    for oto in for_otomoto_link:
        link = oto.find('a').get('href')
        otomoto_url.append(link)
    # Model names (<h2>) and prices live inside the results table.
    for data in stuffs:
        for car_text in data.find_all('h2'):
            model_data.append(car_text.text)
        for mrp in data.find_all('span', class_='ooa-epvm6 e1b25f6f8'):
            cena_data.append(mrp.text)
    # Trim/version line; some listings omit it, hence the "N/A" fallback.
    wersjas = soup.find_all('div', class_='ooa-1nvnpye')
    for item in wersjas:
        wersja = item.find('p', class_='e1b25f6f7')
        try:
            wersja_data.append(wersja.text.strip())
        except AttributeError:  # wersja is None when no version line is present
            wersja_data.append("N/A")
    # The remaining fields sit in <li> items without any class or id attributes,
    # so extract them by index: year, mileage, engine displacement, and (last) fuel type.
    li_items = soup.find_all('ul', class_='e1b25f6f7')
    for li in li_items:
        ordered_lists = li.find_all('li')
        years_data.append(ordered_lists[0].text.strip())
        distance_data.append(ordered_lists[1].text.strip())
        cc_data.append(ordered_lists[2].text.strip())
        fuel_data.append(ordered_lists[-1].text.strip())
    sleep(time_interval)
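# A note on the shared lists: list.append is atomic under CPython's GIL, so
# concurrent appends are safe, but alignment across the different lists relies
# on every page parsing completely -- a partially failed page can leave the
# columns with unequal lengths, which is why the CSV step below is guarded.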
# Build the result-page URLs: cars from 2006 to 2022, newest listings first.
for m in range(first_page_number, last_page_number + 1):
    base_url = f'https://www.otomoto.pl/osobowe/od-2006?search%5Bfilter_float_year%3Ato%5D=2022&search%5Border%5D=' \
               f'created_at_first%3Adesc&page={m}&search%5Badvanced_search_expanded%5D=true'
    url_list.append(base_url)
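# URL-decoded, the query string reads:
#   search[filter_float_year:to]=2022&search[order]=created_at_first:desc
#   &page=<m>&search[advanced_search_expanded]=true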
# Fan the page URLs out across the thread pool; each extract() call fills the
# shared lists above.
lightning_scraping(extract, url_list)
# extract(url_list[0])  # handy for debugging a single page without threads
# Assemble the columns; the <ul> fields are picked up twice per listing by the
# selector above, hence skip_every_second.
dict_otomoto = {'Model': model_data,
                'wersja': wersja_data,
                'Cena': cena_data,
                'Year': skip_every_second(years_data),
                'Fuel': skip_every_second(fuel_data),
                'Distance': skip_every_second(distance_data),
                'Cubic cm': skip_every_second(cc_data),
                'Otomoto link': otomoto_url}

# ensure_ascii=False keeps Polish characters readable in the JSON output.
with open('otomoto9.json', 'w', encoding='utf-8') as f:
    json.dump(dict_otomoto, f, ensure_ascii=False, indent=4)
print("Done!")
# Converting to CSV fails when the columns ended up with unequal lengths.
try:
    df_json = pd.read_json("otomoto9.json")
    df_json.to_csv('Otomoto9.csv', index=False)
except Exception:
    print("Array error! File is saved in json format.")
# Signal that the run is finished. Note: winsound only plays .wav files, so an
# .mp3 here will not play as intended -- point this at a WAV file in practice.
winsound.PlaySound('notification.mp3', winsound.SND_FILENAME)