我为 AutoTrader 网站编写了一个网页爬虫,但不知为何,在遍历 URL 时,我的数据帧(DataFrame)长度最多只能达到 1300 行。每页有 13 条结果,也就是说只抓取到了 100 页。这个 100 页的上限是有某种原因,还是我哪里做错了?任何帮助都将不胜感激 :) 我的代码附在下面。
# Import required libraries
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
# Build the list of search-result page URLs to scrape.
# `path` is the search URL with an empty trailing `page=` parameter; the page
# number is appended to it.
path = 'https://www.autotrader.co.uk/car-search?advertClassification=standard&postcode=RH104JJ&make=&price-from=500&price-to=100000&onesearchad=Used&onesearchad=Nearly%20New&onesearchad=New&advertising-location=at_cars&is-quick-search=TRUE&page='
# Pages 1..499 inclusive.
# NOTE(review): presumably the site stops serving new results after some page
# count, in which case the later URLs add nothing — confirm against the site.
urls = [path + str(page) for page in range(1, 500)]
# Accumulators for the scraped data: one independent, initially empty list
# per column of the final table.
(makes, prices, ratings, dates, types, miles,
 litres, bhps, transmissions, fuels, owners) = ([] for _ in range(11))

# Every column list except `prices`, collected so they can be handled as a
# group.
attributes = [
    makes,
    ratings,
    dates,
    types,
    miles,
    litres,
    bhps,
    transmissions,
    fuels,
    owners,
]
# Iterate through urls, scraping every car listing on each result page.
#
# Each car contributes exactly one entry to every column list (an empty
# string when a value is not found), so all lists stay the same length and
# row i of every list describes the same car.

# Body styles recognised in a listing's key-spec entries (built once, not
# once per spec).
typelist = ['hatchback', 'saloon', 'convertible', 'coupe', 'suv', 'mpv', 'estate', 'limousine',
            'pickup']

for page_number, url in enumerate(urls, start=1):
    # Progress indicator: print every 10th page number.
    if page_number % 10 == 0:
        print(page_number)

    # Attempt to connect to the url. On failure skip this page: there is no
    # fresh response to parse, and re-parsing the previous page's response
    # would duplicate its rows.
    try:
        response = get(url, timeout=30)  # timeout so one stalled request cannot hang the run
    except Exception:  # best-effort scrape: any request error just skips the page
        print('oops')
        continue
    html_soup = BeautifulSoup(response.text, 'html.parser')

    # Get a list of individual cars and iterate through it
    car_containers = html_soup.find_all('li', class_='search-page__result')
    for container in car_containers:
        # Rating: text of the indicator nested in the tooltip, '' if absent.
        tooltip = container.find("div", {"class": "js-tooltip"})
        indicator = None
        if tooltip is not None:
            indicator = tooltip.find("div", {"class": "pi-indicator js-tooltip-trigger"})
        rating = indicator.text.strip() if indicator is not None else ''

        # Make: first word of the title-cased heading, '' if no heading.
        heading = container.h2
        make = heading.text.strip().title().split(' ')[0] if heading is not None else ''

        # Price: element text without its first character
        # (presumably the currency symbol — confirm against the page).
        price_div = container.find("div", {"class": "vehicle-price"})
        price = price_div.text[1:] if price_div is not None else ''

        # Spec-derived fields default to '' and keep the FIRST matching spec,
        # so a car can never add more than one value to a column.
        car = dict.fromkeys(
            ('date', 'mile', 'litre', 'transmission', 'bhp', 'fuel', 'owner', 'type'), '')

        spec_list = container.find("ul", {"class": "listing-key-specs"})
        specs = spec_list.find_all("li", recursive=True) if spec_list is not None else []
        for spec in specs:
            text = spec.text
            # Match on the visible text only, so markup such as the <li> tag
            # itself cannot satisfy a substring test.
            lowered = text.lower()
            first_word = text.split(' ')[0]

            found = {}
            # A leading four-digit number is taken as the year.
            if first_word.isdigit() and len(first_word) == 4:
                found['date'] = first_word
            if 'mile' in lowered:
                found['mile'] = first_word
            # Engine size: a (possibly decimal) number followed by a final 'L'.
            if (lowered.endswith('l') and text[:-1].replace('.', '').isnumeric()
                    and not first_word.isdigit()):
                found['litre'] = text[:-1]
            if any(x in lowered for x in ['automatic', 'manual']):
                found['transmission'] = text
            if any(x in lowered for x in ['bhp', 'ps']):
                found['bhp'] = text
            if any(x in lowered for x in ['petrol', 'diesel']):
                found['fuel'] = text
            if 'owner' in lowered:
                found['owner'] = first_word
            if any(x in lowered for x in typelist):
                found['type'] = text

            for field, value in found.items():
                if not car[field]:
                    car[field] = value

        # Append one value per column for this car.
        ratings.append(rating)
        makes.append(make)
        prices.append(price)
        dates.append(car['date'])
        miles.append(car['mile'])
        litres.append(car['litre'])
        transmissions.append(car['transmission'])
        bhps.append(car['bhp'])
        fuels.append(car['fuel'])
        owners.append(car['owner'])
        types.append(car['type'])
# Assemble the scraped column lists into a single table, one column per list.
# pandas requires every list to have the same length.
df = pd.DataFrame(
    {
        'makes': makes,
        'Price': prices,
        'Rating': ratings,
        'Year': dates,
        'Type': types,
        'Miles': miles,
        'Litres': litres,
        'BHP': bhps,
        'Transmission': transmissions,
        'Fuel': fuels,
        'Owners': owners,
    }
)
答案 0 :(得分:0)
如果问题是 URL 太长造成的,也许可以试试使用 URL 缩短服务。