不完全确定这里出了什么问题,但我的代码似乎工作得很好,但经过一段时间后,抓取停止了,我不确定该怎么做。
背景:我想抓取房地产列表的所有个人特征。所有的 URL 都基于从网站的每个页面上抓取的 hrefs,并且似乎都运行良好,并且都是唯一的 URL。以“list_price”为例,下面的代码抓取所有 list_prices 直到它随机停止在任何给定的数字(例如,看到它在 11、23、7 等之后停止。使用了不同的 URL)但所有的值被刮是正确的。
代码:
# Create arrays for all of the information that will be scraped from the website
property_sizes = []
descriptions = []
list_prices = []
bedrooms = []
bathrooms = []
property_types = []
lot_sizes = []
years_built = []
titles = []
styles = []
features = []
amenities = []
appliances = []
communities = []
days_on_rew = []
# Initialise all of the features as NaN to keep the indexing correct when appending to the arrays
property_size = None
description = None
list_price = None
num_bedrooms = None
num_bathrooms = None
property_type = None
lot_size = 0
year_built = None
title = None
style = None
the_features = None
the_amenities = None
the_appliances = None
community = None
days_on_website = None
# Use for loop to index through each url and scrape the information off of each page
for idx in range(len(listings_df)):
# Alias the hrefs and add with the original url to get the link
link = listings_df["url"][idx]
# Use links with Selenium and start scraping the content off of the webpages
driver.get(link)
# Find out if the tags "tr" are present as they contain most of the features before 10 seconds
tag_present = EC.presence_of_element_located((By.TAG_NAME, "tr"))
WebDriverWait(driver, timeout).until(tag_present)
# Find the property size and clean the text
try:
unclean_property_size = driver.find_element_by_class_name("listingheader").text
clean_property_size = unclean_property_size.replace("\n", " ")
property_size = next(iter(re.findall(r'[0-9]{3,4}[ ][S][q][f][t]',
clean_property_size)), None).replace(" Sqft", "")
except NoSuchElementException:
pass
# Find the description and clean the text
try:
description = driver.find_element_by_class_name("listingoverview").text
description = re.sub(r'[^\w\s]', "", description)
except NoSuchElementException:
pass
# Get the <tr> tags to get each individual line of information that is wanted
body_lines = driver.find_elements_by_tag_name("tr")
# Loop through each line that is found in the tags and alias the values accordingly
for idx in range(len(body_lines)):
# Convert all to strings first
str_body_lines = str(body_lines[idx].text)
# Split the values from its features on the website
split_body_lines = str_body_lines.split("\n")
# Listing Price
if split_body_lines[0].lower() == "list price":
unclean_list_price = str(split_body_lines[1])
# Number of Bedrooms
elif split_body_lines[0] == "bedrooms":
num_bedrooms = int(split_body_lines[1])
# Number of Bathrooms
elif split_body_lines[0] == "bathrooms":
num_bathrooms = int(split_body_lines[1])
# Property Type
elif split_body_lines[0] == "property type":
property_type = str(split_body_lines[1])
# Lot Size
elif split_body_lines[0] == "lot size":
unclean_lot_size = str(split_body_lines[1])
# Year Built
elif split_body_lines[0] == "year built":
unclean_year_built = str(split_body_lines[1])
# Title/type of property ownership
elif split_body_lines[0] == "title":
title = str(split_body_lines[1])
# Style of the Property
elif split_body_lines[0] == "style":
style = str(split_body_lines[1])
# Features
elif split_body_lines[0] == "features":
the_features = str(split_body_lines[1])
# Amenities found in/around the property
elif split_body_lines[0] == "amenities":
the_amenities = str(split_body_lines[1])
# Appliances
elif split_body_lines[0] == "appliances":
the_appliances = str(split_body_lines[1])
# Community/neighbourhood the property is situated in
elif split_body_lines[0] == "community":
community = str(split_body_lines[1])
# Days on the REW website
elif split_body_lines[0] == "days on rew":
unclean_days_on_website = str(split_body_lines[1])
# Pass if the line is none of the above choices
else:
continue
# Remove any punctuation and unwanted strings in features being extracted
try:
list_price = re.sub(r'[^\d]', "", unclean_list_price)
except:
list_price == None
try:
year_built = next(iter(re.findall(r'[0-9]{4}', unclean_year_built)), None)
except:
year_built == None
try:
days_on_website = re.sub(r'[^\d]', "", unclean_days_on_website)
except:
days_on_website == None
try:
lot_size = next(iter(re.findall(r'[0-9]{4}', unclean_lot_size)), None)
lot_size = int(lot_size)
except:
lot_size == 0
# Append all of the values to the respective arrays
property_sizes.append(property_size)
descriptions.append(description)
list_prices.append(int(list_price))
bedrooms.append(num_bedrooms)
bathrooms.append(num_bathrooms)
property_types.append(property_type)
lot_sizes.append(lot_size)
years_built.append(int(year_built))
titles.append(title)
styles.append(style)
features.append(the_features)
amenities.append(the_amenities)
appliances.append(the_appliances)
communities.append(community)
days_on_rew.append(int(days_on_website))