我编写了一个程序,该程序使用Selenium Webdriver打开Chrome并针对特定查询抓取Craigslist(在我的情况下是Mazda Miata)。
我已让它打印出每条帖子的标题、发布日期、商品价格,并显示商品的网址。
这是代码。
import csv
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#import schedule
cars = []  # NOTE(review): never read or written anywhere in this script — candidate for removal
class CraigslistScaper(object):
    """Scrape a Craigslist cars+trucks search for a given query.

    Drives Chrome via Selenium to load the search-results page, then extracts
    each result's title, posting date, price, and URL. `save_to_csv` writes
    one row per listing with Title / Date / Price / URL columns.
    """

    def __init__(self, query, location, max_price, transmission):
        """Build the search URL and launch the Chrome driver.

        Args:
            query: search text, e.g. "Mazda Miata" (URL-encoded automatically).
            location: Craigslist region subdomain, e.g. "sfbay".
            max_price: maximum price filter (string or int).
            transmission: Craigslist auto_transmission flag (1 = automatic).
        """
        self.query = query
        self.location = location
        self.max_price = max_price
        # BUG FIX: the original assigned the module-level global
        # `auto_transmission` instead of the `transmission` parameter.
        self.transmission = transmission
        # quote_plus() encodes spaces and punctuation so multi-word
        # queries like "Mazda Miata" form a valid URL.
        self.url = (
            "https://{}.craigslist.org/search/cta?"
            "query={}&sort=rel&max_price={}&auto_transmission={}".format(
                self.location,
                urllib.parse.quote_plus(str(self.query)),
                self.max_price,
                self.transmission,
            )
        )
        self.driver = webdriver.Chrome('/Users/MohitAsthana/Desktop/chromedriver')
        self.delay = 5  # seconds to wait for the results page to render

    def load_craigslist_url(self):
        """Open the search URL and wait until the results page is ready."""
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_element_located((By.ID, "searchform")))
            print("page is ready")
        except TimeoutException:
            # BUG FIX: Selenium raises TimeoutException, not the builtin
            # TimeoutError, so the original except clause could never fire.
            print('Loading took too much time')

    def extract_post_information(self):
        """Parse every result row on the page.

        Returns:
            A list of (title, date, price) tuples, one per result row.
            Each record is also printed, preserving the original behavior.
        """
        all_posts = self.driver.find_elements_by_class_name('result-row')
        records = []
        for post in all_posts:
            # Row text looks like "$<price>\n...<Mon> <day> <title words...>";
            # split on '$' to separate the price from the rest.
            parts = post.text.split('$')
            text = parts[1] if parts[0] == '' else parts[0]
            lines = text.split("\n")
            price = lines[0]
            words = lines[-1].split(' ')
            date = words[0] + " " + words[1]
            title = ' '.join(words[2:])
            print('PRICE: ' + price)
            print('TITLE: ' + title)
            print('DATE: ' + date)
            records.append((title, date, price))
        return records

    def extract_post_urls(self):
        """Return the href of every result-title link on the current page."""
        url_list = []
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
            print(link.get('href'))
            # BUG FIX: the original appended the list to itself
            # (url_list.append(url_list)) instead of the href.
            url_list.append(link.get('href'))
        return url_list

    def save_to_csv(self, path='craigslist_results.csv'):
        """Write the current results to *path* as CSV.

        Columns: A=Title, B=Date, C=Price, D=URL — one row per listing.
        """
        records = self.extract_post_information()
        urls = self.extract_post_urls()
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['Title', 'Date', 'Price', 'URL'])
            for (title, date, price), url in zip(records, urls):
                writer.writerow([title, date, price, url])

    def quit(self):
        """Close the browser window."""
        self.driver.close()
# --- Script entry point: search sfbay for automatic-transmission Mazda Miatas ---
location = "sfbay"
#postal = "94539"
max_price = "25000"
#radius = "250"
auto_transmission = 1  # 1 = Craigslist "automatic transmission" filter
query = "Mazda Miata"

scraper = CraigslistScaper(query, location, max_price, auto_transmission)
try:
    scraper.load_craigslist_url()
    scraper.extract_post_information()
    scraper.extract_post_urls()
finally:
    # BUG FIX: always close the browser, even when scraping raises part-way
    # through — the original leaked the Chrome window on any exception.
    scraper.quit()
我想将所有结果保存到CSV文件中,其中A列显示标题,B列是日期,C列是价格,D列是Craigslist帖子的URL。
我使用此脚本的目标是最终使其自动化,使其每天运行,并提供带有所有马自达Miata价格的更新的CSV文件。然后,我可以绘制历史数据,甚至可以将其用作训练数据集来预测马自达Miata的价格。
我的问题是如何将标题,日期,价格和Craigslist URL保存到CSV中?