下面是在python中使用selenium webdriver抓取网站的代码。代码运行但输出显示“导出0条记录”,我在mongo中创建的集合也被删除本身。
Python代码:
from selenium import (
webdriver
)
from pymongo import (
MongoClient
)
from config import (
db_name,
collection_name,
input_csv_path
)
from tqdm import (
tqdm
)
from subprocess import (
call
)
import csv
import time
class MyntraScrape(object):
def __init__(self, input_csv_path, db_name, collection_name, path=None,
chrome=True, phantom=False):
self.input_csv = input_csv_path
if chrome is True:
self.browser = webdriver.Chrome(path)
else:
self.browser = webdriver.PhantomJS(path)
connection = MongoClient('localhost:27017')
database = connection[db_name]
self.myntra_table = database[collection_name]
def load_urls_from_csv(self, input_csv_path):
url_list = []
with open(input_csv_path, 'r') as csv_file:
reader = csv.reader(csv_file)
for row in reader:
url_list.append(row[0])
self.url_list = url_list
return self
def process_url(self):
for url in tqdm(self.url_list):
self.browser.get(url)
time.sleep(.5)
try:
Selling_Price = self.browser.find_element_by_xpath("//strong[@class='pdp-price']").text[4:]
except:
Selling_Price = None
try:
MRP = self.browser.find_element_by_xpath("//p[@class='pdp-discount-container']/s").text[4:]
except:
MRP = Selling_Price
try:
Prodct_Name = self.browser.find_element_by_xpath("//h1[@class='pdp-title']").text
except:
Prodct_Name = None
try:
Stock_Status = self.browser.find_element_by_css_selector(".size-buttons-out-of-stock").text
except:
Stock_Status = "in stock"
self.myntra_table.insert({
"selling_price": Selling_Price,
"mrp": MRP,
"url": url,
"product_name": Prodct_Name,
"stock_status": Stock_Status
})
self.browser.quit()
call("mongoexport -db scraped_output -c myntra -csv --out myntra_output.csv --fields url,product_name,mrp,selling_price,stock_status", shell=True)
self.myntra_table.drop()
obj = MyntraScrape(input_csv_path, db_name, collection_name,
path='/home/nitink/Python Linux/Important Commands/chromedriver',
chrome=True,
phantom=False)
obj.load_urls_from_csv(input_csv_path)
obj.process_url()
配置文件包含:
db_name = "mydb"
collection_name = "myntra"
input_csv_path = "/home/nitink/Python Linux/Selenium/myntra_selenium/Myntra.csv"
Myntra.csv 有以下链接:
http://www.myntra.com/jackets/vero-moda/vero-moda-black-jacket/1568991/buy?src=search&uq=false&q=vero%2520moda&p=1
http://www.myntra.com/jeans/vero-moda/vero-moda-women-blue-mid-rise-mildly-distressed-jeans/1671336/buy?src=search&uq=false&q=vero%2520moda&p=3
http://www.myntra.com/tops/vero-moda/vero-moda-women-blue--off-white-printed-top/1671348/buy?src=search&uq=false&q=vero%2520moda&p=4