记录未导出到mongodb?

时间:2017-01-04 12:38:41

标签: python mongodb selenium

下面是在python中使用selenium webdriver抓取网站的代码。代码运行但输出显示“导出0条记录”,我在mongo中创建的集合也被删除本身。

Python代码:

from selenium import (
    webdriver
)
from pymongo import (
    MongoClient
)
from config import (
    db_name,
    collection_name,
    input_csv_path
)
from tqdm import (
    tqdm
)

from subprocess import (
call
)

import csv
import time


class MyntraScrape(object):
    def __init__(self, input_csv_path, db_name, collection_name, path=None,
                 chrome=True, phantom=False):
        self.input_csv = input_csv_path
        if chrome is True:
            self.browser = webdriver.Chrome(path)
        else:
            self.browser = webdriver.PhantomJS(path)

        connection = MongoClient('localhost:27017')
        database = connection[db_name]
        self.myntra_table = database[collection_name]

    def load_urls_from_csv(self, input_csv_path):
        url_list = []
        with open(input_csv_path, 'r') as csv_file:
            reader = csv.reader(csv_file)
            for row in reader:
                url_list.append(row[0])
        self.url_list = url_list
        return self

    def process_url(self):
        for url in tqdm(self.url_list):
            self.browser.get(url)
            time.sleep(.5)
            try:
                Selling_Price = self.browser.find_element_by_xpath("//strong[@class='pdp-price']").text[4:]
            except:
                Selling_Price = None
            try:
                MRP = self.browser.find_element_by_xpath("//p[@class='pdp-discount-container']/s").text[4:]
            except:
                MRP = Selling_Price
            try:
                Prodct_Name = self.browser.find_element_by_xpath("//h1[@class='pdp-title']").text
            except:
                Prodct_Name = None

            try:
                Stock_Status = self.browser.find_element_by_css_selector(".size-buttons-out-of-stock").text
            except:
                Stock_Status = "in stock"



            self.myntra_table.insert({
                "selling_price": Selling_Price,
                "mrp": MRP,
                "url": url,
                "product_name": Prodct_Name,
                "stock_status": Stock_Status
            })
        self.browser.quit()
        call("mongoexport -db scraped_output -c myntra -csv --out myntra_output.csv --fields url,product_name,mrp,selling_price,stock_status", shell=True)
        self.myntra_table.drop()
obj = MyntraScrape(input_csv_path, db_name, collection_name,
                     path='/home/nitink/Python Linux/Important Commands/chromedriver',
                     chrome=True,
                     phantom=False)

obj.load_urls_from_csv(input_csv_path)
obj.process_url()

配置文件包含:

db_name = "mydb"
collection_name = "myntra"
input_csv_path = "/home/nitink/Python Linux/Selenium/myntra_selenium/Myntra.csv"

Myntra.csv 有以下链接:

http://www.myntra.com/jackets/vero-moda/vero-moda-black-jacket/1568991/buy?src=search&uq=false&q=vero%2520moda&p=1
http://www.myntra.com/jeans/vero-moda/vero-moda-women-blue-mid-rise-mildly-distressed-jeans/1671336/buy?src=search&uq=false&q=vero%2520moda&p=3
http://www.myntra.com/tops/vero-moda/vero-moda-women-blue--off-white-printed-top/1671348/buy?src=search&uq=false&q=vero%2520moda&p=4

0 个答案:

没有答案