我用 selenium 写了一个 Python 抓取程序。如何把网址列表中所有产品的信息都导出到 csv 文件?目前我的程序只输出了一个产品。期望的结果是 2 列(产品标题、产品价格)和 4 行:1 行表头加 3 行产品信息。这个程序必须用 selenium 和 python 来完成。非常感谢任何帮助。
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import csv
# NOTE(review): the original indentation of this script is not visible here, so
# which statements belong to the `for` body cannot be determined - TODO confirm.
# set the proxies to hide actual IP
# Maps a URL scheme to the proxy address intended for it.
proxies = {
'http': 'http://62.117.72.122:8080',
'https': 'http://185.93.3.123:8080',
}
# Amazon product pages to visit, one per product.
urls = ["https://www.amazon.com/Haggar-Hidden-Comfort-Waist-Plain/dp/B0018Q3BRO/ref=sr_1_3?ie=UTF8&qid=1518006207&sr=8-3&keywords=trousers+for+men",
"https://www.amazon.com/gp/product/B0118QC1BA/ref=s9_acsd_cdeal_hd_bw_bFmNr_c_x_w?pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-5&pf_rd_r=AZJF41VDFJMPA4XY6D95&pf_rd_t=101&pf_rd_p=32a36b64-58af-5269-b81a-c1030ee0250c&pf_rd_i=3760911",
"https://www.amazon.com/URbeauty-Aromatherapy-Peppermint-Eucalyptus-Lemongrass/dp/B078B9JRPM/ref=pd_sim_201_5?_encoding=UTF8&pd_rd_i=B078B9JRPM&pd_rd_r=2GC7WEGK2P019AMS5VPC&pd_rd_w=DURLb&pd_rd_wg=jnjzt&psc=1&refRID=2GC7WEGK2P019AMS5VPC"
]
chrome_options = webdriver.ChromeOptions()
# NOTE(review): `%s` is fed the whole `proxies` dict, so the flag value is the
# dict's string form rather than a single proxy address.
chrome_options.add_argument('--proxy-server=%s' % proxies)
# NOTE(review): only the first backslash in this path is escaped; the others rely
# on `\A`, `\D` and `\c` not being recognised string escapes. A raw string would
# make that explicit.
driver = webdriver.Chrome(executable_path="C:\\Users\Andrei\Downloads\chromedriver_win32\chromedriver.exe",
chrome_options=chrome_options)
# Visit each URL by index.
for i in range(len(urls)):
driver.get(urls[i])
# Fixed 3-second pause after requesting the page.
sleep(3)
# Look up the title and price elements by their element ids.
product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
product_price = driver.find_elements_by_xpath('//*[@id="priceblock_ourprice"][1]')
# Collect the text of every matched element.
prod_title = [x.text for x in product_title]
prod_price = [x.text for x in product_price]
csvfile = 'products-10-02.csv'
header = ['Product title', 'Product price']
# Only index 0 of each list is used, so `data` is a single row.
data = [prod_title[0], prod_price[0]]
# "w" mode truncates the file every time it is opened.
with open(csvfile, "w") as output:
writer = csv.writer(output)
writer.writerow(header)
# A single `writerow` call for the data: one product line ends up in the file.
writer.writerow(data)

答案 0 :(得分:1)
当然它只导出第一个数据行 - 这就是你明确要求的那个:
# Excerpt of the question's export step, annotated to show why the CSV ends up
# with only one product line. Logic is unchanged; only the block structure of
# the `with` statement is restored so the excerpt is valid Python.
# here - your `data` is a single row...
data = [prod_title[0], prod_price[0]]
with open(csvfile, "w") as output:
    writer = csv.writer(output)
    writer.writerow(header)
    # and here: you call `writerow` only once
    writer.writerow(data)
要导出所有收集的数据,您需要逐行循环它们:
# Write the header, then one CSV row per (title, price) pair.
# NOTE: this produces one row per product only if prod_title / prod_price hold
# an entry for every product; the question's script fills them from a single
# page, so the values must first be accumulated across all URLs.
# newline="" hands line-ending control to the csv module, which otherwise
# produces blank lines between rows on Windows.
with open(csvfile, "w", newline="") as output:
    writer = csv.writer(output)
    writer.writerow(header)
    # zip(["a", "b", "c"], [1, 2, 3]) => [("a", 1), ("b", 2), ("c", 3)]
    for row in zip(prod_title, prod_price):
        writer.writerow(row)
或者直接把整个行列表一次性传给 writer.writerows()(注意是复数形式):
# Same export as the explicit loop, but hands every (title, price) pair to the
# writer in one call.
# newline="" hands line-ending control to the csv module, which otherwise
# produces blank lines between rows on Windows.
with open(csvfile, "w", newline="") as output:
    writer = csv.writer(output)
    writer.writerow(header)
    # writerows() consumes the whole iterable, emitting one CSV row per tuple.
    writer.writerows(zip(prod_title, prod_price))
答案 1 :(得分:1)
试试这个,运行后应该会得到一个包含所需数据的 csv 文件。我从下面的脚本中去掉了 proxies,因为它在我这边无法正常工作;去掉之后脚本就能运行了。
from selenium import webdriver
import time ; import csv
# The locator-based find_elements(By..., ...) form is used instead of the
# find_element_by_* helpers, which newer Selenium releases no longer provide.
from selenium.webdriver.common.by import By

# Product pages to scrape; each one contributes a single CSV row.
urls = (
    "https://www.amazon.com/Haggar-Hidden-Comfort-Waist-Plain/dp/B0018Q3BRO/ref=sr_1_3?ie=UTF8&qid=1518006207&sr=8-3&keywords=trousers+for+men",
    "https://www.amazon.com/gp/product/B0118QC1BA/ref=s9_acsd_cdeal_hd_bw_bFmNr_c_x_w?pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-5&pf_rd_r=AZJF41VDFJMPA4XY6D95&pf_rd_t=101&pf_rd_p=32a36b64-58af-5269-b81a-c1030ee0250c&pf_rd_i=3760911",
    "https://www.amazon.com/URbeauty-Aromatherapy-Peppermint-Eucalyptus-Lemongrass/dp/B078B9JRPM/ref=pd_sim_201_5?_encoding=UTF8&pd_rd_i=B078B9JRPM&pd_rd_r=2GC7WEGK2P019AMS5VPC&pd_rd_w=DURLb&pd_rd_wg=jnjzt&psc=1&refRID=2GC7WEGK2P019AMS5VPC",
)


def _first_text(driver, css_selector):
    """Return the stripped text of the first element matching *css_selector*.

    Returns an empty string when nothing matches, so a page without (for
    example) a price block still yields a row instead of aborting the run.
    """
    matches = driver.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text.strip() if matches else ""


driver = webdriver.Chrome()
try:
    # Open the file once in "w" mode: every run produces exactly one header row
    # plus one row per URL, instead of appending duplicates to an old file.
    # newline="" lets the csv module control line endings; utf-8 keeps
    # non-ASCII characters in product titles from failing on write.
    with open('products-10-02.csv', "w", newline="", encoding="utf-8") as output:
        writer = csv.writer(output)
        writer.writerow(['Product title', 'Product price'])
        for itemlink in urls:
            driver.get(itemlink)
            # Fixed pause to give the page time to render before querying it.
            time.sleep(3)
            writer.writerow([
                _first_text(driver, '#productTitle'),
                _first_text(driver, '#priceblock_ourprice'),
            ])
finally:
    # Always close the browser, even if a page load or lookup raises.
    driver.quit()