我写了这段代码,使用页面 URL 抓取亚马逊页面上的某些元素。现在我想添加一个 CSV 功能,把以下变量作为新列横向追加到 CSV 中:(Date_time、price、Merchant、Sellers_count)。每次运行代码时,这些列都应追加到现有列的右边,而不删除任何已有的列。下面是代码和要追加的表格格式。
# -*- coding: cp1252 -*-
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import requests, csv, time, urllib2, gspread, os, ast, datetime
from scrapy import Selector as s
from lxml import html
from random import randint
from oauth2client.client import SignedJwtAssertionCredentials
x = lambda x: source.xpath(x).extract()
links = ['http://www.amazon.com/dp/B00064NZCK',
'http://www.amazon.com/dp/B000CIU7F8',
'http://www.amazon.com/dp/B000H5839I',
'http://www.amazon.com/dp/B000LTLBHG',
'http://www.amazon.com/dp/B000SDLXKU',
'http://www.amazon.com/dp/B000SDLXNC',
'http://www.amazon.com/dp/B000SPHPWI',
'http://www.amazon.com/dp/B000UUMHRE']
driver = webdriver.Firefox()
#driver.set_page_load_timeout(30)
for Url in links:
try:
driver.get(Url)
except:
pass
time.sleep(randint(1,3))
try:
html = driver.page_source
source = s(text=html,type="html")
except:
pass
try:
Page_link = x('//link[@rel="canonical"]//@href')
except:
pass
try:
Product_Name = x('//span[@id="productTitle"]/text()')
except:
pass
Product_Name = str(Product_Name).encode('utf-8'); Product_Name = Product_Name.replace("[u'","").replace("']","")
try:
price = x('//span[@id="priceblock_ourprice"]//text()')
except:
pass
try:
Merchant = x('//div[@id="merchant-info"]//a//text()')
except:
pass
try:
Sellers_count = x('//span[@class="olp-padding-right"]//a/text()')
except:
pass
if Merchant == []:
Merchant = 'Amazon'
else:
Merchant = Merchant[0]
price = str(price).replace("[u'","").replace("']","")
if len(Sellers_count)>0:
Sellers_count = Sellers_count[0].encode('utf-8')
else:
Sellers_count = str(Sellers_count).encode('utf-8')
try:
Sellers_count = Sellers_count.replace("Â new",""); Sellers_count = int(Sellers_count)-1
except:
pass
if Sellers_count == []:
Sellers_count = str(Sellers_count).replace("[]","")
else:
Sellers_count = Sellers_count
Date_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
print Date_time, Product_Name, Url, price, Merchant, Sellers_count
我要追加的现有表格格式: -
ASIN ID PRODUCT URL
B00064NZCK MG-5690 BigMouth Inc Over The Hill Parking Privelege Permit http://www.amazon.com/dp/B00064NZCK
B000CIU7F8 BM1102 BigMouth Inc Pocket Disgusting Sounds Machine http://www.amazon.com/dp/B000CIU7F8
B000H5839I MG-4774 BigMouth Inc All Occasion Over The Hill Cane http://www.amazon.com/dp/B000H5839I
B000LTLBHG BM1234 BigMouth Inc Beer Belt / 6 Pack Holster(Black) http://www.amazon.com/dp/B000LTLBHG
B000SDLXKU BM1103 BigMouth Inc Covert Clicker http://www.amazon.com/dp/B000SDLXKU
B000SDLXNC BM1254 BigMouth Inc Inflatable John http://www.amazon.com/dp/B000SDLXNC
B000SPHPWI SO:AP Design Sense Generic Weener Kleener Soap http://www.amazon.com/dp/B000SPHPWI
B000UUMHRE MG-5305 BigMouth Inc Over the Hill Rectal Thermometer http://www.amazon.com/dp/B000UUMHRE
答案 0(得分:0)
您必须阅读已有的CSV并写一个包含您添加的列的新文件,这里有一个示例:
# Read the CSV you already have ('your.csv') and write a new copy
# ('new.csv') whose rows carry the extra values appended on the right.
# The original snippet had the two file roles swapped (it truncated
# 'your.csv' by opening it for writing), omitted the comma separators,
# and misspelled Product_Name — all three are fixed here.
with open('your.csv', 'r') as in_file:
    with open('new.csv', 'w') as out_file:
        for line in in_file:
            out_file.write(line.rstrip('\n') + ',' + Date_time + ',' + Product_Name + '\n')
显然,你还需要自己处理标题行(我猜是文件的第一行)。
希望我帮助你
答案 1(得分:0)
以下应该做你需要的。它读入您现有的CSV文件并添加四个新的列标题。然后,对于每个URL,您的代码将获取新数据。然后将其添加到现有行的末尾(顺序无关紧要)。然后,创建更新的CSV文件:
import csv

# The URL is the join key: it appears both in the scrape list below and
# in the fourth column of the existing spreadsheet.
links = ['http://www.amazon.com/dp/B00064NZCK',
         'http://www.amazon.com/dp/B000CIU7F8',
         'http://www.amazon.com/dp/B000H5839I',
         'http://www.amazon.com/dp/B000LTLBHG',
         'http://www.amazon.com/dp/B000SDLXKU',
         'http://www.amazon.com/dp/B000SDLXNC',
         'http://www.amazon.com/dp/B000SPHPWI',
         'http://www.amazon.com/dp/B000UUMHRE']

# Read the existing CSV file: header row first, then the data rows, and
# extend the header with the four new column names.
with open('existing.csv', 'r') as existing_file:
    reader = csv.reader(existing_file)
    headers = next(reader) + ["Date_time", "price", "Merchant", "Sellers_count"]
    data_rows = list(reader)

# Index the rows by their URL column so neither the order of the file nor
# extra entries matter when the scraped values are attached.
row_for_url = {row[3]: index for index, row in enumerate(data_rows)}

for link in links:
    # Insert your existing scraping code here to get the actual data.
    Date_time = "2015-08-27_12-34-56"
    price = "123.45"
    Merchant = "Def"
    Sellers_count = "42"
    data_rows[row_for_url[link]].extend([Date_time, price, Merchant, Sellers_count])

# Write the updated CSV (old columns plus the new ones) to a new file.
# 'wb' keeps the Python 2 csv module from emitting blank lines on Windows.
with open('updated.csv', 'wb') as updated_file:
    writer = csv.writer(updated_file)
    writer.writerow(headers)
    writer.writerows(data_rows)