所以我有这个python脚本。现在,我运行脚本,它给我一个CSV输出文件。
我想要的是:当它重新启动并检查这些输出值的更改时(在重新启动时不刷新输出文件并擦除所有以前收集的数据)
同样,每行数据大约需要3秒才能获取。有谁知道如何快速处理大型数据集?
import urllib2,re,urllib,urlparse,csv,sys,time,threading,codecs
from bs4 import BeautifulSoup
def extract(url):
try:
sys.stdout.write('0')
global file
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, 'html.parser')
product = soup.find("div", {"class": "js-product-price"})
price = product.findNext('div',{'class':'js-price-display'}).getText().strip()
oos = product.findNext('p', attrs={'class': "price-oos"})
if oos is None:
oos = 'In Stock'
else:
oos = oos.getText()
val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
ifile.write(val)
sys.stdout.write('1')
except Exception as e:
print e
#pass
return
ifile = open('output.csv', "a", 0)
ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
inputs = csv.reader(open('input.csv'))
#inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))
for i in inputs:
extract(i[0])
ifile.close()
print("finished")