我正在尝试将我从网站解析的所有数据转移到csv文件中,但我遇到了一些问题:
1.即使我添加了字符编码,在Excel中打开时它仍然显示为HTML,而不是纯文本:
例如:
<option redirectvalue="/partfinder/Asus/All In One/E Series/ET10B">ET10B</option>
2.所有内容都打印在同一列中,而不是按行排列
到目前为止,这是我的代码:
import string, urllib2, urlparse, csv, sys, codecs, cStringIO
from urllib import quote
from urlparse import urljoin
from bs4 import BeautifulSoup
from ast import literal_eval
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Each row is formatted by a regular ``csv.writer`` into an in-memory
    buffer; the formatted text is then re-encoded to ``encoding`` and written
    to ``f``, so ``f`` must accept byte strings (open it in binary mode).

    Parameters:
        f: file-like object with a ``write`` method that accepts bytes.
        dialect: csv dialect handed to ``csv.writer``.
        encoding: name of the encoding used for the output stream.
        **kwds: extra formatting options forwarded to ``csv.writer``.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        import io  # local import: the file header does not import io

        # On Python 2 ``str is bytes`` and csv.writer emits byte strings;
        # on Python 3 it emits text. The scratch buffer has to match.
        self._bytes_csv = str is bytes
        # Redirect output to a queue
        if self._bytes_csv:
            self.queue = io.BytesIO()
        else:
            self.queue = io.StringIO(newline="")
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Format one row and write it to the stream in the target encoding.

        Text cells are converted to the string type csv.writer expects on the
        running interpreter; every other cell (numbers, None, ...) is passed
        through unchanged so csv.writer applies its own conversion instead of
        failing on a missing ``encode`` method.
        """
        text_type = type(u"")
        cells = []
        for cell in row:
            if self._bytes_csv:
                if isinstance(cell, text_type):
                    cell = cell.encode("utf-8")
            elif isinstance(cell, bytes):
                cell = cell.decode("utf-8")
            cells.append(cell)
        self.writer.writerow(cells)
        # Fetch the formatted row from the queue ...
        data = self.queue.getvalue()
        if self._bytes_csv:
            data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue; io buffers keep their position on truncate, so rewind
        self.queue.seek(0)
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of ``rows`` by calling ``writerow`` on each."""
        for row in rows:
            self.writerow(row)
# Landing page of the series; its model <select> drives the whole crawl.
changable_url = 'http://www.asusparts.eu/partfinder/Asus/All%20In%20One/E%20Series'
page = urllib2.urlopen(changable_url)
base_url = 'http://www.asusparts.eu'
soup = BeautifulSoup(page)
selects = []
redirects = []
model_info = []

print("FETCHING OPTIONS")
select = soup.find(id='myselectListModel')
selects.append(select)
for item in selects:
    print(item.get_text())

# Every <option> that has a 'redirectvalue' points at one model page.
options = select.findAll('option')
for option in options:
    if option.has_attr('redirectvalue'):
        redirects.append(option['redirectvalue'])

for r in redirects:
    rpage = urllib2.urlopen(urljoin(base_url, quote(r)))
    s = BeautifulSoup(rpage)

    # Fetching the main title for each specific model and printing it out
    print("FETCHING MAIN TITLE")
    maintitle = s.find(id='puffBreadCrumbs')
    model_info.append(maintitle)
    print(maintitle.get_text())

    datas = s.find(id='accordion')
    a = datas.findAll('a')
    content = datas.findAll('span')

    print("FETCHING CATEGORY")
    for data in a:
        if data.has_attr('onclick'):
            # Turn the argument list of the onclick JavaScript call into a
            # Python tuple: drop the trailing ', this' and keep what follows
            # the first '('.
            arguments = literal_eval('(' + data['onclick'].replace(', this', '').split('(', 1)[1])
            print(arguments)

    # Retrieves Part number and Price
    print("FETCHING DATA")
    for complete in content:
        if complete.has_attr('class'):
            print(complete.get_text())

    print("FETCHING IMAGES")
    img = s.find('td')
    images = img.findAll('img')
    model_info.append(images)
    print(images)

# Open the CSV only while writing so the handle is always closed. The former
# second handle ('cr', the same file opened "rb" and wrapped in a writer) was
# never used and has been dropped.
with open(r"asus_stock.csv", "wb") as csv_file:
    c = UnicodeWriter(csv_file)
    for select_tag in selects:
        # Write the text of each option, not the Tag objects themselves;
        # writing the tags is what put raw HTML into the file.
        c.writerow([opt.get_text() for opt in select_tag.findAll('option')])
我该如何修改,才能让它输出:
1. 文本而不是HTML
2. 按行排列而不是全部在一列
[编辑] 这就是我想要显示CSV文件的方式以及要返回的值的示例
"Brand Name" "CategoryID" "ModelID" "Family" "Name" "Part Number" "Price" "Image src"
Asus | AC Adapter | ET1602 | E Series | Power Cord 3P L:80CM,UK(B) | 14G110008350 |14.77 | image src
[新编辑]
这些是打印值的输出:
# Look up the model <select>, remember it, and print its visible text.
print("FETCHING OPTIONS")
select = soup.find(id='myselectListModel')
selects.append(select)
for item in selects:
    print(item.get_text())
输出:
ET10B ET1602 ET1602C etc..
获取主标题:
# Find the element with id 'puffBreadCrumbs', keep it, and print its text.
print("FETCHING MAIN TITLE")
maintitle = s.find(id='puffBreadCrumbs')
model_info.append(maintitle)
print(maintitle.get_text())
输出:
Asus - All In One - E Series - ET10B
获取类别
# Collect the links and spans inside the element with id 'accordion'.
datas = s.find(id='accordion')
a = datas.findAll('a')
content = datas.findAll('span')
print("FETCHING CATEGORY")
for data in a:
    if data.has_attr('onclick'):
        # Turn the argument list of the onclick JavaScript call into a Python
        # tuple: drop the trailing ', this' and keep what follows the first '('.
        arguments = literal_eval('(' + data['onclick'].replace(', this', '').split('(', 1)[1])
        print(arguments)
输出:
FETCHING CATEGORY
('Asus', 'AC Adapter', 'ET10B', '6941', 'E Series')
('Asus', '04G265003580')
('Asus', '14G110008340')
('Asus', 'Bracket', 'ET10B', '7138', 'E Series')
('Asus', 'Cable', 'ET10B', '6983', 'E Series')
('Asus', 'Camera', 'ET10B', '6985', 'E Series')
('Asus', 'Cooling', 'ET10B', '6999', 'E Series')
('Asus', 'Cover', 'ET10B', '6984', 'E Series')
etc..
获取名称:
# Print the text of the first <b> element found on the page.
print("FETCHING NAME")
name = s.find('b').get_text()
print(name)
输出:
电源适配器65W19V 3PIN
获取零件编号和价格
# Print the text of every <span> in the accordion that has a class attribute.
print("FETCHING PART NUMBER AND PRICE (inc. VAT)")
for complete in content:
    if complete.has_attr('class'):
        print(complete.get_text())
输出:
FETCHING PART NUMBER AND PRICE (inc. VAT)
Part number: 04G265003580
Remote stock
38.09:- EUR
获取图像
# Gather the <img> tags inside the first <td> of the page, keep and print them.
print("FETCHING IMAGES")
img = s.find('td')
images = img.findAll('img')
model_info.append(images)
print(images)
输出:
FETCHING IMAGES
[<img alt="" src="/images/Articles/thumbs/04G265003580_thumb.jpg"/>]
答案 0 :(得分:0)
您需要进一步处理 selects 列表;不要把整个列表直接写入CSV文件,而是只把 option 标签的文本写入CSV文件:
# Write one CSV row per <select>, holding only the text of its options.
# get_text() always returns a string, whereas .string can be None, which the
# writer's per-cell encode() call cannot handle.
for select in selects:
    c.writerow([opt.get_text() for opt in select.find_all('option')])