试图从 eBay 搜索页面中提取一些信息。脚本运行时无法导出到 JSON 文件,但至少可以使用我告诉它创建的标题来创建 Excel 工作表。任何帮助将不胜感激。经过进一步调查:BeautifulSoup 不能引用 "iid" 属性吗?
div class="lvpic pic img left" iid="173992820900"
from bs4 import BeautifulSoup
import datetime
from tinydb import TinyDB, Query
import urllib3
import xlsxwriter
# Suppress the InsecureRequestWarning emitted for unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Starting search-results page to crawl (sunglasses under $10, condition filter).
url = 'https://www.ebay.com/e/fashion/sunglasses-under-10?rt=nc&LH_ItemCondition=3'
# Running count of records newly inserted into the TinyDB store this run.
total_added = 0
def make_soup(url):
    """Fetch *url* over HTTP and return the parsed BeautifulSoup tree."""
    pool = urllib3.PoolManager()
    response = pool.request("GET", url)
    return BeautifulSoup(response.data, 'lxml')
def main(url):
    """Crawl eBay result pages starting at *url*, storing unseen items
    in db.json, then export everything to an Excel workbook.

    Follows pagination until no next-page link is found.
    """
    global total_added
    db = TinyDB("db.json")
    while url:
        print("Web Page: ", url)
        soup = soup_process(url, db)
        # Bug fix: eBay renders its pagination control as an
        # <a rel="next"> anchor, not a <link rel="next"> element, so
        # searching for "link" never matched and only the first page
        # was ever processed.
        nextlink = soup.find("a", rel="next")
        url = False
        if nextlink:
            url = nextlink['href']
    print("Added ", total_added)
    make_excel(db)
def soup_process(url, db):
    """Parse one search-results page, insert any previously-unseen
    listings into *db*, and return the soup so the caller can look for
    the next-page link.
    """
    global total_added
    soup = make_soup(url)
    for listing in soup.find_all("li", class_="lvresult"):
        try:
            rec = {
                'ebaynum': listing.div['iid'],
                # 'date': listing.p.time['datetime'],
                'cost': gmoney(listing.ul.li.span.string.strip()),
                'webpage': listing.a['href'],
                # NOTE(review): the 'img' attribute looks suspect — an <img>
                # tag normally carries 'src'; confirm against live markup.
                'pic': gpic(listing.a.img['img']),
                'descr': listing.h3.a.string.strip(),
                # 'createdt': datetime.datetime.now().isoformat()
            }
            if not db.search(Query().ebaynum == rec["ebaynum"]):
                total_added += 1
                print("Adding ... ", total_added)
                db.insert(rec)
        except (AttributeError, KeyError):
            # Listings missing an expected element are skipped silently.
            pass
    return soup
def gmoney(amt):
    """Convert a price string such as "$5.99" to a float.

    Generalized to also accept thousands separators and surrounding
    whitespace (e.g. " $1,234.50 "), which the original choked on.

    Raises ValueError if the remaining text is not a valid number.
    """
    return float(amt.replace("$", "").replace(",", "").strip())
def gpic(ids):
    """Build the thumbnail URL for the first image code in *ids*.

    *ids* is a comma-separated list of "1:<code>" entries; only the
    first entry's code is used.
    """
    code = ids.split(",")[0].replace("1:", "")
    return "https://i.ebayimg.com/thumbs/images/m/%s-l225.jpg" % code
def make_excel(db):
    """Export every record in *db* to ebay.xlsx, one row per item.

    Bug fix: data columns now line up with the header row. The original
    wrote the five headers into columns 0-4 but the data into columns 0
    and 2-5 (a leftover gap from the commented-out "date" column), so
    the "Cost" header sat over the webpage links, and so on.
    """
    Headlines = ["EbayNum", "Cost", "Webpage", "Pic", "Desc"]
    workbook = xlsxwriter.Workbook('ebay.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.set_column(0, 0, 15)  # eBay item number
    worksheet.set_column(1, 1, 7)   # cost
    worksheet.set_column(2, 2, 10)  # webpage link
    worksheet.set_column(3, 3, 7)   # picture link
    worksheet.set_column(4, 4, 60)  # description
    # Header row.
    for col, title in enumerate(Headlines):
        worksheet.write(0, col, title)
    # One data row per stored record, starting under the headers.
    for row, item in enumerate(db.all(), start=1):
        worksheet.write(row, 0, item['ebaynum'])
        worksheet.write(row, 1, item['cost'])
        worksheet.write_url(row, 2, item['webpage'], string='Web Page')
        worksheet.write_url(row, 3, item['pic'], string="Picture")
        worksheet.write(row, 4, item['descr'])
    workbook.close()
if __name__ == "__main__":
    # Only start the crawl when run as a script, not when imported.
    main(url)
答案 0(得分:0)
您要找的不是 link 标签,而是一个 a 标签。比较你的:
# Demonstration: searching for a <link rel="next"> element fails on this
# page — eBay's pagination is an <a> anchor, not a <link> tag.
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.ebay.com/e/fashion/sunglasses-under-10?rt=nc&LH_ItemCondition=3')
soup = bs(r.content, 'lxml')
nextlink = soup.find("link", rel="next")  # never matches on this page
if nextlink:
print("found")
else:
print("not found")
与
# Demonstration: the same search against the <a> tag succeeds.
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.ebay.com/e/fashion/sunglasses-under-10?rt=nc&LH_ItemCondition=3')
soup = bs(r.content, 'lxml')
nextlink = soup.find("a", rel="next")  # matches the pagination anchor
if nextlink:
print("found")
else:
print("not found")
我更喜欢一个简洁的CSS属性选择器:
# CSS attribute selector: matches any tag carrying rel="next",
# regardless of the tag name.
nextlink = soup.select_one('[rel=next]')