尝试通过网络抓取Ebay,但无法将信息提取到EXCEL工作表中

时间:2019-08-11 21:24:51

标签: beautifulsoup python-3.7

试图从Ebay搜索页面中提取一些信息。脚本运行时,无法导出到Json文件。至少可以使用我告诉其创建的标题来创建EXCEL工作表。任何帮助,将不胜感激。经过进一步调查,BeautifulSoup不能引用“ iid”吗?

  

div class="lvpic pic img left" iid="173992820900"

 from bs4 import BeautifulSoup
 import datetime
 from tinydb import TinyDB, Query
 import urllib3
 import xlsxwriter

# Silence the warning urllib3 emits for unverified HTTPS requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Starting search page: eBay "sunglasses under 10" results, used condition.
url = 'https://www.ebay.com/e/fashion/sunglasses-under-10?rt=nc&LH_ItemCondition=3'
# Running count of records newly inserted into the database this run.
total_added = 0

def make_soup(url):
    """Fetch *url* with urllib3 and return the page parsed with lxml."""
    pool = urllib3.PoolManager()
    response = pool.request("GET", url)
    return BeautifulSoup(response.data, 'lxml')

def main(url):
    """Crawl eBay result pages starting at *url*, store listings in TinyDB,
    then export every stored record to an Excel workbook.

    Follows pagination until no next-page link is found.
    """
    global total_added
    db = TinyDB("db.json")

    while url:
        print("Web Page: ", url)
        soup = soup_process(url, db)
        # BUG FIX: the next-page link on this eBay page is an <a rel="next">
        # anchor, not a <link rel="next"> element, so find("link", ...) never
        # matched and only the first page was ever scraped.
        nextlink = soup.find("a", rel="next")

        url = nextlink['href'] if nextlink else False

    print("Added ", total_added)

    make_excel(db)

def soup_process(url, db):
    """Scrape one search-results page and insert unseen listings into *db*.

    Returns the parsed soup so the caller can locate the next-page link.
    """
    global total_added

    soup = make_soup(url)
    results = soup.find_all("li", class_="lvresult")

    for result in results:
        try:
            rec = {
                # NOTE(review): assumes the first <div> in the <li> carries the
                # item number in its "iid" attribute — confirm on live markup.
                'ebaynum': result.div['iid'],
                'cost': gmoney(result.ul.li.span.string.strip()),
                'webpage': result.a['href'],
                # NOTE(review): an "img" attribute on <img> is unusual; this
                # may need to be 'src' or a data-* attribute — verify.
                'pic': gpic(result.a.img['img']),
                'descr': result.h3.a.string.strip(),
            }

            # BUG FIX: this statement was outdented out of the try block
            # (a syntax error in the original).
            Result = Query()
            s1 = db.search(Result.ebaynum == rec["ebaynum"])

            if not s1:
                total_added += 1
                print("Adding ... ", total_added)
                db.insert(rec)

        except (AttributeError, KeyError):
            # Listings missing any expected element/attribute are skipped
            # on purpose (best-effort scrape).
            pass

    return soup

def gmoney(amt):
    """Parse a price string such as "$4.99" into a float.

    Strips the dollar sign and (generalization) thousands separators, so
    "$1,234.56" parses instead of raising ValueError.
    """
    return float(amt.replace("$", "").replace(",", ""))

def gpic(ids):
    """Build the eBay thumbnail URL for the first image id in *ids*.

    *ids* is a comma-separated list of entries like "1:<code>"; the "1:"
    prefix is stripped from the first entry to obtain the image code.
    """
    code = ids.split(",")[0].replace("1:", "")
    return f"https://i.ebayimg.com/thumbs/images/m/{code}-l225.jpg"

def make_excel(db):
    """Export every record in *db* to 'ebay.xlsx', one listing per row.

    Columns: EbayNum, Cost, Webpage (hyperlink), Pic (hyperlink), Desc.
    """
    headlines = ["EbayNum", "Cost", "Webpage", "Pic", "Desc"]

    workbook = xlsxwriter.Workbook('ebay.xlsx')
    worksheet = workbook.add_worksheet()

    # Column widths, aligned with the header order above.
    worksheet.set_column(0, 0, 15)  # EbayNum
    worksheet.set_column(1, 1, 7)   # Cost
    worksheet.set_column(2, 2, 10)  # Webpage
    worksheet.set_column(3, 3, 7)   # Pic
    worksheet.set_column(4, 4, 60)  # Desc

    for col, title in enumerate(headlines):
        worksheet.write(0, col, title)

    # BUG FIX: data was previously written to columns 0 and 2-5 (a leftover
    # from a commented-out date column), so values did not line up with the
    # headers written at columns 0-4.
    for row, item in enumerate(db.all(), start=1):
        worksheet.write(row, 0, item['ebaynum'])
        worksheet.write(row, 1, item['cost'])
        worksheet.write_url(row, 2, item['webpage'], string='Web Page')
        worksheet.write_url(row, 3, item['pic'], string="Picture")
        worksheet.write(row, 4, item['descr'])

    workbook.close()

main(url)

1 个答案:

答案 0 :(得分:0)

您要找的并不是 link 标签,而是一个 a 标签。比较下面两段代码:

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.ebay.com/e/fashion/sunglasses-under-10?rt=nc&LH_ItemCondition=3')
soup = bs(r.content, 'lxml')
# Looking for a <link rel="next"> element, as the question's code does —
# this prints "not found" because no such element exists on the page.
nextlink = soup.find("link", rel="next")

if nextlink:
    print("found")
else:
    print("not found")

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.ebay.com/e/fashion/sunglasses-under-10?rt=nc&LH_ItemCondition=3')
soup = bs(r.content, 'lxml')
# Looking for an <a rel="next"> anchor instead — this one prints "found",
# showing the pagination link is an anchor, not a <link> element.
nextlink = soup.find("a", rel="next")

if nextlink:
    print("found")
else:
    print("not found")

我更喜欢一个简洁的CSS属性选择器:

nextlink = soup.select_one('[rel=next]')