运行我的 Python 爬虫时遇到了问题:只要有任何一项数据缺失,它就会跳过整条记录。更具体地说,如果它没有在页面中找到名称,就不会再去处理地址、网站、电子邮件等字段,而是直接跳过该页面,不打印任何内容。我该如何修改,才能在某一项缺失时仍然打印其余各项?我已在下面贴出了带网址的抓取代码。希望能有解决办法。
以下是我尝试的内容:
import requests
from lxml import html
def Endpoint(address):
    """Download *address* and print one tuple of contact fields per block.

    Each ``div`` with class ``contact-details block dark`` is expected to
    supply three text nodes from ``p[1]``, one from ``p[3]``, four from
    ``p[2]`` and two link targets.  The fields are printed as a single
    tuple.

    Args:
        address: URL of the page to download.
    """
    response = requests.get(address)
    document = html.fromstring(response.text)
    for block in document.xpath('//div[@class="contact-details block dark"]'):
        try:
            # Run every query once, then pick the individual fields by index.
            first_para = block.xpath('.//p[1]/text()')
            third_para = block.xpath('.//p[3]/text()')
            second_para = block.xpath('.//p[2]/text()')
            links = block.xpath('.//p/a/@href')
            Metco = (
                first_para[0],
                third_para[0],
                first_para[1],
                first_para[2],
                second_para[0],
                second_para[1],
                second_para[2],
                second_para[3],
                links[0],
                links[1],
            )
            print(Metco)
        except:
            # Any failure above (for example an IndexError from a missing
            # field) lands here, so the whole block is skipped and nothing
            # is printed for it.
            continue
# Run the scraper against a single supplier-details page.
Endpoint("http://www.austrade.gov.au/SupplierDetails.aspx?ORGID=ORG8160044466&folderid=1736")
答案 0 :(得分:1)
如果为每一项设置默认值(例如 None),就可以解决此问题:这样当某一项超出索引范围时,就不会引发异常。
import requests
from lxml import html
def _item_or_none(items, index):
    """Return ``items[index]``, or ``None`` when *items* is too short."""
    return items[index] if len(items) > index else None


def Endpoint(address):
    """Print the contact details found on the page at *address*.

    Every ``div`` with class ``contact-details block dark`` yields one
    tuple ``(Name, Name1, Name2, Name3, Address, Address1, Address2,
    Address3, Web, Email)``.  A field the page does not provide is
    reported as ``None`` instead of discarding the whole record.

    Args:
        address: URL of the page to download.
    """
    page = requests.get(address)
    tree = html.fromstring(page.text)
    titles = tree.xpath('//div[@class="contact-details block dark"]')
    for title in titles:
        try:
            # Evaluate each XPath query once; every field below is then a
            # bounds-checked lookup into the matching result list.
            first_para = title.xpath('.//p[1]/text()')
            second_para = title.xpath('.//p[2]/text()')
            third_para = title.xpath('.//p[3]/text()')
            links = title.xpath('.//p/a/@href')

            Name = _item_or_none(first_para, 0)
            Name1 = _item_or_none(third_para, 0)
            Name2 = _item_or_none(first_para, 1)
            Name3 = _item_or_none(first_para, 2)
            Address = _item_or_none(second_para, 0)
            Address1 = _item_or_none(second_para, 1)
            Address2 = _item_or_none(second_para, 2)
            Address3 = _item_or_none(second_para, 3)
            # The guard must look at the link list itself (not the p[1]
            # text nodes) and at the exact index being read; otherwise a
            # page with fewer than two links still raises IndexError.
            # NOTE(review): the sample output shown with this answer has the
            # mailto: link first and the http: link second, so Web and Email
            # may be swapped for that page -- confirm before relying on them.
            Web = _item_or_none(links, 0)
            Email = _item_or_none(links, 1)

            Metco = (Name, Name1, Name2, Name3, Address, Address1,
                     Address2, Address3, Web, Email)
            print(Metco)
        except Exception as ex:
            # Best effort: report the problem and move on to the next block.
            print(ex)
# Run the scraper against a single supplier-details page.
Endpoint("http://www.austrade.gov.au/SupplierDetails.aspx?ORGID=ORG8160044466&folderid=1736")
结果
('公司名称:PIMS Group Pty Ltd', None, '电话:+61 7 4969 3900', '传真:+61 7 4969 3999', '43 Evans Avenue', 'North Mackay', 'QLD', '4740', 'mailto:admin@pims.net.au', 'http://www.pims.net.au')