我想逐个抓取动态网址。
我的做法是:先从页面上所有的 href
中提取出 URL,然后再去抓取这些 URL 对应的页面。
下面是我尝试的代码:
from bs4 import BeautifulSoup
import urllib.request
import re
# Fetch and parse the exhibitor-list page for the query k=glassware.
r = urllib.request.urlopen('http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware')
soup = BeautifulSoup(r, "html.parser")
# Keep only the <a> tags whose href matches the "categoryno=<digits>" pattern.
links = soup.find_all("a", href=re.compile(r"expexhibitorlist\.aspx\?categoryno=[0-9]+"))
linksfromcategories = ([link["href"] for link in links])
# Prefix every href with the site base path (presumably the hrefs are
# relative to /en/ -- verify against the page markup).
string = "http://i.cantonfair.org.cn/en/"
str1 = [string + x for x in linksfromcategories]
# NOTE(review): this collapses the whole list of URLs into ONE string, with
# the individual URLs separated by newline characters.
fulllinksfromcategories = '\n'.join(str1)
# NOTE(review): the newline-joined string is handed to urlopen() as if it were
# a single URL; this is the line the traceback below points at. Each URL in
# str1 needs its own urlopen() call instead.
lfc = urllib.request.urlopen(fulllinksfromcategories).read()
soup2 = BeautifulSoup(lfc,"html.parser")
print(soup2)
但它给了我以下错误:
Traceback (most recent call last):
File "D:\python\scarpepython.py", line 50, in <module>
lfc = urllib.request.urlopen(fulllinksfromcategories).read()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
response = self._open(req, data)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
r = h.getresponse()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine:
答案 0 :(得分:1)
下面是使用 lxml(配合 requests)的一种可选方案。
from sys import exit
from pprint import pprint
import lxml.html
import requests
import re
# Collect the exhibitor links from every result page of the k=glassware query.
#
# Step 1: load page 1 and read the total page count from the pager element.
url = 'http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page=1'
doc = requests.get(url)
root = lxml.html.fromstring(doc.text)

# Plain raw string: the former ur"..." prefix only exists in Python 2 and is a
# SyntaxError on Python 3. "\d+" for the current page keeps the pattern valid
# once the current page number has more than one digit.
# Presumably the pager text looks like "Pages 1/23 ..." -- confirm on the site.
regexp_total_pages = re.compile(r"Pages\s\d+\/(\d+)")
text_total_pages = root.xpath('//*[@id="AspNetPager1"]/div[1]/text()')[0].strip()
total_pages = int(re.match(regexp_total_pages, text_total_pages).group(1))

# Step 2: request each page in turn and pull the hrefs out of the <h3><a> tags.
all_links = list()
for i in range(1, total_pages + 1):
    url = "http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page={page}".format(page=i)
    doc = requests.get(url)
    root = lxml.html.fromstring(doc.text)
    # append() (not extend()) keeps one sub-list of hrefs per result page.
    all_links.append(root.xpath('//*[@id="form1"]/div[*]/div[*]/h3/a/@href'))

# Printed once, after all pages have been collected.
pprint(all_links)
输出:
[['expCompany.aspx?corpid=0776011226',
'expCompany.aspx?corpid=0767408756',
'expCompany.aspx?corpid=0768210055',
'expCompany.aspx?corpid=0797720568',
'expCompany.aspx?corpid=0732708199',
'expCompany.aspx?corpid=0793210033',
'expCompany.aspx?corpid=0732106474',
'expCompany.aspx?corpid=0758010034',
'expCompany.aspx?corpid=0971067386',
'expCompany.aspx?corpid=0740321671'],
['expCompany.aspx?corpid=0778019678',
'expCompany.aspx?corpid=0856547211',
'expCompany.aspx?corpid=0786118468',
'expCompany.aspx?corpid=0836547578',
'expCompany.aspx?corpid=0898829143',
'expCompany.aspx?corpid=0785822466',
'expCompany.aspx?corpid=0886647641',
'expCompany.aspx?corpid=0965278225',
'expCompany.aspx?corpid=0889552449',
'expCompany.aspx?corpid=0757118156'],
['expCompany.aspx?corpid=0800629095',
'expCompany.aspx?corpid=0797100877',
'expCompany.aspx?corpid=0791001566',
'expCompany.aspx?corpid=0955274359',
'expCompany.aspx?corpid=0789803409',
'expCompany.aspx?corpid=0769413578',
'expCompany.aspx?corpid=0712314777',
'expCompany.aspx?corpid=0873048367',
'expCompany.aspx?corpid=0716520166',
'expCompany.aspx?corpid=1444012375'],
['expCompany.aspx?corpid=1020485398',
'expCompany.aspx?corpid=1218502245',
'expCompany.aspx?corpid=1197393480',
'expCompany.aspx?corpid=1366309374',
'expCompany.aspx?corpid=1204799012',
'expCompany.aspx?corpid=1078880722',
'expCompany.aspx?corpid=1367905785',
'expCompany.aspx?corpid=1427517382',
'expCompany.aspx?corpid=1377308235',
'expCompany.aspx?corpid=1437717128'],
['expCompany.aspx?corpid=1361609356',
'expCompany.aspx?corpid=1532524260',
'expCompany.aspx?corpid=1512425129',
'expCompany.aspx?corpid=1371110608',
'expCompany.aspx?corpid=1021582521',
'expCompany.aspx?corpid=0829323712',
'expCompany.aspx?corpid=0756508698',
'expCompany.aspx?corpid=0781315922',
'expCompany.aspx?corpid=0850325858',
'expCompany.aspx?corpid=0713405337'],
['expCompany.aspx?corpid=0895550135',
'expCompany.aspx?corpid=0736604457',
'expCompany.aspx?corpid=0761821937',
'expCompany.aspx?corpid=0853755897',
'expCompany.aspx?corpid=0807455302',
'expCompany.aspx?corpid=0763919269',
'expCompany.aspx?corpid=0736104221',
'expCompany.aspx?corpid=0796616555',
'expCompany.aspx?corpid=0804229227',
'expCompany.aspx?corpid=0746304700'],
['expCompany.aspx?corpid=0839047328',
'expCompany.aspx?corpid=0875628420',
'expCompany.aspx?corpid=0869651030',
'expCompany.aspx?corpid=0838653323',
'expCompany.aspx?corpid=0779107569',
'expCompany.aspx?corpid=0748806674',
'expCompany.aspx?corpid=0736602141',
'expCompany.aspx?corpid=0722715458',
'expCompany.aspx?corpid=0782910676',
'expCompany.aspx?corpid=0798114121'],
['expCompany.aspx?corpid=0830450037',
'expCompany.aspx?corpid=0723700490',
'expCompany.aspx?corpid=0889823692',
'expCompany.aspx?corpid=0984073042',
'expCompany.aspx?corpid=0726719753',
'expCompany.aspx?corpid=0742406942',
'expCompany.aspx?corpid=0742119461',
'expCompany.aspx?corpid=0728315987',
'expCompany.aspx?corpid=0818248812',
'expCompany.aspx?corpid=0750419352'],
['expCompany.aspx?corpid=0982275722',
'expCompany.aspx?corpid=0815756641',
'expCompany.aspx?corpid=0712604536',
'expCompany.aspx?corpid=0798617576',
'expCompany.aspx?corpid=0734217566',
'expCompany.aspx?corpid=0878728894',
'expCompany.aspx?corpid=0772422523',
'expCompany.aspx?corpid=0784607985',
'expCompany.aspx?corpid=0786204936',
'expCompany.aspx?corpid=0886423907'],
['expCompany.aspx?corpid=0789300431',
'expCompany.aspx?corpid=0779921604',
'expCompany.aspx?corpid=0794403082',
'expCompany.aspx?corpid=0769111680',
'expCompany.aspx?corpid=0746606839',
'expCompany.aspx?corpid=0896726003',
'expCompany.aspx?corpid=0886728390',
'expCompany.aspx?corpid=0841756743',
'expCompany.aspx?corpid=1010680461',
'expCompany.aspx?corpid=0837456503'],
['expCompany.aspx?corpid=0735317945',
'expCompany.aspx?corpid=0858556012',
'expCompany.aspx?corpid=0883227862',
'expCompany.aspx?corpid=0802151577',
'expCompany.aspx?corpid=0725403915',
'expCompany.aspx?corpid=0773118307',
'expCompany.aspx?corpid=0977967839',
'expCompany.aspx?corpid=0889257398',
'expCompany.aspx?corpid=0773003774',
'expCompany.aspx?corpid=0741211862'],
['expCompany.aspx?corpid=0944767300',
'expCompany.aspx?corpid=0766703225',
'expCompany.aspx?corpid=0807623222',
'expCompany.aspx?corpid=0754416485',
'expCompany.aspx?corpid=0716414765',
'expCompany.aspx?corpid=0764603066',
'expCompany.aspx?corpid=0757110589',
'expCompany.aspx?corpid=0800248632',
'expCompany.aspx?corpid=0747902779',
'expCompany.aspx?corpid=0738619647'],
['expCompany.aspx?corpid=1098582416',
'expCompany.aspx?corpid=0909669961',
'expCompany.aspx?corpid=0862829627',
'expCompany.aspx?corpid=0892328884',
'expCompany.aspx?corpid=0886729635',
'expCompany.aspx?corpid=0724805261',
'expCompany.aspx?corpid=0877655294',
'expCompany.aspx?corpid=0835853958',
'expCompany.aspx?corpid=0737821957',
'expCompany.aspx?corpid=0785019255'],
['expCompany.aspx?corpid=0873828585',
'expCompany.aspx?corpid=0735401884',
'expCompany.aspx?corpid=0927058069',
'expCompany.aspx?corpid=0794816876',
'expCompany.aspx?corpid=0721211392',
'expCompany.aspx?corpid=0741602341',
'expCompany.aspx?corpid=0760906105',
'expCompany.aspx?corpid=0904473659',
'expCompany.aspx?corpid=0711614568',
'expCompany.aspx?corpid=0753503530'],
['expCompany.aspx?corpid=0774108002',
'expCompany.aspx?corpid=0845328722',
'expCompany.aspx?corpid=0823848403',
'expCompany.aspx?corpid=0876029511',
'expCompany.aspx?corpid=0886827914',
'expCompany.aspx?corpid=0712712280',
'expCompany.aspx?corpid=0833854881',
'expCompany.aspx?corpid=0746216867',
'expCompany.aspx?corpid=0774704214',
'expCompany.aspx?corpid=0730516488'],
['expCompany.aspx?corpid=0716607064',
'expCompany.aspx?corpid=0758917403',
'expCompany.aspx?corpid=0763702256',
'expCompany.aspx?corpid=0721303394',
'expCompany.aspx?corpid=0828647452',
'expCompany.aspx?corpid=0771805641',
'expCompany.aspx?corpid=0741722489',
'expCompany.aspx?corpid=0980867582',
'expCompany.aspx?corpid=0790809611',
'expCompany.aspx?corpid=0714917484'],
['expCompany.aspx?corpid=0790402155',
'expCompany.aspx?corpid=0710118558',
'expCompany.aspx?corpid=0864455955',
'expCompany.aspx?corpid=0784706276',
'expCompany.aspx?corpid=0897623416',
'expCompany.aspx?corpid=0821453137',
'expCompany.aspx?corpid=0754917280',
'expCompany.aspx?corpid=0724600646',
'expCompany.aspx?corpid=0764211415',
'expCompany.aspx?corpid=0735008307'],
['expCompany.aspx?corpid=0795909343',
'expCompany.aspx?corpid=0850830043',
'expCompany.aspx?corpid=0970778277',
'expCompany.aspx?corpid=1075781404',
'expCompany.aspx?corpid=1252802513',
'expCompany.aspx?corpid=1236901616',
'expCompany.aspx?corpid=1435215908',
'expCompany.aspx?corpid=1469712283',
'expCompany.aspx?corpid=1439615100',
'expCompany.aspx?corpid=1245501009'],
['expCompany.aspx?corpid=0901974362',
'expCompany.aspx?corpid=1487117816',
'expCompany.aspx?corpid=1058881186',
'expCompany.aspx?corpid=0809557305',
'expCompany.aspx?corpid=1265998039',
'expCompany.aspx?corpid=1188093431',
'expCompany.aspx?corpid=0995572026',
'expCompany.aspx?corpid=1036184837',
'expCompany.aspx?corpid=0990573086',
'expCompany.aspx?corpid=1464212531'],
['expCompany.aspx?corpid=0858351382',
'expCompany.aspx?corpid=1348806571',
'expCompany.aspx?corpid=0822452086',
'expCompany.aspx?corpid=1428413902',
'expCompany.aspx?corpid=0879752062',
'expCompany.aspx?corpid=1369405760',
'expCompany.aspx?corpid=1256000612',
'expCompany.aspx?corpid=1037680042',
'expCompany.aspx?corpid=1062381570',
'expCompany.aspx?corpid=1461915811'],
['expCompany.aspx?corpid=1373808159',
'expCompany.aspx?corpid=1027382040',
'expCompany.aspx?corpid=1191393458',
'expCompany.aspx?corpid=1133389590',
'expCompany.aspx?corpid=0762910036',
'expCompany.aspx?corpid=1399206654',
'expCompany.aspx?corpid=1253598637',
'expCompany.aspx?corpid=1128889405',
'expCompany.aspx?corpid=1082384190',
'expCompany.aspx?corpid=1077881359'],
['expCompany.aspx?corpid=1584325065',
'expCompany.aspx?corpid=1131692156',
'expCompany.aspx?corpid=1587322969',
'expCompany.aspx?corpid=1251100353',
'expCompany.aspx?corpid=1115590386',
'expCompany.aspx?corpid=1541424572',
'expCompany.aspx?corpid=1137393378',
'expCompany.aspx?corpid=1069988131',
'expCompany.aspx?corpid=1392806069',
'expCompany.aspx?corpid=0766210029'],
['expCompany.aspx?corpid=1143394259',
'expCompany.aspx?corpid=1561819111',
'expCompany.aspx?corpid=1349307520']]
答案 1 :(得分:0)
str1
是一个网址列表。您用换行符把这个列表拼接成了一个字符串,然后试图把拼接后的整个字符串当作一个网址打开,这当然行不通。
正确的做法是循环遍历提取到的 URL,逐个打开:
# Prefix each extracted href with the base path held in `string`, then fetch
# and parse the resulting URLs one at a time.
linksfromcategories = [string + x for x in linksfromcategories]
for link in linksfromcategories:
    print(link)
    # One request per URL; the with-block closes the HTTP response as soon as
    # its body has been read instead of leaving the connection open.
    with urllib.request.urlopen(link) as response:
        lfc = response.read()
    soup2 = BeautifulSoup(lfc, "html.parser")
    print(soup2)