如何在Python中抓取许多动态URL

时间:2016-02-24 18:27:09

标签: python web-scraping beautifulsoup

我想逐个抓取动态网址。我的做法是:先从页面上所有的 href 中提取出 URL,然后再去抓取这些 URL 对应的页面。我尝试的代码如下:

from bs4 import BeautifulSoup
import urllib.request 
import re

# Download the search-results page and parse it.
r = urllib.request.urlopen('http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware')
soup = BeautifulSoup(r, "html.parser")
# Keep only the anchors whose href points at a category listing page.
links = soup.find_all("a", href=re.compile(r"expexhibitorlist\.aspx\?categoryno=[0-9]+"))

# The matched hrefs are relative, so prefix each one with the site root.
linksfromcategories = ([link["href"] for link in links])
string = "http://i.cantonfair.org.cn/en/"

str1 = [string + x for x in linksfromcategories]
# NOTE(review): every URL is merged into ONE newline-separated string here,
# and that whole string is then handed to urlopen() as if it were a single
# URL. This is the statement that raises in the traceback below.
fulllinksfromcategories = '\n'.join(str1)
lfc = urllib.request.urlopen(fulllinksfromcategories).read()
soup2 = BeautifulSoup(lfc,"html.parser") 
print(soup2)

但它给了我以下错误:

  Traceback (most recent call last):
  File "D:\python\scarpepython.py", line 50, in <module>
  lfc = urllib.request.urlopen(fulllinksfromcategories).read()
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 162, in urlopen
  return opener.open(url, data, timeout)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 465, in open
  response = self._open(req, data)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 483, in _open
'_open', req)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 443, in _call_chain
result = func(*args)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1268, in http_open
  return self.do_open(http.client.HTTPConnection, req)
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1243, in do_open
  r = h.getresponse()
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1174, in getresponse
response.begin()
  File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 282, in begin
version, status, reason = self._read_status()
 File "C:\Users\amanp\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 264, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: 

2 个答案:

答案 0 :(得分:1)

下面是一种使用 lxml 的可选方案。

from sys import exit
from pprint import pprint

import lxml.html
import requests
import re

# Search-results URL template; only the page number varies between requests.
base_url = "http://i.cantonfair.org.cn/en/ExpExhibitorList.aspx?k=glassware&page={page}"

# Fetch the first page once, just to read the pager and learn the page count.
url = base_url.format(page=1)
doc = requests.get(url)
root = lxml.html.fromstring(doc.text)

# The pager text is expected to look like "Pages 1/23"; capture the total
# after the slash. A plain raw string is used because the Python 2 style
# ur"..." prefix is a SyntaxError on Python 3.
regexp_total_pages = re.compile(r"Pages\s\d\/(\d+)")
text_total_pages = root.xpath('//*[@id="AspNetPager1"]/div[1]/text()')[0].strip()
total_pages = int(regexp_total_pages.match(text_total_pages).group(1))

# One inner list of hrefs per result page (kept nested on purpose).
all_links = list()

for i in range(1, total_pages + 1):
    url = base_url.format(page=i)
    doc = requests.get(url)
    root = lxml.html.fromstring(doc.text)
    # Collect the href of every link found under an <h3> inside the form.
    all_links.append(root.xpath('//*[@id="form1"]/div[*]/div[*]/h3/a/@href'))

pprint(all_links)

输出:

[['expCompany.aspx?corpid=0776011226',
  'expCompany.aspx?corpid=0767408756',
  'expCompany.aspx?corpid=0768210055',
  'expCompany.aspx?corpid=0797720568',
  'expCompany.aspx?corpid=0732708199',
  'expCompany.aspx?corpid=0793210033',
  'expCompany.aspx?corpid=0732106474',
  'expCompany.aspx?corpid=0758010034',
  'expCompany.aspx?corpid=0971067386',
  'expCompany.aspx?corpid=0740321671'],
 ['expCompany.aspx?corpid=0778019678',
  'expCompany.aspx?corpid=0856547211',
  'expCompany.aspx?corpid=0786118468',
  'expCompany.aspx?corpid=0836547578',
  'expCompany.aspx?corpid=0898829143',
  'expCompany.aspx?corpid=0785822466',
  'expCompany.aspx?corpid=0886647641',
  'expCompany.aspx?corpid=0965278225',
  'expCompany.aspx?corpid=0889552449',
  'expCompany.aspx?corpid=0757118156'],
 ['expCompany.aspx?corpid=0800629095',
  'expCompany.aspx?corpid=0797100877',
  'expCompany.aspx?corpid=0791001566',
  'expCompany.aspx?corpid=0955274359',
  'expCompany.aspx?corpid=0789803409',
  'expCompany.aspx?corpid=0769413578',
  'expCompany.aspx?corpid=0712314777',
  'expCompany.aspx?corpid=0873048367',
  'expCompany.aspx?corpid=0716520166',
  'expCompany.aspx?corpid=1444012375'],
 ['expCompany.aspx?corpid=1020485398',
  'expCompany.aspx?corpid=1218502245',
  'expCompany.aspx?corpid=1197393480',
  'expCompany.aspx?corpid=1366309374',
  'expCompany.aspx?corpid=1204799012',
  'expCompany.aspx?corpid=1078880722',
  'expCompany.aspx?corpid=1367905785',
  'expCompany.aspx?corpid=1427517382',
  'expCompany.aspx?corpid=1377308235',
  'expCompany.aspx?corpid=1437717128'],
 ['expCompany.aspx?corpid=1361609356',
  'expCompany.aspx?corpid=1532524260',
  'expCompany.aspx?corpid=1512425129',
  'expCompany.aspx?corpid=1371110608',
  'expCompany.aspx?corpid=1021582521',
  'expCompany.aspx?corpid=0829323712',
  'expCompany.aspx?corpid=0756508698',
  'expCompany.aspx?corpid=0781315922',
  'expCompany.aspx?corpid=0850325858',
  'expCompany.aspx?corpid=0713405337'],
 ['expCompany.aspx?corpid=0895550135',
  'expCompany.aspx?corpid=0736604457',
  'expCompany.aspx?corpid=0761821937',
  'expCompany.aspx?corpid=0853755897',
  'expCompany.aspx?corpid=0807455302',
  'expCompany.aspx?corpid=0763919269',
  'expCompany.aspx?corpid=0736104221',
  'expCompany.aspx?corpid=0796616555',
  'expCompany.aspx?corpid=0804229227',
  'expCompany.aspx?corpid=0746304700'],
 ['expCompany.aspx?corpid=0839047328',
  'expCompany.aspx?corpid=0875628420',
  'expCompany.aspx?corpid=0869651030',
  'expCompany.aspx?corpid=0838653323',
  'expCompany.aspx?corpid=0779107569',
  'expCompany.aspx?corpid=0748806674',
  'expCompany.aspx?corpid=0736602141',
  'expCompany.aspx?corpid=0722715458',
  'expCompany.aspx?corpid=0782910676',
  'expCompany.aspx?corpid=0798114121'],
 ['expCompany.aspx?corpid=0830450037',
  'expCompany.aspx?corpid=0723700490',
  'expCompany.aspx?corpid=0889823692',
  'expCompany.aspx?corpid=0984073042',
  'expCompany.aspx?corpid=0726719753',
  'expCompany.aspx?corpid=0742406942',
  'expCompany.aspx?corpid=0742119461',
  'expCompany.aspx?corpid=0728315987',
  'expCompany.aspx?corpid=0818248812',
  'expCompany.aspx?corpid=0750419352'],
 ['expCompany.aspx?corpid=0982275722',
  'expCompany.aspx?corpid=0815756641',
  'expCompany.aspx?corpid=0712604536',
  'expCompany.aspx?corpid=0798617576',
  'expCompany.aspx?corpid=0734217566',
  'expCompany.aspx?corpid=0878728894',
  'expCompany.aspx?corpid=0772422523',
  'expCompany.aspx?corpid=0784607985',
  'expCompany.aspx?corpid=0786204936',
  'expCompany.aspx?corpid=0886423907'],
 ['expCompany.aspx?corpid=0789300431',
  'expCompany.aspx?corpid=0779921604',
  'expCompany.aspx?corpid=0794403082',
  'expCompany.aspx?corpid=0769111680',
  'expCompany.aspx?corpid=0746606839',
  'expCompany.aspx?corpid=0896726003',
  'expCompany.aspx?corpid=0886728390',
  'expCompany.aspx?corpid=0841756743',
  'expCompany.aspx?corpid=1010680461',
  'expCompany.aspx?corpid=0837456503'],
 ['expCompany.aspx?corpid=0735317945',
  'expCompany.aspx?corpid=0858556012',
  'expCompany.aspx?corpid=0883227862',
  'expCompany.aspx?corpid=0802151577',
  'expCompany.aspx?corpid=0725403915',
  'expCompany.aspx?corpid=0773118307',
  'expCompany.aspx?corpid=0977967839',
  'expCompany.aspx?corpid=0889257398',
  'expCompany.aspx?corpid=0773003774',
  'expCompany.aspx?corpid=0741211862'],
 ['expCompany.aspx?corpid=0944767300',
  'expCompany.aspx?corpid=0766703225',
  'expCompany.aspx?corpid=0807623222',
  'expCompany.aspx?corpid=0754416485',
  'expCompany.aspx?corpid=0716414765',
  'expCompany.aspx?corpid=0764603066',
  'expCompany.aspx?corpid=0757110589',
  'expCompany.aspx?corpid=0800248632',
  'expCompany.aspx?corpid=0747902779',
  'expCompany.aspx?corpid=0738619647'],
 ['expCompany.aspx?corpid=1098582416',
  'expCompany.aspx?corpid=0909669961',
  'expCompany.aspx?corpid=0862829627',
  'expCompany.aspx?corpid=0892328884',
  'expCompany.aspx?corpid=0886729635',
  'expCompany.aspx?corpid=0724805261',
  'expCompany.aspx?corpid=0877655294',
  'expCompany.aspx?corpid=0835853958',
  'expCompany.aspx?corpid=0737821957',
  'expCompany.aspx?corpid=0785019255'],
 ['expCompany.aspx?corpid=0873828585',
  'expCompany.aspx?corpid=0735401884',
  'expCompany.aspx?corpid=0927058069',
  'expCompany.aspx?corpid=0794816876',
  'expCompany.aspx?corpid=0721211392',
  'expCompany.aspx?corpid=0741602341',
  'expCompany.aspx?corpid=0760906105',
  'expCompany.aspx?corpid=0904473659',
  'expCompany.aspx?corpid=0711614568',
  'expCompany.aspx?corpid=0753503530'],
 ['expCompany.aspx?corpid=0774108002',
  'expCompany.aspx?corpid=0845328722',
  'expCompany.aspx?corpid=0823848403',
  'expCompany.aspx?corpid=0876029511',
  'expCompany.aspx?corpid=0886827914',
  'expCompany.aspx?corpid=0712712280',
  'expCompany.aspx?corpid=0833854881',
  'expCompany.aspx?corpid=0746216867',
  'expCompany.aspx?corpid=0774704214',
  'expCompany.aspx?corpid=0730516488'],
 ['expCompany.aspx?corpid=0716607064',
  'expCompany.aspx?corpid=0758917403',
  'expCompany.aspx?corpid=0763702256',
  'expCompany.aspx?corpid=0721303394',
  'expCompany.aspx?corpid=0828647452',
  'expCompany.aspx?corpid=0771805641',
  'expCompany.aspx?corpid=0741722489',
  'expCompany.aspx?corpid=0980867582',
  'expCompany.aspx?corpid=0790809611',
  'expCompany.aspx?corpid=0714917484'],
 ['expCompany.aspx?corpid=0790402155',
  'expCompany.aspx?corpid=0710118558',
  'expCompany.aspx?corpid=0864455955',
  'expCompany.aspx?corpid=0784706276',
  'expCompany.aspx?corpid=0897623416',
  'expCompany.aspx?corpid=0821453137',
  'expCompany.aspx?corpid=0754917280',
  'expCompany.aspx?corpid=0724600646',
  'expCompany.aspx?corpid=0764211415',
  'expCompany.aspx?corpid=0735008307'],
 ['expCompany.aspx?corpid=0795909343',
  'expCompany.aspx?corpid=0850830043',
  'expCompany.aspx?corpid=0970778277',
  'expCompany.aspx?corpid=1075781404',
  'expCompany.aspx?corpid=1252802513',
  'expCompany.aspx?corpid=1236901616',
  'expCompany.aspx?corpid=1435215908',
  'expCompany.aspx?corpid=1469712283',
  'expCompany.aspx?corpid=1439615100',
  'expCompany.aspx?corpid=1245501009'],
 ['expCompany.aspx?corpid=0901974362',
  'expCompany.aspx?corpid=1487117816',
  'expCompany.aspx?corpid=1058881186',
  'expCompany.aspx?corpid=0809557305',
  'expCompany.aspx?corpid=1265998039',
  'expCompany.aspx?corpid=1188093431',
  'expCompany.aspx?corpid=0995572026',
  'expCompany.aspx?corpid=1036184837',
  'expCompany.aspx?corpid=0990573086',
  'expCompany.aspx?corpid=1464212531'],
 ['expCompany.aspx?corpid=0858351382',
  'expCompany.aspx?corpid=1348806571',
  'expCompany.aspx?corpid=0822452086',
  'expCompany.aspx?corpid=1428413902',
  'expCompany.aspx?corpid=0879752062',
  'expCompany.aspx?corpid=1369405760',
  'expCompany.aspx?corpid=1256000612',
  'expCompany.aspx?corpid=1037680042',
  'expCompany.aspx?corpid=1062381570',
  'expCompany.aspx?corpid=1461915811'],
 ['expCompany.aspx?corpid=1373808159',
  'expCompany.aspx?corpid=1027382040',
  'expCompany.aspx?corpid=1191393458',
  'expCompany.aspx?corpid=1133389590',
  'expCompany.aspx?corpid=0762910036',
  'expCompany.aspx?corpid=1399206654',
  'expCompany.aspx?corpid=1253598637',
  'expCompany.aspx?corpid=1128889405',
  'expCompany.aspx?corpid=1082384190',
  'expCompany.aspx?corpid=1077881359'],
 ['expCompany.aspx?corpid=1584325065',
  'expCompany.aspx?corpid=1131692156',
  'expCompany.aspx?corpid=1587322969',
  'expCompany.aspx?corpid=1251100353',
  'expCompany.aspx?corpid=1115590386',
  'expCompany.aspx?corpid=1541424572',
  'expCompany.aspx?corpid=1137393378',
  'expCompany.aspx?corpid=1069988131',
  'expCompany.aspx?corpid=1392806069',
  'expCompany.aspx?corpid=0766210029'],
 ['expCompany.aspx?corpid=1143394259',
  'expCompany.aspx?corpid=1561819111',
  'expCompany.aspx?corpid=1349307520']]

答案 1 :(得分:0)

在您的代码中,str1 是一个 URL 列表。您用换行符把这个列表拼接成了一个字符串,然后试图把拼接后的整个字符串当作一个 URL 去打开,这当然行不通。

相反,您应该循环遍历提取到的 URL,逐个访问:

# Turn every relative href into an absolute URL by prefixing the site root.
linksfromcategories = [string + x for x in linksfromcategories]
# Request the URLs one at a time instead of joining them into one string.
for link in linksfromcategories:
    print(link)
    # The context manager closes each HTTP response once it has been read,
    # so connections are not left open while the loop keeps going.
    with urllib.request.urlopen(link) as response:
        lfc = response.read()
    soup2 = BeautifulSoup(lfc, "html.parser")
    print(soup2)