url="http://www.dawsons.co.uk/finance-information/"
urls=[url] #stack of urls to scrape
visited=[url] #historic record of urls
while len(urls)>0:
try:
htmltext=urllib.urlopen(urls[0]).read()
except:
print urls[0]
soup=BeautifulSoup(htmltext)
urls.pop(0)
print len(urls)
for tag in soup.findAll('a',href=True):
tag['href']=urlparse.urljoin(url,tag['href'])
if url in tag['href'] and tag['href'] not in visited:
urls.append(tag['href'])
visited.append(tag['href'])
print visited
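# Possible speed-up for the crawl above (a sketch, not part of the original
# flow): most of the parse time goes into building a tree for the whole page
# just to pull out anchors. A SoupStrainer restricts parsing to <a> tags so
# BeautifulSoup can skip everything else.
from bs4 import SoupStrainer

sample_html = "<html><body><a href='/a'>A</a><p>long text</p><a href='/b'>B</a></body></html>"

# Only <a href=...> tags are parsed; the rest of the markup is discarded early.
only_links = SoupStrainer('a', href=True)
link_soup = BeautifulSoup(sample_html, 'html.parser', parse_only=only_links)
for tag in link_soup.find_all('a', href=True):
    print(tag['href'])  # prints /a, then /b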
# In[19]:
# Fetch the text of each visited page and combine it all into one document
text_ang = ""
for link in visited:
print link
#link_href='"'+link+'"'
#print link_href
#print type(link)
try:
page = urllib2.urlopen(link)
soup = BeautifulSoup(page)
text_ang= text_ang + str(soup.get_text().replace('\n',' ').replace('\t',' ').replace('\r',' ').encode('utf-8'))
except:
print link + " -not found"
# soup = BeautifulSoup(page)
# text_ang= str(soup.get_text().replace('\n',' ').replace('\t',' ').replace('\r',' ').encode('utf-8'))
print("********************************************\n", text_ang, "\n********************************************\n")
print("############################################\n",visible_texts, "\n############################################\n")
# In[20]:
# Count occurrences of specific phrases in the combined text
counter = Counter()
for phrase in ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']:
    # re.escape keeps any regex metacharacters in a phrase from being interpreted.
    counter[phrase] += len(re.findall(r'\b%s\b' % re.escape(phrase), text_ang))
print(counter)
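# An alternative sketch (not in the original): one compiled pattern with
# alternation scans text_ang in a single pass instead of once per phrase,
# and re.IGNORECASE also catches case variants such as "apr".
phrases = ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']
pattern = re.compile(r'\b(%s)\b' % '|'.join(re.escape(p) for p in phrases), re.IGNORECASE)
one_pass_counts = Counter(m.group(0).lower() for m in pattern.finditer(text_ang))
print(one_pass_counts)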
This is the code using BeautifulSoup, but it runs very slowly: extracting the text and images takes a long time. Is there a way to optimize the code so that it runs faster?
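Two things usually dominate the runtime here: network latency (each page is fetched serially over a fresh connection) and HTML parsing. Below is a possible rework, sketched under the assumption that the third-party `requests` and `lxml` packages are installed; the helper name `fetch_text` and the worker count are illustrative, not from the original code.

from concurrent.futures import ThreadPoolExecutor

import requests

session = requests.Session()  # connection pooling: reuses TCP connections across requests

def fetch_text(link):
    """Fetch one page and return its visible text, or '' on failure."""
    try:
        response = session.get(link, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        print(link + " -not found")
        return ""
    # lxml is a much faster parser backend than html.parser.
    soup = BeautifulSoup(response.text, "lxml")
    return soup.get_text(separator=" ")

# The work is network-bound, so threads overlap the waiting time well.
with ThreadPoolExecutor(max_workers=8) as pool:
    texts = pool.map(fetch_text, visited)

text_ang = " ".join(texts)

If adding dependencies is not an option, even just switching BeautifulSoup's parser argument from 'html.parser' to 'lxml' typically gives a noticeable speed-up on its own.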