url="http://www.dawsons.co.uk/finance-information/"
urls=[url] #stack of urls to scrape
visited=[url] #historic record of urls
while len(urls)>0:
try:
htmltext=urllib.urlopen(urls[0]).read()
except:
print urls[0]
soup=BeautifulSoup(htmltext)
urls.pop(0)
print len(urls)
for tag in soup.findAll('a',href=True):
tag['href']=urlparse.urljoin(url,tag['href'])
if url in tag['href'] and tag['href'] not in visited:
urls.append(tag['href'])
visited.append(tag['href'])
print visited
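# Possible speed-up for the crawl above (a sketch, not part of the original
# flow): most of the parse time goes into building a tree for the whole page
# just to pull out anchors. A SoupStrainer restricts parsing to <a> tags so
# BeautifulSoup can skip everything else.
from bs4 import SoupStrainer

sample_html = "<html><body><a href='/a'>A</a><p>long text</p><a href='/b'>B</a></body></html>"

# Only <a href=...> tags are parsed; the rest of the markup is discarded early.
only_links = SoupStrainer('a', href=True)
link_soup = BeautifulSoup(sample_html, 'html.parser', parse_only=only_links)
for tag in link_soup.find_all('a', href=True):
    print(tag['href'])  # prints /a, then /b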
# In[19]:
# Fetch the text of each visited page and combine it all into one document
text_ang = ""
for link in visited:
print link
#link_href='"'+link+'"'
#print link_href
#print type(link)
try:
page = urllib2.urlopen(link)
soup = BeautifulSoup(page)
text_ang= text_ang + str(soup.get_text().replace('\n',' ').replace('\t',' ').replace('\r',' ').encode('utf-8'))
except:
print link + " -not found"
# soup = BeautifulSoup(page)
# text_ang= str(soup.get_text().replace('\n',' ').replace('\t',' ').replace('\r',' ').encode('utf-8'))
print("********************************************\n", text_ang, "\n********************************************\n")
print("############################################\n",visible_texts, "\n############################################\n")
# In[20]:
# Count occurrences of specific phrases in the combined text
counter = Counter()
for phrase in ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']:
    # re.escape keeps any regex metacharacters in a phrase from being interpreted.
    counter[phrase] += len(re.findall(r'\b%s\b' % re.escape(phrase), text_ang))
print(counter)
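# An alternative sketch (not in the original): one compiled pattern with
# alternation scans text_ang in a single pass instead of once per phrase,
# and re.IGNORECASE also catches case variants such as "apr".
phrases = ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']
pattern = re.compile(r'\b(%s)\b' % '|'.join(re.escape(p) for p in phrases), re.IGNORECASE)
one_pass_counts = Counter(m.group(0).lower() for m in pattern.finditer(text_ang))
print(one_pass_counts)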
This is the code using BeautifulSoup, but it runs very slowly: extracting the text and images takes a long time. Is there a way to optimize the code so that it runs faster?
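Two things usually dominate the runtime here: network latency (each page is fetched serially over a fresh connection) and HTML parsing. Below is a possible rework, sketched under the assumption that the third-party `requests` and `lxml` packages are installed; the helper name `fetch_text` and the worker count are illustrative, not from the original code.

from concurrent.futures import ThreadPoolExecutor

import requests

session = requests.Session()  # connection pooling: reuses TCP connections across requests

def fetch_text(link):
    """Fetch one page and return its visible text, or '' on failure."""
    try:
        response = session.get(link, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        print(link + " -not found")
        return ""
    # lxml is a much faster parser backend than html.parser.
    soup = BeautifulSoup(response.text, "lxml")
    return soup.get_text(separator=" ")

# The work is network-bound, so threads overlap the waiting time well.
with ThreadPoolExecutor(max_workers=8) as pool:
    texts = pool.map(fetch_text, visited)

text_ang = " ".join(texts)

If adding dependencies is not an option, even just switching BeautifulSoup's parser argument from 'html.parser' to 'lxml' typically gives a noticeable speed-up on its own.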