我结合使用 Beautiful Soup 和 nltk 抓取公司主页,并在主页上查找链接中包含“about”一词的页面,然后把该页面也抓取下来。执行抓取的代码附在下文。
我抓到的数据不够多,无法训练出效果良好的学习模型。我想知道我的抓取算法设计得是否合理——换句话说,我的逻辑中是否存在漏洞,或者有没有更好的方法,能确保我得到一大段描述公司从事何种业务的文本?
import bs4 as bs
import httplib2 as http
import nltk
# Whitelist of characters kept when sanitising a URL scraped out of a page:
# ASCII letters, digits, and the punctuation listed below. Note that '%' is
# not in the set, so percent-escapes are not preserved by this filter.
ALLOWED_CHARS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~:/?#[]@!$&'()*+,;="
)
class WebPage(object):
    """Fetch a site's home page and its "about" page as lists of words.

    Attributes populated by the constructor (each stays ``None`` when the
    corresponding fetch fails or nothing suitable is found):

    - ``url``: home-page URL, possibly replaced by a detected redirect target
    - ``homepage``: list of words extracted from the home page
    - ``about_us``: list of words extracted from the "about" page
    - ``about_us_url``: URL that was requested for the "about" page
    """

    def __init__(self, domain):
        """
        Build the home-page URL and scrape both pages, best effort.

        :param domain: domain name without scheme or ``www.`` prefix;
            ``'http://www.'`` is prepended to it
        :type domain: str
        """
        self.url = 'http://www.' + domain
        # Give every attribute a defined value up front so that callers can
        # always read them, even when a fetch below fails half-way.
        self.homepage = None
        self.about_us = None
        self.about_us_url = None
        self._raw_homepage = None

        # NOTE(review): the bodies of the try blocks were missing from the
        # pasted source; calling the two fetch helpers here is a
        # reconstruction -- confirm against the original file.
        # Scraping is deliberately best effort: any failure leaves the
        # attribute as None. `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt / SystemExit working.
        try:
            self._get_homepage()
        except Exception:
            self.homepage = None
        try:
            self._get_about_us()
        except Exception:
            self.about_us = None

    def _get_homepage(self):
        """
        Open the home page, looking for redirects.

        A response whose ``content-length`` is under 250 is treated as a
        redirect stub: the first http(s) URL found in its body becomes the
        new ``self.url`` and is fetched instead. A missing header defaults
        to 251, i.e. "not a stub".
        """
        import re

        web = http.Http()
        response, pg = web.request(self.url)
        # Check for redirects:
        if int(response.get('content-length', 251)) < 250:
            # The body may be bytes; decode so the str pattern can be applied.
            text = pg.decode('utf-8', 'replace') if isinstance(pg, bytes) else pg
            matches = re.findall(r'(https?://\S+)', text)
            # \S+ also swallows trailing markup such as '">', so keep only
            # characters that are allowed in a URL.
            new_url = ''
            if matches:
                new_url = ''.join(c for c in matches[0] if c in ALLOWED_CHARS)
            if new_url:  # otherwise there's not much we can do...
                self.url = new_url
                response, pg = web.request(self.url)
        self.homepage = self._parse_html(self._html_to_text(pg))
        self._raw_homepage = pg

    def _get_about_us(self):
        """
        Soup-ify the home page, find the "About us" page, and store its
        contents in ``self.about_us`` as a list of words.

        ``self.about_us`` is left as ``None`` when the home page was not
        fetched, no link containing "about" exists, or the fetched page
        reports a ``content-length`` of 250 or less.
        """
        self.about_us = None
        if self._raw_homepage is None:
            return

        soup = bs.BeautifulSoup(self._raw_homepage, 'html.parser')
        about = [a['href'] for a in soup.find_all('a', href=True)
                 if 'about' in a['href'].lower()]
        about_us_page = self._pick_about_page(about)
        if about_us_page is None:
            return

        if about_us_page.lower().startswith(('http://', 'https://')):
            # The fallback candidate can be an absolute link; use it as-is
            # instead of gluing it onto the home-page URL.
            self.about_us_url = about_us_page
        else:
            # Strip slashes on both sides so the join never produces '//'.
            self.about_us_url = (self.url.rstrip('/') + '/'
                                 + about_us_page.strip('/'))
        web = http.Http()
        response, pg = web.request(self.about_us_url)
        if int(response.get('content-length', 251)) > 250:
            self.about_us = self._parse_html(self._html_to_text(pg))

    @staticmethod
    def _pick_about_page(hrefs):
        """
        Choose the "about" page from candidate hrefs.

        The last href that is a single path segment, or whose final path
        segment contains "about", wins and that segment is returned. If no
        href qualifies, the shortest href is assumed to be the top-level
        about page and is returned unchanged.

        :param hrefs: href values that contain "about"
        :type hrefs: list[str]
        :return: page name / href, or ``None`` when ``hrefs`` is empty
        """
        page = None
        for href in hrefs:
            bits = href.strip('/').split('/')
            if len(bits) == 1 or 'about' in bits[-1].lower():
                page = bits[-1]
        if page is None and hrefs:
            page = min(hrefs, key=len)
        return page

    @staticmethod
    def _html_to_text(markup):
        """
        Strip markup from an HTML document and return its visible text.

        Uses BeautifulSoup rather than ``nltk.clean_html``, which raises
        NotImplementedError in NLTK 3 and later.
        """
        soup = bs.BeautifulSoup(markup, 'html.parser')
        # Script and style bodies are not page text; drop them first.
        for tag in soup.find_all(['script', 'style']):
            tag.decompose()
        return soup.get_text(' ')

    def _parse_html(self, raw_text):
        """
        Turn cleaned page text into a list of words. Gets rid of

        - all line breaks and other whitespace (including tabs)
        - all zero length words
        - all words starting with '&' (left-over HTML entities)

        :param raw_text: text with the HTML markup already removed
        :type raw_text: str
        :return: the remaining words, in document order
        """
        # str.split() with no argument splits on any whitespace run and
        # never yields empty strings.
        return [word for word in raw_text.split() if not word.startswith('&')]
答案 0 :(得分:1)
这超出了您所问的范围,但我会考虑调用已经收集了这类信息的外部数据源。ProgrammableWeb 是查找此类服务的好去处(例如 Mergent Company Fundamentals)。ProgrammableWeb 上的数据并非都是最新的,但似乎很多 API 提供商都收录在其中。