I have several parsers that fetch HTML from news sites. To make sure a site's structure has not changed, I run a simple test twice a day: I compare manually verified HTML stored in a text file (originally returned by get_content()) with what get_content() returns right now.
This comparison gives different results for different URLs. For example, the strings are identical for this URL: http://ria.ru/world/20160309/1387413601.html, but they differ for this one: http://www.ntv.ru/novosti/1608256/. What am I missing here?
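For reference, the saved files are produced from the same get_content() output and then checked by hand. A minimal sketch of that step (save_reference() is just an illustrative name; it relies on get_content() and the texts/<site>.txt naming from the code below):

from urlparse import urlparse

def save_reference(url):
    # Store the freshly parsed article as the reference copy for later comparison
    site = urlparse(url).netloc.replace('www.', '')
    content = get_content(url)
    if content:
        with open('texts/%s.txt' % site, 'w') as f:
            f.write(content)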
import hashlib
import logging
import urllib2
from urlparse import urlparse

from bs4 import BeautifulSoup as bs
from bs4 import Comment

class TextParser(object):
def __init__(self, site, tag, prop):
self.site = site
self.tag = tag
self.prop = prop
def __call__(self, url):
soup = bs(get_resource(url), 'html.parser')
article = soup.find(self.tag, self.prop)
if article:
self.clean(article)
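            # encode_contents() returns the inner HTML as a UTF-8 byte string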
return article.encode_contents().strip()
    def clean(self, article):
        # Drop scripts, nested divs, images, tables and asides from the article body
        for s in article(['script', 'div', 'img', 'table', 'aside']):
            s.extract()
        # Drop HTML comments
        for element in article(text=lambda text: isinstance(text, Comment)):
            element.extract()
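# One TextParser per supported site; get_content() picks the right one by netloc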
parsers = [
TextParser('ria.ru', 'div', {'id': 'article_full_text'}),
TextParser('ntv.ru', 'div', 'smcntr')
]
def get_resource(url):
    request = urllib2.Request(url)
    response = None
    try:
        response = urllib2.urlopen(request)
        html = response.read()
    except (urllib2.HTTPError, urllib2.URLError) as e:
        logging.error(e)
        return None
    finally:
        # Only close the response if urlopen() actually succeeded
        if response is not None:
            response.close()
    return html
def get_content(url):
site = urlparse(url).netloc.replace('www.', '')
parser = next((x for x in parsers if x.site == site), None)
if parser:
content = parser(url)
if content:
return content
else:
            logging.warning('Parsing error. URL: ' + url)
return None
else:
        logging.warning('Missing parser. URL: ' + url)
return None
if __name__ == "__main__":
content_web = get_content('http://www.ntv.ru/novosti/1608256/')
with open('texts/ntv.ru.txt', 'r') as f:
content_file = f.read()
print hashlib.md5(content_web).hexdigest()
print hashlib.md5(content_file).hexdigest()