For some reason, when I run this code it keeps looping over the same object and never fetches any new items from the database. In other words, the output prints the same object over and over, when it should be iterating through the items in the list. Here is my code:
article = Article.objects.filter(is_locked=False, is_downloaded=False).first()
while article:
    article.is_locked = True
    article.save()
    print '******************************'
    date = article.datetime
    title = article.title
    url = article.url
    print('date: %s' % date)
    print('url: %s' % url)
    print('title: %s' % title)
    get_article(url, title, article)
    article = Article.objects.filter(is_locked=False, is_downloaded=False).first()
Here is mldb.models:
from django.db import models


class Article(models.Model):
    url = models.CharField(max_length=1028)
    title = models.CharField(max_length=1028)
    category = models.CharField(max_length=128)
    locale = models.CharField(max_length=128)
    section = models.CharField(max_length=512)
    tag = models.CharField(max_length=128)
    author = models.CharField(max_length=256)
    datetime = models.DateTimeField()
    description = models.TextField()
    article = models.TextField()
    is_locked = models.BooleanField(default=False)
    is_downloaded = models.BooleanField(default=False)

    def __str__(self):  # __unicode__ on Python 2
        return self.name

    class Meta:
        app_label = 'mldb'
I also tried this, but it does not iterate over the objects either (the loop just repeats the same object over and over):
articles = Article.objects.filter(is_locked=False, is_downloaded=False)
for article in articles:
    ...
Here is get_article(). It seems to be what causes the problem (if I remove the call to this function, everything works fine):
def get_article(url, title, article):
    failed_attempts = 0
    while True:
        try:
            content = urllib2.urlopen(url).read()
            soup = BeautifulSoup(content, "html5lib")
            description = soup.find(property="og:description")["content"] if soup.find(property="og:description") else ''
            locale = soup.find(property="og:locale")["content"] if soup.find(property="og:locale") else ''
            section = soup.find(property="og:article:section")["content"] if soup.find(property="og:article:section") else ''
            tag = soup.find(property="og:article:tag")["content"] if soup.find(property="og:article:tag") else ''
            author = soup.find(property="og:article:author")["content"] if soup.find(property="og:article:author") else ''
            date = soup.find(property="og:article:published_time")["content"] if soup.find(property="og:article:published_time") else ''
            print 'date'
            print date
            body = ''
            for body_tag in soup.findAll("div", {"class": re.compile('ArticleBody_body.*')}):
                body += body_tag.text
            # datetime.strptime(ts, "%Y")  # 2012-01-02T04:32:57+0000
            dt = dateutil.parser.parse(date, fuzzy=True)
            print dt
            print url
            article.title = title.encode('utf-8')
            article.url = url.encode('utf-8')
            article.description = description.encode('utf-8')
            article.locale = locale.encode('utf-8')
            article.section = section.encode('utf-8')
            article.tag = tag.encode('utf-8')
            article.author = author.encode('utf-8')
            article.body = body.encode('utf-8')
            article.is_downloaded = True
            article.article = body
            article.save()
            print(description.encode('utf-8'))
        except (urllib2.HTTPError, ValueError) as err:
            print err
            time.sleep(20)
            failed_attempts += 1
            if failed_attempts < 10:
                continue
Any ideas?
Answer 0 (score: 1)
I see an infinite loop in your get_article() function. Consider this simplified version of get_article() for illustration:
def get_article(url, title, article):
    failed_attempts = 0
    # Note how this while loop runs endlessly.
    while True:
        try:
            # doing something here without calling `return` anywhere
            # I'll just write `pass` for the purpose of simplification
            pass
        except (urllib2.HTTPError, ValueError) as err:
            failed_attempts += 1
            if failed_attempts < 10:
                # you're calling `continue` here but you're not calling
                # `break` or `return` anywhere if failed_attempts >= 10
                # and therefore you're still stuck in the while-loop
                continue
Note that simply not calling continue does not stop the while loop:
while True:
    print('infinite loop!')
    if some_condition:
        # if some_condition is truthy, continue
        continue
    # but if it's not, we will continue anyway. the above if-condition
    # therefore doesn't make sense
A fixed version might look like this (I have omitted the details):
def get_article(url, title, article):
    failed_attempts = 0
    while True:
        try:
            # it's considered good practice to only put the throwing
            # statement you want to catch in the try-block
            content = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, ValueError) as err:
            failed_attempts += 1
            if failed_attempts == 10:
                # if it's the 10th attempt, break the while loop.
                # consider throwing an error here which you can handle
                # where you're calling `get_article` from. otherwise
                # the caller doesn't know something went wrong
                break
        else:
            # do your work here
            soup = BeautifulSoup(content, "html5lib")
            # ...
            article.save()
            # and call return!
            return
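
To act on the comment about letting the caller know that something went wrong, one option is to raise an exception once the retry limit is reached and catch it in the loop that fetches articles. Below is only a sketch built on the code above; DownloadError is a hypothetical exception introduced here for illustration, not part of the original project:

import time
import urllib2


class DownloadError(Exception):
    # Hypothetical exception: raised when an article could not be
    # fetched after the maximum number of attempts.
    pass


def get_article(url, title, article):
    failed_attempts = 0
    while True:
        try:
            content = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, ValueError) as err:
            failed_attempts += 1
            if failed_attempts == 10:
                # Give up and tell the caller that this URL failed.
                raise DownloadError('could not fetch %s: %s' % (url, err))
            time.sleep(20)
        else:
            # Parse `content`, fill in the article fields and save,
            # exactly as in the fixed version above, then return.
            return


# Caller side: a failing article no longer traps you inside get_article().
# It stays is_downloaded=False and, because it was already locked, the next
# query simply moves on to a different article.
article = Article.objects.filter(is_locked=False, is_downloaded=False).first()
while article:
    article.is_locked = True
    article.save()
    try:
        get_article(article.url, article.title, article)
    except DownloadError as err:
        print err
    article = Article.objects.filter(is_locked=False, is_downloaded=False).first()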