这是我的代码(link):
import urllib, urllib2, re
import threading, Queue
somewebsite = 'xxx'
urls5 = []
for i in range(50001, 60000):
url = somewebsite + str(i)
urls5.append(url)
class MultiUrl(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while True:
url = self.queue.get()
try:
Go(url)
except:
continue
self.queue.task_done()
def main():
queue = Queue.Queue()
for i in range(16):
t = MultiUrl(queue)
t.setDaemon(True)
t.start()
for url in urls5:
queue.put(url)
queue.join()
def Go(url):
try:
print "Now grabbing...", url
getArticle(url)
if (OriginalContentList != []):
writeContent()
except:
pass
def getArticle(url):
global content, Content
i = re.findall('\d+', url)[0]
ArticleID = str(i)
try:
content = urllib2.urlopen(url).read()
Content = urllib.unquote(content)
except:
OriginalContentList = []
pass
def writeContent():
# Origin Content
try:
filename = ArticleID + '.txt'
file_out = open(filename, 'w')
file_out.write(Content)
file_out.write('\n')
file_out.close()
except:
pass
if __name__ == '__main__':
main()
单线程抓取无法处理那么多文章(> = 200000),所以我谷歌(d)并选择了线程和队列。
问题是:程序无法将内容写入对应文件。例如,它可以将No.55555的内容写入55666.txt。评论我的描述是否准确。
答案 0 :(得分:0)
全局变量由多个线程共享。 一个线程可以在其他线程读取变量时覆盖变量。
尽可能避免使用全局变量。
答案 1 :(得分:0)