我创建了一个网络抓取工具,用于提取在数字图书馆(sample document)中发布的研究论文的信息。
基本上我正在提取每篇论文的标题,摘要和参考文献列表,并将它们存储在文本文件中。对所有引用的论文也重复这个过程。
有哪些可行的方法可以提高这个抓取工具的速度?
以下是代码:
# _*_ coding:utf-8 _*_
import collections
import io
import json
import Queue
import sys
import urllib2
# IDs of papers that have already been put on the crawl queue; consulted
# before enqueueing a reference so the same paper is not queued twice.
crawled = []
# Shared output file: receives one "<paper_id> <referenced_paper_id>" line
# per citation edge found while exporting a paper.
# NOTE(review): opened at import time and never explicitly closed in the
# visible code -- confirm whether a close/flush at the end of the crawl is wanted.
fo = open("paper.txt", "w")
class Paper(object):
    """A single paper identified by its IEEE Xplore document ID.

    Constructing an instance immediately downloads the paper's title and
    abstract, so ``Paper(paper_id)`` performs one HTTP request and raises
    whatever that request or the JSON decoding raises.
    """

    # REST endpoint template: {0} is the document ID, {1} the section name.
    BASE_URL = "http://ieeexplore.ieee.org/rest/document/{0}/{1}"

    def __init__(self, paper_id):
        """Store *paper_id* and fetch the title and abstract for it."""
        self.paper_id = paper_id
        # Filled in by the first fetch_ieee_references() call.
        self._references = None
        self.title, self.abstract = self.fetch_data()

    @staticmethod
    def _warn(message):
        """Write a one-line diagnostic to stderr."""
        sys.stderr.write(message + "\n")

    def _fetch_json(self, section):
        """Download and decode the JSON document for *section* of this paper.

        *section* is the last URL component ("abstract" or "references").
        The response is parsed straight from the stream and always closed.
        """
        response = urllib2.urlopen(self.BASE_URL.format(self.paper_id, section))
        try:
            return json.load(response)
        finally:
            response.close()

    @staticmethod
    def _reference_ids(data):
        """Return the document IDs linked from a decoded "references" payload.

        Entries without a ``links.documentLink`` value are skipped. A payload
        with no usable "references" list yields an empty list.
        """
        try:
            entries = data["references"]
        except (KeyError, TypeError):
            return []
        if not isinstance(entries, (list, tuple)):
            return []
        ids = []
        for entry in entries:
            try:
                link = entry["links"]["documentLink"]
                # The ID is the last path component of the document link.
                ids.append(link.split("/")[-1])
            except (KeyError, TypeError, AttributeError):
                continue
        return ids

    def fetch_data(self):
        """Return ``(title, abstract)`` for this paper.

        Raises KeyError if the response lacks either field, in addition to
        any error raised by the download or the JSON decoding.
        """
        data = self._fetch_json("abstract")
        return data["title"], data["abstract"]

    def fetch_ieee_references(self):
        """Return a list of ``Paper`` objects for the references of this paper.

        Each returned ``Paper`` has already fetched its own title and
        abstract. References that cannot be resolved or downloaded are
        skipped with a warning on stderr. The list is cached on the instance
        after the first successful call, so repeated calls do not repeat the
        HTTP requests; callers receive the same list object each time.
        """
        if self._references is not None:
            return self._references
        data = self._fetch_json("references")
        references = []
        for ref_id in self._reference_ids(data):
            try:
                references.append(Paper(ref_id))
            except Exception as exc:
                # Best effort per reference; unlike a bare ``except`` this
                # lets KeyboardInterrupt / SystemExit propagate.
                self._warn("skipping reference %r: %r" % (ref_id, exc))
        self._references = references
        return references

    def extract_paper(self):
        """Export this paper to disk and echo it to stdout.

        Writes the title and abstract to "<paper_id>.txt", prints the paper's
        details and its references, and appends one
        "<paper_id> <reference_id>" line per reference to the module-level
        ``fo`` file. Failures are reported on stderr and never raised.
        """
        try:
            print("Paper ID")
            print(self.paper_id)
            fname = str(self.paper_id) + ".txt"
            # Explicit UTF-8 so non-ASCII titles/abstracts are written instead
            # of failing under the default ASCII codec; ``with`` closes the file.
            with io.open(fname, "w", encoding="utf-8") as fcon:
                fcon.write(u"%s\n" % (self.title,))
                fcon.write(u"%s\n" % (self.abstract,))
            print("")
            print("Title")
            print(self.title)
            print("Abstract")
            print(self.abstract)
            print("References")
            for ref in self.fetch_ieee_references():
                print("%s %s" % (ref.paper_id, ref.title))
                fo.write("%s %s\n" % (self.paper_id, ref.paper_id))
        except Exception as exc:
            self._warn("could not export paper %r: %r" % (self.paper_id, exc))
def new_func():
    """Crawl the citation graph breadth-first from a fixed seed paper.

    Each dequeued paper is downloaded and exported via
    ``Paper.extract_paper()``; every reference not seen before is queued.
    Queued IDs are also appended to the module-level ``crawled`` list.

    NOTE: ``Paper(...)`` and ``fetch_ieee_references()`` are not guarded
    here, so a failed request for a queued paper propagates and ends the crawl.
    """
    # Reference IDs are strings taken from URLs, so the seed is a string too;
    # otherwise a paper citing the seed would not match it and the seed would
    # be crawled a second time.
    seed_id = str(6639344)
    pending = collections.deque([seed_id])
    crawled.append(seed_id)
    # Set mirror of ``crawled`` for O(1) membership tests.
    seen = set(crawled)
    while pending:
        paper = Paper(pending.popleft())
        paper.extract_paper()
        for ref in paper.fetch_ieee_references():
            if ref.paper_id not in seen:
                seen.add(ref.paper_id)
                crawled.append(ref.paper_id)
                pending.append(ref.paper_id)


new_func()
答案 0 :(得分:0)
正如其他用户已经提到的,速度主要取决于HTTP请求的快慢,也就是说您受制于站点服务器的响应速度。因此,为了加快速度,您可以把抓取任务分配给多个进程并行处理。另外,我不明白为什么要先读取html再调用json.loads:您可以直接对响应对象使用json.load,这样会稍微快一点。