# -*- coding: utf-8 -*-
import datetime
import re

import cx_Oracle
from scrapy import Spider, FormRequest

# import_config, ORACLE_USER, ORACLE_PASSWD and ORACLE_DSN come from the
# project's own config module (not shown in this post).


class DmozSpider(Spider):
    name = "drugtrials"
    dt = import_config()
    allowed_domains = ['chinadrugtrials.org.cn']
    start_urls = [
        'http://www.chinadrugtrials.org.cn/eap/clinicaltrials.searchlist'
    ]
    def parse(self, response):
        print 'drugtrials start time:', datetime.datetime.now()
        with open('/var/www/html/cde/drugtrials.txt', 'a+') as f:
            f.write("drugtrials start time " + str(datetime.datetime.now()) + '\n')

        dt = import_config()
        notneedwords_str = dt['not_need_words']
        NOTNEEDWORDS = notneedwords_str.split(',')
        print "NOTNEEDWORDS:", NOTNEEDWORDS

        db = cx_Oracle.connect(ORACLE_USER, ORACLE_PASSWD, ORACLE_DSN)
        cursor = db.cursor()
        # This query returns 116,589 rows.
        sql = """SELECT "id","name","company" FROM "cde" WHERE "sfda_status">=8"""
        cursor.execute(sql)
        result = cursor.fetchall()
        length = len(result)

        requests = []
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')  # matches Chinese (CJK) characters
        for i in range(0, length):
            namearr = result[i][1].split(" ")
            companyarr = result[i][2].split(" ")
            arr = namearr + companyarr
            for m in arr:
                if len(m) != 1 and m not in NOTNEEDWORDS:
                    if m in namearr:
                        # Look up the drug's registered aliases.
                        cursor.execute(
                            """SELECT "cde_usedname","cde_usedname2","cde_usedname3","cde_usedname4","cde_usedname5" FROM "cde_usedname" WHERE "cde_name"=:cde_name""",
                            {'cde_name': m})
                        all_bieming1 = cursor.fetchone()
                        all_bieming = []
                        if all_bieming1:
                            for iva in all_bieming1:
                                if iva is not None:
                                    all_bieming.append(iva)
                            print all_bieming
                            all_drug_name = ' '.join(all_bieming)
                            m2 = m + ' ' + all_drug_name
                        else:
                            m2 = m
                        request = FormRequest(
                            dt['drug_url'],
                            formdata={'drugs_name': m2},
                            dont_filter=True,
                            callback=self.parse_page,
                            meta={'code': result[i][0], 'keyword': m2,
                                  'keyword_type': 'drugs_name'})
                        requests.append(request)
                    else:
                        # Company keywords: only query those that contain
                        # Chinese characters.
                        match = zhPattern.search(unicode(m))
                        if match:
                            request = FormRequest(
                                dt['drug_url'],
                                formdata={'appliers': m},
                                dont_filter=True,
                                callback=self.parse_page,
                                meta={'code': result[i][0], 'keyword': m,
                                      'keyword_type': 'appliers'})
                            requests.append(request)
                        else:
                            print 'no Chinese characters in {}'.format(m)
        length1 = len(requests)
        print "length1:", length1
        for request in requests:
            yield request

    def parse_page(self, response):
        pass
I found that when the amount of data is this large (116,589 rows), the program never executes the code that yields the requests. I am fairly sure the cause is that the callback cannot be invoked because of the huge number of requests, but I have not found a way to solve this. Please help me, thank you!
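A minimal sketch of one possible fix, assuming the same config helpers (import_config, ORACLE_USER, ORACLE_PASSWD, ORACLE_DSN) as above. Since parse() is a generator, Scrapy consumes it lazily, so yielding each FormRequest as soon as it is built, instead of first accumulating all of them in a list after a fetchall() that has already loaded every row into memory, keeps memory bounded and lets the callbacks start firing long before the last row is processed. The keywords_for_row helper is hypothetical and stands in for the keyword-splitting logic above:

    def parse(self, response):
        dt = import_config()
        db = cx_Oracle.connect(ORACLE_USER, ORACLE_PASSWD, ORACLE_DSN)
        cursor = db.cursor()
        cursor.execute("""SELECT "id","name","company" FROM "cde" WHERE "sfda_status">=8""")
        # Iterate the cursor row by row instead of calling fetchall().
        for code, name, company in cursor:
            # keywords_for_row is a hypothetical helper that would yield
            # (form_field, keyword) pairs using the splitting logic above.
            for field, keyword in self.keywords_for_row(name, company):
                yield FormRequest(
                    dt['drug_url'],
                    formdata={field: keyword},
                    dont_filter=True,
                    callback=self.parse_page,
                    meta={'code': code, 'keyword': keyword,
                          'keyword_type': field})

If all the requests really must be scheduled with dont_filter=True, it may also help to keep the scheduler queue on disk by setting Scrapy's JOBDIR, and to review CONCURRENT_REQUESTS, so that hundreds of thousands of pending requests do not have to live in memory at once.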