我需要以递归方式抓取网站。这就是我现在所拥有的:
class DemoSpider(CrawlSpider):
name = 'sample_recursive'
allowed_domains = ['www.example.org']
start_urls = [
"http://example.org"
]
rules = [Rule(SgmlLinkExtractor(allow=(r'/.org/site/ID/[\w*\W*]*'), deny=(r'.org/subscription')), callback='parse_start_url', follow=True)]
def parse_start_url(self, response):
items = []
item = DemoSampleItem()
item["source_url"] = response.url
item["title"] = response.xpath('//div[@class="content-title"]/h2/text()')[0].extract()
item["breadcrumb"] = response.xpath("//ul[@class='breadcrumbs']")[0].extract()
item["content"] = response.xpath("//div[@class='main_col']")[0].extract()
item["right_col"] = response.xpath("//div[@class='right_col']").extract()
item["left_col"] = response.xpath("//div[@class='left_col']")[0].extract()
item["depth"] = response.meta.get('depth', 0)
items.append(item)
return items
我希望它能够浏览以下网页:" example.org"," example.org/site/ID/home"," example.org/site/ ID /合作伙伴"和" example.org/site/ID/home/our-values" 然后将每个项目管道保存到mysql数据库中作为其自己的条目。
class AcdiSamplePipeline(object):
# connect
db_connection = MySQLdb.connect(host='localhost', user='user', passwd='passwd', db='dbname'
, charset='utf8', use_unicode=True)
# create a database cursor
cursor = db_connection.cursor()
def process_item(self, item, spider):
source_url = item["source_url"]
title = item["title"].encode('utf-8')
breadcrumb = item["breadcrumb"].encode('utf-8')
content = item["content"].encode('utf-8')
left_col = item["left_col"].encode('utf-8')
right_col = item["right_col"].encode('utf-8')
depth = item["depth"]
try:
self.cursor.execute("""INSERT INTO table_name (source_url, title, breadcrumb, content
, right_col, left_col, page_depth)
VALUES (%s, %s, %s, %s, %s, %s, %s)""",
(source_url
, title
, breadcrumb
, content
, right_col
, left_col
, depth))
self.db_connection.commit()
except MySQLdb.Error, e:
print("--------------- Printing DB ERROR(s) -------------------")
print "ERROR WHILE DB WRITE %d: %s" % (e.args[0], e.args[1])
return item
但截至目前,它是唯一的刮刮和储蓄" example.org"在数据库中。知道为什么它不会递归地通过网站吗?
答案 0 :(得分:1)
LinkExtractor中的正则表达式看起来不对:
allow=(r'/.org/site/ID/[\w*\W*]*')
# ^--- this slash doesn't belong here.
# Plus, the dot should be escaped, else it matches any character
您似乎希望正则表达式更像:
allow=(r'[.]org/site/ID/.+')