scrapy sql或sqlite ...无法获得所需的输出

时间:2012-07-11 12:14:10

标签: sql sqlite scrapy

我正在尝试从网站页面中提取输入字段,并将这些字段连同包含它们的页面URL一起存储到数据库中。
*** The code runs without errors, but it does not produce the output I want.

蜘蛛代码:

class MySpider(CrawlSpider):
    """Crawl testaspnet.vulnweb.com and record form inputs found on each page.

    For every response handled by ``parse_item`` one ``IsaItem`` is produced
    holding the page URL and the ``id`` of the first text, password and file
    ``<input>`` element (``None`` when the page has no such element).
    """

    name = 'isa_spider'
    allowed_domains = ['testaspnet.vulnweb.com']
    start_urls = ['http://testaspnet.vulnweb.com']

    rules = (
        Rule(SgmlLinkExtractor(allow='/*'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build an ``IsaItem`` describing the inputs on ``response``.

        Only the first matching ``id`` per input type is kept; the XPath
        accepts inputs having an ``id`` or a ``name`` but extracts ``@id``.
        """
        selector = HtmlXPathSelector(response)
        item = IsaItem()
        item['response_fld'] = response.url

        # (item field, XPath query) pairs, evaluated in this order.
        queries = (
            ('text_input',
             "//input[(@id or @name) and (@type = 'text' )]/@id "),
            ('pass_input',
             "//input[(@id or @name) and (@type = 'password')]/@id"),
            ('file_input',
             "//input[(@id or @name) and (@type = 'file')]/@id"),
        )
        for field, xpath in queries:
            matches = selector.select(xpath).extract()
            # Fall back to None when the page has no input of this type.
            if matches:
                item[field] = matches[0]
            else:
                item[field] = None

        return item

管道代码

    class SQLiteStorePipeline(object):
        """Scrapy item pipeline that stores scraped input ids and URLs in SQLite.

        Each processed item adds three rows to the ``inputs`` table (text,
        password and file input ids, in that order) and one row to the
        ``links`` table, then commits.
        """

        def __init__(self):
            # Relative path: the file is resolved against the working directory.
            self.conn = sqlite3.connect('./project.db')
            self.cur = self.conn.cursor()

        def process_item(self, item, spider):
            """Persist ``item`` and hand it on to the next pipeline stage.

            Args:
                item: mapping with the keys ``text_input``, ``pass_input``,
                    ``file_input`` and ``response_fld``.
                spider: the spider that produced the item (unused).

            Returns:
                The same ``item`` object, unchanged.
            """
            input_rows = [
                (item[field],)
                for field in ('text_input', 'pass_input', 'file_input')
            ]
            self.cur.executemany(
                "insert into inputs ( input_name) values(?)", input_rows)
            self.cur.execute(
                "insert into links (link) values(?)", (item['response_fld'],))
            self.conn.commit()
            return item

        def close_spider(self, spider):
            """Release the cursor and the connection once the spider is done.

            Scrapy calls this hook, when defined, as the spider closes; without
            it the connection opened in ``__init__`` is never closed.
            """
            self.cur.close()
            self.conn.close()

数据库架构 picture

必需输出 picture
 (抱歉没有直接插入图片,因为我的声誉低于10)

1 个答案:

答案 0 :(得分:0)

尚未对此进行测试:

class SQLiteStorePipeline(object):
    """Scrapy item pipeline that links every stored input to its page URL.

    For each item one row is inserted into ``links`` and three rows into
    ``inputs``; the input rows carry the id of the link row just inserted
    plus a numeric input type (1 = text, 2 = password, 3 = file).
    """

    def __init__(self):
        # Relative path: the file is resolved against the working directory.
        self.conn = sqlite3.connect('./project.db')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Store ``item`` and return it for the next pipeline stage.

        Args:
            item: mapping with the keys ``response_fld``, ``text_input``,
                ``pass_input`` and ``file_input``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, unchanged.
        """
        cursor = self.cur

        # TODO(review): the target id was left undetermined in the original
        # snippet; None is stored as NULL until the lookup is implemented.
        target_id = None
        cursor.execute(
            "insert into links (target, link) values(?, ?)",
            (target_id, item['response_fld']))
        # Read the new link's row id before any further insert overwrites it.
        link_id = cursor.lastrowid

        typed_fields = (('text_input', 1), ('pass_input', 2), ('file_input', 3))
        cursor.executemany(
            "insert into inputs (link_id, input_name, input_type) "
            "values(?, ?, ?)",
            [(link_id, item[field], input_type)
             for field, input_type in typed_fields])

        self.conn.commit()
        # A pipeline stage must return the item so later stages receive it.
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection once the spider is done.

        Scrapy calls this hook, when defined, as the spider closes.
        """
        self.cur.close()
        self.conn.close()