Scrapy: 0 items/minute

Posted: 2015-10-08 18:11:22

Tags: python scrapy

I took a Scrapy example from a website. It runs, but something seems off: it does not fetch all of the content, and I don't know what is going on. The example uses Scrapy + Redis + MongoDB.

Log output:

2015-10-09 01:43:33 [scrapy] INFO: Crawled 292 pages (at 292 pages/min), scraped 291 items (at 291 items/min)
2015-10-09 01:44:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:45:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:46:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:47:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:48:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:49:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:50:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:51:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:52:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:53:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:54:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:55:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:56:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:57:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:58:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)

novspider.py

#-*-coding:utf8-*-

from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from scrapy.http import Request
from novelspider.items import NovelspiderItem
import re

class novSpider(RedisSpider):
    name = "novspider"
    redis_key = 'nvospider:start_urls'
    start_urls = ['http://www.daomubiji.com/']                               

    def parse(self,response):
        selector = Selector(response)
        table = selector.xpath('//table')
        for each in table:
            bookName = each.xpath('tr/td[@colspan="3"]/center/h2/text()').extract()[0]
            content = each.xpath('tr/td/a/text()').extract()
            url = each.xpath('tr/td/a/@href').extract()
            for i in range(len(url)):
                item = NovelspiderItem()
                item['bookName'] = bookName
                item['chapterURL'] = url[i]

                try:
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception,e:
                    continue

                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception,e:
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield Request(url[i], callback='parseContent', meta={'item':item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']
        html = selector.xpath('//div[@class="content"]').extract()[0]
        textField = re.search('<div style="clear:both"></div>(.*?)<div', html,re.S).group(1)
        text = re.findall('<p>(.*?)</p>',textField,re.S)
        fulltext = ''
        for each in text:
            fulltext += each
        item['text'] = fulltext
        yield item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for novelspider project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'novelspider'

SPIDER_MODULES = ['novelspider.spiders']
NEWSPIDER_MODULE = 'novelspider.spiders'

ITEM_PIPELINES = ['novelspider.pipelines.NovelspiderPipeline']

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True

SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'novdata'
MONGODB_DOCNAME = 'nov1'

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from items import NovelspiderItem 
from scrapy.conf import settings
import pymongo

class NovelspiderPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.post.insert(bookInfo)
        return item

1 answer:

Answer 0 (score: 0):

You will never reach the parseContent method that way, because the callback is passed as a string instead of a callable. Use this instead:

yield Request(
    url[i], 
    callback=self.parseContent, # <--
    meta={'item':item})
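
For context, here is a minimal sketch of where the fix lands in novspider.py's parse method, keeping the question's selectors and field names; the item-building lines elided below stay exactly as in the question, and only the callback argument changes:

def parse(self, response):
    selector = Selector(response)
    table = selector.xpath('//table')
    for each in table:
        content = each.xpath('tr/td/a/text()').extract()
        url = each.xpath('tr/td/a/@href').extract()
        for i in range(len(url)):
            item = NovelspiderItem()
            item['chapterURL'] = url[i]
            # ... fill bookName, bookTitle, chapterNum, chapterName as in the question ...
            # Pass the bound method itself; with the string 'parseContent',
            # parseContent is never called.
            yield Request(url[i], callback=self.parseContent, meta={'item': item})

Request's callback parameter expects a callable, typically a bound method of the spider, not the method's name as a string.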