I got a Scrapy example from a website. It works, but something seems wrong: it cannot fetch all the content, and I don't know what is going on. The example uses Scrapy + Redis + MongoDB.
The relevant log output and files:
The crawl log:
2015-10-09 01:43:33 [scrapy] INFO: Crawled 292 pages (at 292 pages/min), scraped 291 items (at 291 items/min)
2015-10-09 01:44:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:45:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:46:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:47:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:48:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:49:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:50:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:51:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:52:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:53:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:54:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:55:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:56:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:57:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
2015-10-09 01:58:33 [scrapy] INFO: Crawled 292 pages (at 0 pages/min), scraped 291 items (at 0 items/min)
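The crawl reaches 292 pages within the first minute and then stalls completely. One way to check whether requests are still queued in Redis after the stall is a quick inspection like the sketch below; the key names are an assumption based on scrapy-redis defaults ('<spidername>:requests' for the scheduler queue, which SpiderPriorityQueue stores as a sorted set, and '<spidername>:dupefilter' for seen request fingerprints):

    # Inspect the scrapy-redis state after the crawl stalls.
    # Key names assume the scrapy-redis defaults for this project.
    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379)
    print r.zcard('novspider:requests')     # pending requests in the priority queue
    print r.scard('novspider:dupefilter')   # fingerprints of requests already seen
    print r.llen('nvospider:start_urls')    # seeds left under the spider's redis_key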
novspider.py
# -*- coding: utf8 -*-
from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from scrapy.http import Request
from novelspider.items import NovelspiderItem
import re

class novSpider(RedisSpider):
    name = "novspider"
    redis_key = 'nvospider:start_urls'
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath('//table')
        for each in table:
            bookName = each.xpath('tr/td[@colspan="3"]/center/h2/text()').extract()[0]
            content = each.xpath('tr/td/a/text()').extract()
            url = each.xpath('tr/td/a/@href').extract()
            for i in range(len(url)):
                item = NovelspiderItem()
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception, e:
                    continue
                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception, e:
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield Request(url[i], callback='parseContent', meta={'item': item})

    def parseContent(self, response):
        selector = Selector(response)
        item = response.meta['item']
        html = selector.xpath('//div[@class="content"]').extract()[0]
        textField = re.search('<div style="clear:both"></div>(.*?)<div', html, re.S).group(1)
        text = re.findall('<p>(.*?)</p>', textField, re.S)
        fulltext = ''
        for each in text:
            fulltext += each
        item['text'] = fulltext
        yield item
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for novelspider project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'novelspider'
SPIDER_MODULES = ['novelspider.spiders']
NEWSPIDER_MODULE = 'novelspider.spiders'
ITEM_PIPELINES = ['novelspider.pipelines.NovelspiderPipeline']
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'novdata'
MONGODB_DOCNAME = 'nov1'
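Because this is a RedisSpider with SCHEDULER_PERSIST enabled, the spider normally pulls its start URLs from the redis_key list rather than from the start_urls attribute, so the seed URL has to be pushed into Redis before the crawl. A minimal seeding sketch, assuming a local Redis on the default port and the redis_key value from the spider above:

    # Push the seed URL into the list the RedisSpider reads from.
    import redis

    r = redis.StrictRedis(host='127.0.0.1', port=6379)
    r.lpush('nvospider:start_urls', 'http://www.daomubiji.com/')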
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from items import NovelspiderItem
from scrapy.conf import settings
import pymongo

class NovelspiderPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.post.insert(bookInfo)
        return item
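One side note on the pipeline: collection.insert() matches the PyMongo releases current at the time, but PyMongo 3.x deprecates it in favour of insert_one(). A sketch of process_item for newer PyMongo would be:

    def process_item(self, item, spider):
        # insert_one() replaces the deprecated insert() on PyMongo >= 3.0
        self.post.insert_one(dict(item))
        return item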
Answer 0 (score: 0)
You will never reach the parse method that way. Use this instead:
yield Request(
    url[i],
    callback=self.parseContent,  # <--
    meta={'item': item})
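The reason is that scrapy.http.Request expects callback to be a callable; a string such as 'parseContent' is only interpreted as a method name by CrawlSpider rules, not by Request itself. With the fix applied, the full yield inside parse would read:

    # Pass the bound method, not its name as a string.
    yield Request(url[i], callback=self.parseContent, meta={'item': item})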