当我尝试运行代码时遇到了这个问题:我已经为这次抓取定义了实时请求,但它仍然无法正常工作。有人知道在 Python 中该如何处理这个问题吗?在这种情况下,站点地图(sitemap)有多重要?提前致谢。
stuff:[{"name":"me","position":"here"},
{"name":"me","position":"here"},
{"name":"me","position":"here"}]
这是我的文字结果:
import logging
import re
from urllib.parse import urljoin, urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Request
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from sqlalchemy.orm import sessionmaker
from content.spiders.templates.sitemap_template import ModSitemapSpider
from content.models import db_connect, create_db_table, Articles
from content.items import ContentItems
from content.item_functions import (process_item,
process_singular_item,
process_date_item,
process_array_item,
process_plural_texts,
process_external_links,
process_article_text)
# XPath expressions and the date format that CNNnewsSpider.parse hands to the
# process_* helpers. Where a list holds several expressions they are
# presumably alternatives for different page layouts -- confirm in
# content.item_functions.

# Expressions for the article title.
HEADER_XPATH = [
    '//h1[@class="article-title"]//text()',
]

# Expressions for the author byline.
AUTHOR_XPATH = [
    '//span[@class="cnnbyline"]//text()',
    '//span[@class="byline"]//text()',
]

# Expressions for the publication date stamp.
PUBDATE_XPATH = [
    '//span[@class="cnnDateStamp"]//text()',
]

# Tags and category currently hold a single empty expression.
# NOTE(review): looks like a "not available" placeholder -- confirm the
# helpers accept an empty XPath string.
TAGS_XPATH = [
    '',
]
CATEGORY_XPATH = [
    '',
]

# Expressions for the article body text.
TEXT = [
    '//div[@id="storytext"]//text()',
    '//div[@id="storycontent"]//p//text()',
]

# Expressions for link targets found under in-story headings.
INTERLINKS = [
    '//span[@class="inStoryHeading"]//a/@href',
]

# Format string passed along with PUBDATE_XPATH to process_date_item.
DATE_FORMAT_STRING = '%Y-%m-%d'
class CNNnewsSpider(ModSitemapSpider):
    """Spider for cnn.com seeded from the news sitemap in ``sitemap_urls``.

    ``parse`` must be indented inside the class body; at module level it is
    a free function and does not override the base spider's ``parse``.
    """

    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]

    def parse(self, response):
        """Build one ``ContentItems`` record from a downloaded page.

        NOTE(review): presumably invoked by ``ModSitemapSpider`` once per
        URL found in the sitemap -- confirm against the template class.

        Args:
            response: The page response. ``response.url`` is read directly
                and the object is forwarded to the ``process_*`` helpers
                together with the module-level XPath lists.

        Returns:
            A one-element list holding the populated item.
        """
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        # Only the host part of the URL is stored as the resource.
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        # Still returned as a list so callers that iterate the result keep working.
        return [item]
答案 0 :(得分:0)
由于您的类 CNNnewsSpider 没有覆盖 scrapy.BaseSpider 的 parse() 方法,因此会抛出异常。虽然您在粘贴的代码中定义了 parse() 方法,但由于缩进问题,它并不包含在 CNNnewsSpider 类中:相反,它被定义成了一个独立的函数。您需要按如下方式修改缩进:

class CNNnewsSpider(ModSitemapSpider):
    name = 'cnn'
    allowed_domains = ["cnn.com"]
    sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]

    def parse(self, response):
        items = []
        item = ContentItems()
        item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
        item['resource'] = urlparse(response.url).hostname
        item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
        item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
        item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
        item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
        item['article_text'] = process_article_text(self, response, TEXT)
        item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
        item['link'] = response.url
        items.append(item)
        return items
# Launch test.exe with the argument "localhost", capture its stdout and
# stderr, and print both streams followed by the process exit code.
$exe = "C:\Users\johnn\OneDrive\Documents\visual studio 2015\Projects\test\test\bin\Release\test.exe"

$pinfo = New-Object System.Diagnostics.ProcessStartInfo
$pinfo.FileName = $exe
$pinfo.Arguments = "localhost"
# Redirecting the standard streams requires UseShellExecute to be false.
$pinfo.UseShellExecute = $false
$pinfo.RedirectStandardOutput = $true
$pinfo.RedirectStandardError = $true

$p = New-Object System.Diagnostics.Process
$p.StartInfo = $pinfo
$p.Start() | Out-Null

# Drain both pipes BEFORE waiting for exit. If the child fills a pipe buffer
# while nobody is reading it, the child blocks on write and WaitForExit()
# never returns. stderr is read asynchronously so that a full stderr pipe
# cannot stall the synchronous stdout read either.
$stderrTask = $p.StandardError.ReadToEndAsync()
$stdout = $p.StandardOutput.ReadToEnd()
$p.WaitForExit()
$stderr = $stderrTask.Result

Write-Host "stdout: $stdout"
Write-Host "stderr: $stderr"
# Use a subexpression: with `"exit code: " + $p.ExitCode` Write-Host receives
# "+" as a separate literal argument and prints it verbatim.
Write-Host "exit code: $($p.ExitCode)"