Question

运行代码时我遇到了这个问题：我已经为这次抓取定义了实时请求，但它仍然无法正常工作。有谁知道在 Python 中该如何处理这个问题？在这种情况下，站点地图（sitemap）有多重要？提前致谢。

stuff:[{"name":"me","position":"here"},
       {"name":"me","position":"here"},
       {"name":"me","position":"here"}]

这是我的文字结果：

import logging
import re
from urllib.parse import urljoin, urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Request
from scrapy.spiders import SitemapSpider
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.shell import inspect_response
from sqlalchemy.orm import sessionmaker
from content.spiders.templates.sitemap_template import ModSitemapSpider
from content.models import db_connect, create_db_table, Articles
from content.items import ContentItems
from content.item_functions import (process_item,
                                process_singular_item,
                                process_date_item,
                                process_array_item,
                                process_plural_texts,
                                process_external_links,
                                process_article_text)

# Selector configuration consumed by the ``process_*`` helpers in ``parse``.
# Each list holds one or more XPath expression strings.

# Article headline.
HEADER_XPATH = [
    '//h1[@class="article-title"]//text()',
]

# Byline text; two alternative class names are listed.
AUTHOR_XPATH = [
    '//span[@class="cnnbyline"]//text()',
    '//span[@class="byline"]//text()',
]

# Publication date stamp.
PUBDATE_XPATH = [
    '//span[@class="cnnDateStamp"]//text()',
]

# No expression is configured for tags or category: each list holds a
# single empty string.
TAGS_XPATH = ['']
CATEGORY_XPATH = ['']

# Article body text; two alternative containers are listed.
TEXT = [
    '//div[@id="storytext"]//text()',
    '//div[@id="storycontent"]//p//text()',
]

# Link targets (``href`` attributes) found under in-story headings.
INTERLINKS = [
    '//span[@class="inStoryHeading"]//a/@href',
]

# Format handed to ``process_date_item`` together with PUBDATE_XPATH.
# NOTE(review): presumably a strptime-style pattern -- confirm in the helper.
DATE_FORMAT_STRING = '%Y-%m-%d'


class CNNnewsSpider(ModSitemapSpider):
    """Spider settings for CNN, built on the project's ``ModSitemapSpider``.

    Only class-level configuration is declared here; the class body defines
    no methods of its own.
    """

    # Name the spider is identified by.
    name = 'cnn'

    # Domains listed for this spider.
    allowed_domains = [
        "cnn.com",
    ]

    # Sitemap location(s) listed for this spider.
    sitemap_urls = [
        "http://edition.cnn.com/sitemaps/sitemap-news.xml",
    ]


def parse(self, response):
    """Build one ``ContentItems`` record from an article response.

    NOTE(review): this ``def`` starts at column 0, i.e. at module level and
    outside the ``CNNnewsSpider`` class body that precedes it, so it is a
    plain function rather than a method of that class.

    Args:
        self: Forwarded unchanged as the first argument to every
            ``process_*`` helper.
        response: Response object; its ``url`` attribute is read directly
            and the object itself is handed to the ``process_*`` helpers.

    Returns:
        A one-element list containing the populated ``ContentItems``.
    """
    article = ContentItems()

    # Fields are filled in a fixed order; the helpers are opaque from here,
    # so their call order is kept exactly as written.
    article['title'] = process_singular_item(
        self, response, HEADER_XPATH, single=True)
    article['resource'] = urlparse(response.url).hostname
    article['author'] = process_array_item(
        self, response, AUTHOR_XPATH, single=False)
    article['pubdate'] = process_date_item(
        self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
    article['tags'] = process_plural_texts(
        self, response, TAGS_XPATH, single=False)
    article['category'] = process_array_item(
        self, response, CATEGORY_XPATH, single=False)
    article['article_text'] = process_article_text(self, response, TEXT)
    article['external_links'] = process_external_links(
        self, response, INTERLINKS, single=False)
    article['link'] = response.url

    return [article]

Answer 1

由于您的类 CNNnewsSpider 没有覆盖 scrapy.BaseSpider 的 parse() 方法，因此会抛出 NotImplementedError 异常。虽然您在粘贴的代码中定义了 parse() 方法，但由于缩进问题，它并没有包含在 CNNnewsSpider 类中；相反，它被定义成了一个独立的函数。您需要按如下方式修改缩进：

    class CNNnewsSpider(ModSitemapSpider):
        name = 'cnn'
        allowed_domains = ["cnn.com"]
        sitemap_urls = ["http://edition.cnn.com/sitemaps/sitemap-news.xml"]

        def parse(self, response):
            items = []
            item = ContentItems()
            item['title'] = process_singular_item(self, response, HEADER_XPATH, single=True)
            item['resource'] = urlparse(response.url).hostname
            item['author'] = process_array_item(self, response, AUTHOR_XPATH, single=False)
            item['pubdate'] = process_date_item(self, response, PUBDATE_XPATH, DATE_FORMAT_STRING, single=True)
            item['tags'] = process_plural_texts(self, response, TAGS_XPATH, single=False)
            item['category'] = process_array_item(self, response, CATEGORY_XPATH, single=False)
            item['article_text'] = process_article_text(self, response, TEXT)
            item['external_links'] = process_external_links(self, response, INTERLINKS, single=False)
            item['link'] = response.url
            items.append(item)
            return items

   # Run test.exe with the argument "localhost", capture its stdout and
   # stderr, and print both streams along with the process exit code.
   $exe = "C:\Users\johnn\OneDrive\Documents\visual studio 2015\Projects\test\test\bin\Release\test.exe"

   $pinfo = New-Object System.Diagnostics.ProcessStartInfo
   $pinfo.FileName = $exe
   $pinfo.RedirectStandardError = $true
   $pinfo.RedirectStandardOutput = $true
   # Redirecting the standard streams requires UseShellExecute = $false.
   $pinfo.UseShellExecute = $false
   $pinfo.Arguments = "localhost"
   $p = New-Object System.Diagnostics.Process
   $p.StartInfo = $pinfo
   $p.Start() | Out-Null
   # Drain both pipes BEFORE waiting for exit. Waiting first can deadlock:
   # the child blocks once a redirected pipe buffer fills while this script
   # is blocked in WaitForExit(). stdout is read asynchronously so that a
   # child writing heavily to one stream cannot stall the read of the other.
   $stdoutTask = $p.StandardOutput.ReadToEndAsync()
   $stderr = $p.StandardError.ReadToEnd()
   $p.WaitForExit()
   $stdout = $stdoutTask.Result
   Write-Host "stdout: $stdout"
   Write-Host "stderr: $stderr"
   # Use a subexpression: in command (argument) mode `"text" + $x` is passed
   # to Write-Host as three separate arguments and prints a literal "+".
   Write-Host "exit code: $($p.ExitCode)"

在Python中抛出NotImplementedError

1 个答案: