在抓取geographic.org/streetview时,无需使用Scrapy重复错误

时间:2017-07-08 19:54:20

标签: python-2.7 scrapy

我的起始网址为http://www.geographic.org/streetview/usa/index.html

我使用以下代码:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urlparse import urljoin


class StreetViewSpider(CrawlSpider):
    name = "streetview"
    allowed_domains = ["geographic.org"]
    start_urls = ["http://www.geographic.org/streetview/usa/index.html"]
    rules = (
        Rule(LinkExtractor(restrict_xpaths="*.html"), follow=True),
        Rule(LinkExtractor(allow=('*.html',)), callback='parse_item')
    )

    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        item = scrapy.Item()
        sub_urls = response.xpath("descendant-or-self::li/descendant-or-self::*/a/@href").extract()
        item['urls'] = map(lambda x: urljoin(response.url, x), sub_urls)
        return item

我只需要在其网址中a并且在*.html域中拥有www.geographic.org的所有view.php代码的链接和文字。我们会在href提取scrapy crawl streetview时找到抓取工具。

我使用> scrapy crawl streetview Traceback (most recent call last): File "e:\miniconda2\lib\runpy.py", line 174, in _run_module_as_main "__main__", fname, loader, pkg_name) File "e:\miniconda2\lib\runpy.py", line 72, in _run_code exec code in run_globals File "E:\Miniconda2\Scripts\scrapy.exe\__main__.py", line 9, in <module> File "e:\miniconda2\lib\site-packages\scrapy\cmdline.py", line 148, in execute cmd.crawler_process = CrawlerProcess(settings) File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 243, in __init__ super(CrawlerProcess, self).__init__(settings) File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 134, in __init__ self.spider_loader = _get_spider_loader(settings) File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 330, in _get_spider_loader return loader_cls.from_settings(settings.frozencopy()) File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 61, in from_settings return cls(settings) File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 25, in __init__ self._load_all_spiders() File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 47, in _load_all_spiders for module in walk_modules(name): File "e:\miniconda2\lib\site-packages\scrapy\utils\misc.py", line 71, in walk_modules submod = import_module(fullpath) File "e:\miniconda2\lib\importlib\__init__.py", line 37, in import_module __import__(name) File "F:\PyCharmProjects\streetview\streetview\spiders\collector.py", line 7, in <module> class StreetViewSpider(CrawlSpider): File "F:\PyCharmProjects\streetview\streetview\spiders\collector.py", line 13, in StreetViewSpider Rule(LinkExtractor(allow=('*.html',)), callback='parse_item') File "e:\miniconda2\lib\site-packages\scrapy\linkextractors\lxmlhtml.py", line 116, in __init__ canonicalize=canonicalize, deny_extensions=deny_extensions) File "e:\miniconda2\lib\site-packages\scrapy\linkextractors\__init__.py", line 57, in __init__ for x in arg_to_iter(allow)] File "e:\miniconda2\lib\re.py", line 194, in compile return _compile(pattern, flags) File "e:\miniconda2\lib\re.py", line 251, in _compile raise error, v # invalid expression sre_constants.error: nothing to repeat 运行抓取工具,我得到:

public class RescueFragment extends Fragment {

    public RescueFragment() {
        // Required empty public constructor
    }


    @Override
    public View onCreateView(LayoutInflater inflater, ViewGroup container,
                         Bundle savedInstanceState) {
        View view = inflater.inflate(R.layout.rescue_list, container, false);

        final FirebaseDatabase database = FirebaseDatabase.getInstance();
        DatabaseReference ref = database.getReference("rescue");

        final ArrayList<RescueAnimal> rescueList = new ArrayList<>();

        ref.addValueEventListener(new ValueEventListener() {
            @Override
            public void onDataChange(DataSnapshot dataSnapshot) {
                for (DataSnapshot messageSnapshot: dataSnapshot.getChildren()) {
                    String location = (String) messageSnapshot.child("location").getValue();
                    String appearance = (String) messageSnapshot.child("appearance").getValue();
                    String photo = (String) messageSnapshot.child("photo").getValue();
                    String species = (String) messageSnapshot.child("species").getValue();
                    String problem = (String) messageSnapshot.child("problem").getValue();
                    rescueList.add(new RescueAnimal(location, appearance, photo, species, problem));
                }
            }
            @Override
            public void onCancelled(DatabaseError databaseError) {

            }
        });

        RescueAdapter adapter = new RescueAdapter(getActivity(), rescueList);
        ListView listView = view.findViewById(R.id.rescue_list);
        listView.setAdapter(adapter);
        return view;

1 个答案:

答案 0 :(得分:0)

您收到错误是因为您没有在第二条规则中为allow LinkExtractor属性提供有效的正则表达式。此外,第一条规则中restrict_xpaths的{​​{1}}属性也无效。试着遵循这些规则:

LinkExtractor