我的起始网址为http://www.geographic.org/streetview/usa/index.html。
我使用以下代码:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from urlparse import urljoin
class StreetViewSpider(CrawlSpider):
name = "streetview"
allowed_domains = ["geographic.org"]
start_urls = ["http://www.geographic.org/streetview/usa/index.html"]
rules = (
Rule(LinkExtractor(restrict_xpaths="*.html"), follow=True),
Rule(LinkExtractor(allow=('*.html',)), callback='parse_item')
)
def parse_item(self, response):
self.logger.info('Hi, this is an item page! %s', response.url)
item = scrapy.Item()
sub_urls = response.xpath("descendant-or-self::li/descendant-or-self::*/a/@href").extract()
item['urls'] = map(lambda x: urljoin(response.url, x), sub_urls)
return item
我只需要在其网址中a
并且在*.html
域中拥有www.geographic.org
的所有view.php
代码的链接和文字。我们会在href
提取scrapy crawl streetview
时找到抓取工具。
我使用> scrapy crawl streetview
Traceback (most recent call last):
File "e:\miniconda2\lib\runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "e:\miniconda2\lib\runpy.py", line 72, in _run_code
exec code in run_globals
File "E:\Miniconda2\Scripts\scrapy.exe\__main__.py", line 9, in <module>
File "e:\miniconda2\lib\site-packages\scrapy\cmdline.py", line 148, in execute
cmd.crawler_process = CrawlerProcess(settings)
File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 243, in __init__
super(CrawlerProcess, self).__init__(settings)
File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 134, in __init__
self.spider_loader = _get_spider_loader(settings)
File "e:\miniconda2\lib\site-packages\scrapy\crawler.py", line 330, in _get_spider_loader
return loader_cls.from_settings(settings.frozencopy())
File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 61, in from_settings
return cls(settings)
File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 25, in __init__
self._load_all_spiders()
File "e:\miniconda2\lib\site-packages\scrapy\spiderloader.py", line 47, in _load_all_spiders
for module in walk_modules(name):
File "e:\miniconda2\lib\site-packages\scrapy\utils\misc.py", line 71, in walk_modules
submod = import_module(fullpath)
File "e:\miniconda2\lib\importlib\__init__.py", line 37, in import_module
__import__(name)
File "F:\PyCharmProjects\streetview\streetview\spiders\collector.py", line 7, in <module>
class StreetViewSpider(CrawlSpider):
File "F:\PyCharmProjects\streetview\streetview\spiders\collector.py", line 13, in StreetViewSpider
Rule(LinkExtractor(allow=('*.html',)), callback='parse_item')
File "e:\miniconda2\lib\site-packages\scrapy\linkextractors\lxmlhtml.py", line 116, in __init__
canonicalize=canonicalize, deny_extensions=deny_extensions)
File "e:\miniconda2\lib\site-packages\scrapy\linkextractors\__init__.py", line 57, in __init__
for x in arg_to_iter(allow)]
File "e:\miniconda2\lib\re.py", line 194, in compile
return _compile(pattern, flags)
File "e:\miniconda2\lib\re.py", line 251, in _compile
raise error, v # invalid expression
sre_constants.error: nothing to repeat
运行抓取工具,我得到:
public class RescueFragment extends Fragment {
public RescueFragment() {
// Required empty public constructor
}
@Override
public View onCreateView(LayoutInflater inflater, ViewGroup container,
Bundle savedInstanceState) {
View view = inflater.inflate(R.layout.rescue_list, container, false);
final FirebaseDatabase database = FirebaseDatabase.getInstance();
DatabaseReference ref = database.getReference("rescue");
final ArrayList<RescueAnimal> rescueList = new ArrayList<>();
ref.addValueEventListener(new ValueEventListener() {
@Override
public void onDataChange(DataSnapshot dataSnapshot) {
for (DataSnapshot messageSnapshot: dataSnapshot.getChildren()) {
String location = (String) messageSnapshot.child("location").getValue();
String appearance = (String) messageSnapshot.child("appearance").getValue();
String photo = (String) messageSnapshot.child("photo").getValue();
String species = (String) messageSnapshot.child("species").getValue();
String problem = (String) messageSnapshot.child("problem").getValue();
rescueList.add(new RescueAnimal(location, appearance, photo, species, problem));
}
}
@Override
public void onCancelled(DatabaseError databaseError) {
}
});
RescueAdapter adapter = new RescueAdapter(getActivity(), rescueList);
ListView listView = view.findViewById(R.id.rescue_list);
listView.setAdapter(adapter);
return view;
答案 0 :(得分:0)
您收到错误是因为您没有在第二条规则中为allow
LinkExtractor
属性提供有效的正则表达式。此外,第一条规则中restrict_xpaths
的{{1}}属性也无效。试着遵循这些规则:
LinkExtractor