我正在按照这篇教程从 Instagram 抓取数据:http://www.spataru.at/scraping-instagram-scrapy/ ,但运行爬虫时遇到了下面的错误:
mona@pascal:~/computer_vision/instagram/instagram$ ls
instagram scrapy.cfg
mona@pascal:~/computer_vision/instagram/instagram$ scrapy crawl instagramspider
2017-03-01 15:30:10-0600 [scrapy] INFO: Scrapy 0.14.4 started (bot: instagram)
2017-03-01 15:30:10-0600 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, MemoryUsage, SpiderState
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 132, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 97, in _run_print_help
func(*a, **kw)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 139, in _run_command
cmd.run(args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 43, in run
spider = self.crawler.spiders.create(spname, **opts.spargs)
File "/usr/lib/python2.7/dist-packages/scrapy/command.py", line 34, in crawler
self._crawler.configure()
File "/usr/lib/python2.7/dist-packages/scrapy/crawler.py", line 36, in configure
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 37, in from_crawler
return cls.from_settings(crawler.settings)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 33, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 23, in __init__
for module in walk_modules(name):
File "/usr/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 65, in walk_modules
submod = __import__(fullpath, {}, {}, [''])
File "/home/mona/computer_vision/instagram/instagram/instagram/spiders/spider.py", line 3, in <module>
from scraper_user.items import UserItem
ImportError: No module named scraper_user.items
我完整地按照教程操作,所以不确定缺少了什么,也不知道该如何解决这个问题。
这是文件夹结构:
mona@pascal:~/computer_vision/instagram/instagram$ tree .
.
├── instagram
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── items.py
│ ├── pipelines.py
│ ├── settings.py
│ ├── settings.pyc
│ └── spiders
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── spider.py
│ └── spider.pyc
└── scrapy.cfg
2 directories, 11 files
这是教程中的spider.py代码:
1 import scrapy
2 import json
3 from scraper_user.items import UserItem
4 from scraper_user.items import PostItem
5
6
class InstagramSpider(scrapy.Spider):
    """Scrape one public Instagram profile from its ``?__a=1`` JSON endpoint.

    The spider requests a single profile URL, decodes the JSON body and
    returns one ``UserItem`` whose ``posts`` field holds a list of plain
    dicts built from ``PostItem``.

    Note: the base class is referenced as ``scrapy.Spider``. The traceback
    quoted in this document shows that Scrapy 0.14.4 does not expose that
    attribute, so a newer Scrapy release is required to import this module.
    """

    name = 'instagramspider'
    allowed_domains = ['instagram.com']
    # Declared at class level instead of inside a custom ``__init__`` so the
    # inherited constructor (and any arguments Scrapy passes to it) still runs.
    start_urls = ["https://www.instagram.com/_spataru/?__a=1"]

    def parse(self, response):
        """Turn the profile JSON in ``response`` into a ``UserItem``.

        Returns:
            A populated ``UserItem``, or ``None`` when the body is not valid
            JSON, has no ``"user"`` object, describes a private account, or
            lacks one of the required keys.
        """
        try:
            payload = json.loads(response.body_as_unicode())
        except ValueError:
            # json.loads signals malformed input with ValueError; stop here
            # rather than falling through and indexing an empty dict.
            self.logger.info('%s doesnt exist', response.url)
            return None

        user = payload.get("user") if isinstance(payload, dict) else None
        if not user:
            self.logger.info("Error during parsing %s", response.url)
            return None

        try:
            if user.get("is_private"):
                # Private profiles are skipped without emitting an item.
                return None

            item = UserItem()

            # Required profile data: a missing key aborts this profile.
            item["username"] = user["username"]
            item["follows_count"] = user["follows"]["count"]
            item["followed_by_count"] = user["followed_by"]["count"]
            item["is_verified"] = user["is_verified"]

            # Optional profile data: absent keys are stored as None.
            item["biography"] = user.get("biography")
            item["external_link"] = user.get("external_url")
            item["full_name"] = user.get("full_name")

            # A missing "media" object is treated as "no posts" instead of
            # raising on ``None.get``.
            media = user.get("media") or {}
            item["posts_count"] = media.get("count")

            item["posts"] = []
            for post in media.get("nodes") or []:
                items_post = PostItem()
                items_post["code"] = post["code"]
                items_post["likes"] = post["likes"]["count"]
                # The caption is assumed to be optional; .get avoids
                # discarding the whole profile because of one post.
                items_post["caption"] = post.get("caption")
                items_post["thumbnail"] = post["thumbnail_src"]
                item["posts"].append(dict(items_post))

            return item
        except (KeyError, TypeError, AttributeError):
            # Only structural problems in the payload are handled here;
            # anything else propagates so real bugs stay visible.
            self.logger.info("Error during parsing %s", response.url)
            return None
这是items.py代码:
1 import scrapy
2
class UserItem(scrapy.Item):
    """Profile-level record produced by the spider for one Instagram user."""

    # Values the spider reads with direct indexing from the "user" object.
    username = scrapy.Field()
    follows_count = scrapy.Field()
    followed_by_count = scrapy.Field()
    is_verified = scrapy.Field()

    # Values the spider reads with .get(), so they may end up as None.
    biography = scrapy.Field()
    external_link = scrapy.Field()
    full_name = scrapy.Field()
    posts_count = scrapy.Field()

    # List of per-post dicts appended by the spider.
    posts = scrapy.Field()
13
14
class PostItem(scrapy.Item):
    """Record for a single post; the spider converts it to a dict per post."""

    # Filled by the spider from each entry of the profile's media nodes.
    code = scrapy.Field()
    likes = scrapy.Field()
    thumbnail = scrapy.Field()
    caption = scrapy.Field()

    # Declared, but not populated by the spider shown in this listing.
    hashtags = scrapy.Field()
答案 0 :(得分:0)
我在spider.py中更改了这些行:
3 from instagram.items import UserItem
4 from instagram.items import PostItem
以及items.py中的这些行:
1 import scrapy
2 from scrapy.item import Item, Field
3
class UserItem(Item):
    """Profile-level record produced by the spider for one Instagram user."""

    # Values the spider reads with direct indexing from the "user" object.
    username = Field()
    follows_count = Field()
    followed_by_count = Field()
    is_verified = Field()

    # Values the spider reads with .get(), so they may end up as None.
    biography = Field()
    external_link = Field()
    full_name = Field()
    posts_count = Field()

    # List of per-post dicts appended by the spider.
    posts = Field()
14
15
class PostItem(Item):
    """Record for a single post; the spider converts it to a dict per post."""

    # Filled by the spider from each entry of the profile's media nodes.
    code = Field()
    likes = Field()
    thumbnail = Field()
    caption = Field()

    # Declared, but not populated by the spider shown in this listing.
    hashtags = Field()
~
虽然之前的错误消失了,但又出现了下面这个新错误,我不知道该如何修复:
mona@pascal:~/computer_vision/instagram/instagram$ scrapy crawl instagramspider
2017-03-01 17:24:03-0600 [scrapy] INFO: Scrapy 0.14.4 started (bot: instagram)
2017-03-01 17:24:04-0600 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, MemoryUsage, SpiderState
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 132, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 97, in _run_print_help
func(*a, **kw)
File "/usr/lib/python2.7/dist-packages/scrapy/cmdline.py", line 139, in _run_command
cmd.run(args, opts)
File "/usr/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 43, in run
spider = self.crawler.spiders.create(spname, **opts.spargs)
File "/usr/lib/python2.7/dist-packages/scrapy/command.py", line 34, in crawler
self._crawler.configure()
File "/usr/lib/python2.7/dist-packages/scrapy/crawler.py", line 36, in configure
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 37, in from_crawler
return cls.from_settings(crawler.settings)
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 33, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 23, in __init__
for module in walk_modules(name):
File "/usr/lib/python2.7/dist-packages/scrapy/utils/misc.py", line 65, in walk_modules
submod = __import__(fullpath, {}, {}, [''])
File "/home/mona/computer_vision/instagram/instagram/instagram/spiders/spider.py", line 7, in <module>
class InstagramSpider(scrapy.Spider):
AttributeError: 'module' object has no attribute 'Spider'