我正在尝试使用scrapy 来创建我的第一个蜘蛛scraper 来检索Instagram 帖子(标题、网址图片和视频)。我收到一条错误消息:TypeError: Request url must be str or unicode, got NoneType 但是在调试时,我使用了正确的 url 格式并且它确实通过了 get_url 函数。我不确定是什么导致了这个问题。
import scrapy
from urllib.parse import urlencode
import json
from datetime import datetime
API = 'api-key'
import logging
user_accounts = ["cream34246", "styledgood"]
def get_url(url):
payload = {'api_key': API, 'proxy': 'residential', 'timeout': '20000', 'url': url}
proxy_url = 'https://api.webscraping.ai/html?' + urlencode(payload)
return
class InstagramSpider(scrapy.Spider):
name = 'instagram'
allowed_domains = ['api.scraperapi.com']
custom_settings = {'CONCURRENT_REQUESTS_PER_DOMAIN': 5}
def start_requests(self):
for username in user_accounts:
url = f'https://www.instagram.com/{username}/?hl=en'
yield scrapy.Request(get_url(url), callback=self.parse)
def parse(self, response):
x = response.xpath("//script[starts-with(.,'window._sharedData')]/text()").extract_first()
json_string = "{" + x.strip().split('= {')[1][:-1]
data = json.loads(json_string)
# all that we have to do here is to parse the JSON we have
user_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
user = data['entry_data']['ProfilePage'][0]['graphql']['user']['full_name']
next_page_bool = \
data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info'][
'has_next_page']
edges = data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']
for i in edges:
url = 'https://www.instagram.com/p/' + i['node']['shortcode']
video = i['node']['is_video']
date_posted_timestamp = i['node']['taken_at_timestamp']
date_posted_human = datetime.fromtimestamp(date_posted_timestamp).strftime("%d/%m/%Y %H:%M:%S")
like_count = i['node']['edge_media_preview_like']['count'] if "edge_media_preview_like" in i['node'].keys() else ''
comment_count = i['node']['edge_media_to_comment']['count'] if 'edge_media_to_comment' in i[
'node'].keys() else ''
captions = ""
if i['node']['edge_media_to_caption']:
for i2 in i['node']['edge_media_to_caption']['edges']:
captions += i2['node']['text'] + "\n"
if video:
image_url = i['node']['display_url']
else:
image_url = i['node']['thumbnail_resources'][-1]['src']
item = {'username':user,'postURL': url, 'isVideo': video, 'date_posted': date_posted_human,
'timestamp': date_posted_timestamp, 'likeCount': like_count, 'commentCount': comment_count, 'image_url': image_url,
'captions': captions[:-1]}
if video:
yield scrapy.Request(get_url(url), callback=self.get_video, meta={'item': item})
else:
item['videoURL'] = ''
yield item
if next_page_bool:
cursor = \
data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['page_info'][
'end_cursor']
di = {'id': user_id, 'first': 12, 'after': cursor}
print(di)
params = {'query_hash':'e769aa130647d2354c40ea6a439bfc08' , 'variables': json.dumps(di)}
url = 'https://www.instagram.com/graphql/query/?' + urlencode(params)
yield scrapy.Request(get_url(url), callback=self.parse_pages, meta={'pages_di': di})
日志和错误:
2021-04-24 10:45:27 [scrapy.core.engine] INFO: Spider opened
2021-04-24 10:45:27 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2021-04-24 10:45:27 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-04-24 10:45:27 [scrapy.core.engine] ERROR: Error while obtaining start requests
Traceback (most recent call last):
File "c:\users\name\anaconda3\envs\envi\lib\site-packages\scrapy\core\engine.py", line 129, in _next_request
request = next(slot.start_requests)
File "C:\Users\Name\Desktop\instascraper\instascraper\spiders\temp.py", line 35, in start_requests
yield scrapy.Request(get_url(url), callback=self.parse)
File "c:\users\name\anaconda3\envs\envi\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "c:\users\name\anaconda3\envs\envi\lib\site-packages\scrapy\http\request\__init__.py", line 63, in _set_url
raise TypeError(f'Request url must be str or unicode, got {type(url).__name__}')
TypeError: Request url must be str or unicode, got NoneType
2021-04-24 10:45:27 [scrapy.core.engine] INFO: Closing spider (finished)
答案 0 :(得分:-1)
看起来您生成网址的函数没有返回任何内容。
def get_url(url):
payload = {'api_key': API, 'proxy': 'residential', 'timeout': '20000', 'url': url}
proxy_url = 'https://api.webscraping.ai/html?' + urlencode(payload)
return
它每次都返回 None
,而不是 proxy_url
。您需要通过 return proxy_url
来修复它。