from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from thirdapp.items import ThirdappItem


class MySpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/search?q=news',
        'http://www.example.com/search?q=movies',
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'\?q=news',),
                               restrict_xpaths=('//ul[@class="paginator"]',)),
             callback='parse_item', follow=True),
    )
For example, suppose the restrict_xpaths in the rule returns something like this:
`bWFya2V0PT1nZW5lcmFsfHxzdD09MjB8fHN0cz09eyIxMCI6IlJlZ2lvbiIsIjIwIjoiTWlkZGxlIEVhc3QifQ%3D%3D`
I need to join it to:
https://www.catalogs.ssg.asia/toyota/?fromchanged=true&lang=en&l=
so that the full URL can be requested, which is the concatenation of the two. Desired URL:
https://www.catalogs.ssg.asia/toyota/?fromchanged=true&lang=en&l=bWFya2V0PT1nZW5lcmFsfHxzdD09MjB8fHN0cz09eyIxMCI6IlJlZ2lvbiIsIjIwIjoiTWlkZGxlIEVhc3QifQ%3D%3D
Any suggestions?
Answer 0 (score: 1)
When you use the link_extractor, it returns absolute URLs. Here is the source code:
def _extract_links(self, selector, response_url, response_encoding, base_url):
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        url = to_native_str(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=rel_has_nofollow(el.get('rel')))
        links.append(link)
    return self._deduplicate_if_needed(links)
This is the part that produces the absolute URL:
# to fix relative links after process_value
url = urljoin(response_url, url)
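To see what that urljoin step does, here is a rough illustration with a hypothetical query-only href standing in for whatever the page actually contains:

from urllib.parse import urljoin  # urlparse.urljoin on Python 2

# hypothetical values, only to illustrate the joining step
response_url = 'https://www.catalogs.ssg.asia/toyota/?fromchanged=true&lang=en'
href = '?fromchanged=true&lang=en&l=bWFya2V0PT1nZW5lcmFsfHxzdD09MjB8fHN0cz09eyIxMCI6IlJlZ2lvbiIsIjIwIjoiTWlkZGxlIEVhc3QifQ%3D%3D'

# a query-only href keeps the scheme, host and path of the base and
# swaps in the new query string, giving the full URL from the question
print(urljoin(response_url, href))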
And the base_url comes from the response:
base_url = get_base_url(response)
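get_base_url lives in scrapy.utils.response; a minimal sketch of inspecting it yourself from a spider callback (the callback name and the logging line are just for illustration):

from scrapy.utils.response import get_base_url

def parse_item(self, response):
    # get_base_url honours a <base href="..."> tag if the page has one,
    # otherwise it falls back to the response URL itself
    base_url = get_base_url(response)
    self.log('base url: %s' % base_url)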
So when you use the link_extractor you don't need to worry about relative URLs; Scrapy converts them to absolute ones for you.
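If you want to check this outside of a Rule, you can call the extractor directly, for example from scrapy shell; a minimal sketch, where the restrict_xpaths value is an assumption and response is the response object scrapy shell gives you:

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

extractor = SgmlLinkExtractor(restrict_xpaths=('//ul[@class="paginator"]',))
for link in extractor.extract_links(response):
    # link.url has already been passed through urljoin, so it is absolute
    print(link.url)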