I am trying to crawl a list of websites whose exact addresses I don't know. Some of them are https and others http, and the URL may or may not contain www; all I have is their host addresses. Here is an example:
spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.spout.MemorySpout"
    parallelism: 1
    constructorArgs:
      - ["https://digikala.com/"]
This is the result after the crawl:
{
  "responseHeader": {
    "status": 0,
    "QTime": 0
  },
  "response": {
    "numFound": 1,
    "start": 0,
    "docs": [
      {
        "url": "https://digikala.com/",
        "host": "digikala.com",
        "status": "REDIRECTION",
        "metadata._redirTo": ["https://www.digikala.com/"],
        "nextFetchDate": "2019-09-17T06:34:38Z"
      }
    ]
  }
}
I want the crawler to follow the redirection, so that it discovers the redirected URL and all of its sub-links. How can I achieve this? Here is my urlfilters.json:
{
  "com.digitalpebble.stormcrawler.filtering.URLFilters": [
    {
      "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLFilter",
      "name": "BasicURLFilter",
      "params": {
        "maxPathRepetition": 8,
        "maxLength": 8192
      }
    },
    {
      "class": "com.digitalpebble.stormcrawler.filtering.depth.MaxDepthFilter",
      "name": "MaxDepthFilter",
      "params": {
        "maxDepth": 2
      }
    },
    {
      "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLNormalizer",
      "name": "BasicURLNormalizer",
      "params": {
        "removeAnchorPart": true,
        "unmangleQueryString": true,
        "checkValidURI": true,
        "removeHashes": false
      }
    },
    {
      "class": "com.digitalpebble.stormcrawler.filtering.host.HostURLFilter",
      "name": "HostURLFilter",
      "params": {
        "ignoreOutsideHost": true,
        "ignoreOutsideDomain": false
      }
    },
    {
      "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLNormalizer",
      "name": "RegexURLNormalizer",
      "params": {
        "regexNormalizerFile": "default-regex-normalizers.xml"
      }
    },
    {
      "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter",
      "name": "RegexURLFilter",
      "params": {
        "regexFilterFile": "default-regex-filters.txt"
      }
    },
    {
      "class": "com.digitalpebble.stormcrawler.filtering.basic.SelfURLFilter",
      "name": "SelfURLFilter"
    }
  ]
}
Here is my crawler-conf.yaml:
config:
  fetcher.server.delay: 1.0
  fetcher.server.min.delay: 0.0
  fetcher.queue.mode: "byHost"
  fetcher.threads.per.queue: 1
  fetcher.threads.number: 10
  fetcher.max.urls.in.queues: -1
  fetcher.max.queue.size: -1
  fetcher.max.crawl.delay: 30
  fetcher.max.crawl.delay.force: false
  fetcher.server.delay.force: false
  fetcher.metrics.time.bucket.secs: 10
  fetcher.max.throttle.sleep: -1
  partition.url.mode: "byHost"
  metadata.persist:
    - _redirTo
    - error.cause
    - error.source
    - isSitemap
    - isFeed
  metadata.track.path: true
  metadata.track.depth: true
  http.agent.name: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
  http.agent.version: "537.36"
  http.agent.description: "Free open-source web browser developed by Google. Chromium is the name of the open source project behind Google Chrome, released under the BSD license."
  http.agent.url: "http://www.google.com/chrome"
  http.agent.email: "someone@someorganization.com"
  http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
  http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  http.content.limit: -1
  http.store.headers: false
  http.timeout: 10000
  http.content.partial.as.trimmed: false
  http.robots.403.allow: true
  robots.noFollow.strict: false
  http.skip.robots: true
  robots.cache.spec: "maximumSize=10000,expireAfterWrite=6h"
  robots.error.cache.spec: "maximumSize=10000,expireAfterWrite=1h"
  protocols: "http,https,file"
  http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  file.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.file.FileProtocol"
  selenium.implicitlyWait: 0
  selenium.pageLoadTimeout: -1
  selenium.setScriptTimeout: 0
  selenium.instances.num: 1
  selenium.capabilities:
    takesScreenshot: false
    loadImages: false
    javascriptEnabled: true
  selenium.delegated.protocol: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  # no url or parsefilters by default
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"
  # JSoupParserBolt
  jsoup.treat.non.html.as.error: true
  parser.emitOutlinks: false
  parser.emitOutlinks.max.per.page: -1
  track.anchors: true
  detect.mimetype: true
  detect.charset.maxlength: 10000
  # filters URLs in sitemaps based on their modified Date (if any)
  sitemap.filter.hours.since.modified: -1
  # staggered scheduling of sitemaps
  sitemap.schedule.delay: -1
  sitemap.discovery: true
  scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler"
  fetchInterval.default: 1440
  fetchInterval.fetch.error: 120
  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1
  max.fetch.errors: 3
  status.updater.use.cache: true
  status.updater.cache.spec: "maximumSize=10000,expireAfterAccess=1h"
  status.updater.unit.round.date: "SECOND"
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.text.maxlength: -1
  indexer.canonical.name: "canonical"
  indexer.md.mapping:
    - parse.title=title
    - parse.keywords=keywords
    - parse.description=description
Thanks.
Answer 0 (score: 0)
You can allow links within the same top-level domain to be followed simply by setting

    "ignoreOutsideHost": false,
    "ignoreOutsideDomain": true

in the configuration of the HostURLFilter. This will handle variations such as www. Note that the protocol itself plays no role here.
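With the HostURLFilter entry from your urlfilters.json, that would look like:

    {
      "class": "com.digitalpebble.stormcrawler.filtering.host.HostURLFilter",
      "name": "HostURLFilter",
      "params": {
        "ignoreOutsideHost": false,
        "ignoreOutsideDomain": true
      }
    }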
If you want to follow redirections across different TLDs, you would have to write your own URL filter, as the HostURLFilter cannot distinguish between a redirection and an outlink.
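As a starting point, here is a minimal, untested sketch of such a filter. It assumes the StormCrawler 1.x URLFilter interface (filter(...) returns the URL to keep, or null to discard it) and that the _redirTo value you persist via metadata.persist is visible in the source metadata when the filter runs; the package and class name are placeholders.

    package com.example.filtering;

    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.Map;

    import com.digitalpebble.stormcrawler.Metadata;
    import com.digitalpebble.stormcrawler.filtering.URLFilter;
    import com.fasterxml.jackson.databind.JsonNode;

    // Sketch: keeps same-host URLs, but also lets redirect targets through
    // even when they point to a different host or TLD.
    public class RedirectAwareHostFilter implements URLFilter {

        @Override
        public void configure(Map stormConf, JsonNode paramNode) {
            // no parameters in this sketch
        }

        @Override
        public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) {
            // Always allow the recorded redirection target, regardless of host
            // (assumes _redirTo is present in the persisted metadata)
            String redirTo = sourceMetadata.getFirstValue("_redirTo");
            if (urlToFilter.equals(redirTo)) {
                return urlToFilter;
            }
            try {
                URL target = new URL(urlToFilter);
                // Otherwise keep only URLs on the same host as the source page
                if (target.getHost().equalsIgnoreCase(sourceUrl.getHost())) {
                    return urlToFilter;
                }
            } catch (MalformedURLException e) {
                return null; // drop unparseable URLs
            }
            return null; // filtered out
        }
    }

You would then reference this class in urlfilters.json in place of (or alongside) the HostURLFilter entry.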