如何抓取重定向的URL?

时间:2019-09-16 07:22:47

标签: web-crawler stormcrawler

我正在尝试抓取一批网站,但我不知道它们的确切地址。其中有些是 https,另一些是 http;URL 中可能包含 www,也可能没有——我手头只有它们的主机地址。下面是一个例子:

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.spout.MemorySpout"
    parallelism: 1
    constructorArgs:
      - ["https://digikala.com/"]

这是爬网后的结果:

{
  "responseHeader":{
    "status":0,
    "QTime":0},
  "response":{"numFound":1,"start":0,"docs":[
      {
        "url":"https://digikala.com/",
        "host":"digikala.com",
        "status":"REDIRECTION",
        "metadata._redirTo":["https://www.digikala.com/"],
        "nextFetchDate":"2019-09-17T06:34:38Z"}]
  }}

我希望当网站发生重定向时,爬虫能够跟踪重定向后的 URL,并抓取其所有子链接。我该如何实现?以下是我的 urlfilters.json:

{
    "com.digitalpebble.stormcrawler.filtering.URLFilters": [
        {
            "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLFilter",
            "name": "BasicURLFilter",
            "params": {
                "maxPathRepetition": 8,
                "maxLength": 8192
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.depth.MaxDepthFilter",
            "name": "MaxDepthFilter",
            "params": {
                "maxDepth": 2
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.basic.BasicURLNormalizer",
            "name": "BasicURLNormalizer",
            "params": {
                "removeAnchorPart": true,
                "unmangleQueryString": true,
                "checkValidURI": true,
                "removeHashes": false
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.host.HostURLFilter",
            "name": "HostURLFilter",
            "params": {
                "ignoreOutsideHost": true,
                "ignoreOutsideDomain": false
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLNormalizer",
            "name": "RegexURLNormalizer",
            "params": {
                "regexNormalizerFile": "default-regex-normalizers.xml"
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.regex.RegexURLFilter",
            "name": "RegexURLFilter",
            "params": {
                "regexFilterFile": "default-regex-filters.txt"
            }
        },
        {
            "class": "com.digitalpebble.stormcrawler.filtering.basic.SelfURLFilter",
            "name": "SelfURLFilter"
        }
    ]
}

这是我的 crawler-conf.yaml

config: 
  fetcher.server.delay: 1.0
  fetcher.server.min.delay: 0.0
  fetcher.queue.mode: "byHost"
  fetcher.threads.per.queue: 1
  fetcher.threads.number: 10
  fetcher.max.urls.in.queues: -1
  fetcher.max.queue.size: -1
  fetcher.max.crawl.delay: 30

  fetcher.max.crawl.delay.force: false

  fetcher.server.delay.force: false

  fetcher.metrics.time.bucket.secs: 10


  fetcher.max.throttle.sleep: -1

  partition.url.mode: "byHost"


  metadata.persist:
   - _redirTo
   - error.cause
   - error.source
   - isSitemap
   - isFeed

  metadata.track.path: true
  metadata.track.depth: true

  http.agent.name: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
  http.agent.version: "537.36"
  http.agent.description: "Free open-source web browser developed by Google. Chromium is the name of the open source project behind Google Chrome, released under the BSD license."
  http.agent.url: "http://www.google.com/chrome"
  http.agent.email: "someone@someorganization.com"

  http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
  http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
  http.content.limit: -1
  http.store.headers: false
  http.timeout: 10000


  http.content.partial.as.trimmed: false


  http.robots.403.allow: true

  robots.noFollow.strict: false
  http.skip.robots: true
  robots.cache.spec: "maximumSize=10000,expireAfterWrite=6h"
  robots.error.cache.spec: "maximumSize=10000,expireAfterWrite=1h"

  protocols: "http,https,file"
  http.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  https.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"
  file.protocol.implementation: "com.digitalpebble.stormcrawler.protocol.file.FileProtocol"

  selenium.implicitlyWait: 0
  selenium.pageLoadTimeout: -1
  selenium.setScriptTimeout: 0
  selenium.instances.num: 1
  selenium.capabilities:
    takesScreenshot: false
    loadImages: false
    javascriptEnabled: true

  selenium.delegated.protocol: "com.digitalpebble.stormcrawler.protocol.httpclient.HttpProtocol"

  # no url or parsefilters by default
  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"
  # JSoupParserBolt
  jsoup.treat.non.html.as.error: true
  parser.emitOutlinks: false
  parser.emitOutlinks.max.per.page: -1
  track.anchors: true
  detect.mimetype: true
  detect.charset.maxlength: 10000

  # filters URLs in sitemaps based on their modified Date (if any)
  sitemap.filter.hours.since.modified: -1

  # staggered scheduling of sitemaps
  sitemap.schedule.delay: -1


  sitemap.discovery: true

  scheduler.class: "com.digitalpebble.stormcrawler.persistence.DefaultScheduler"


  fetchInterval.default: 1440
  fetchInterval.fetch.error: 120

  # never revisit a page with an error (or set a value in minutes)
  fetchInterval.error: -1


  max.fetch.errors: 3

  status.updater.use.cache: true
  status.updater.cache.spec: "maximumSize=10000,expireAfterAccess=1h"

  status.updater.unit.round.date: "SECOND"

  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.text.maxlength: -1
  indexer.canonical.name: "canonical"
  indexer.md.mapping:
  - parse.title=title
  - parse.keywords=keywords
  - parse.description=description

谢谢。

1 个答案:

答案 0 :(得分:0)

您可以通过简单地进行如下设置,来访问同一顶级域中的链接:
            "ignoreOutsideHost": false,
            "ignoreOutsideDomain": true

(即在 HostURLFilter 的配置中)。这样就能处理诸如 www. 前缀之类的主机名变体。请注意,协议(http/https)本身并不影响此判断。

如果要跟踪跨不同顶级域(TLD)的重定向,则必须编写自己的 URL 过滤器,因为 HostURLFilter 无法区分重定向链接和普通的出站链接(outlinks)。