Goroutine not running when called via recursion

Date: 2018-06-11 03:30:57

Tags: go

I ran into trouble with the Web Crawler exercise in A Tour of Go. Here is my solution so far:

func GatherUrls(url string, fetcher Fetcher) []string {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q\n", url, body)
    }
    return urls
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    fmt.Printf("crawling %q...\n", url)
    if depth <= 0 {
        return
    }
    urls := GatherUrls(url, fetcher)
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        if _, exists := cache[u]; !exists {
            fmt.Printf("about to crawl %q\n", u)
            go Crawl(u, depth - 1, fetcher)
        } else {
            cache[u] = true
        }
    }
}

func main() {
    cache = make(map[string]bool)
    Crawl("https://golang.org/", 4, fetcher)
}

When I run this code, Crawl() is never called in the recursive step (I know this because fmt.Printf("crawling %q...\n", url) is only ever printed once).

Here is the log:

crawling "https://golang.org/"...
found: https://golang.org/ "The Go Programming Language"
urls: [https://golang.org/pkg/ https://golang.org/cmd/]
currentUrl: https://golang.org/pkg/
about to crawl "https://golang.org/pkg/"
currentUrl: https://golang.org/cmd/
about to crawl "https://golang.org/cmd/"

What am I doing wrong? I suspect that spawning a goroutine for the recursion is the wrong approach. Please advise.

Please note that I want to use as few libraries as possible. I have seen some answers that use the WaitGroup package; I would rather not use it.

Note: the full code, including the lesson boilerplate, is below:

package main

import (
    "fmt"
)

var cache map[string]bool

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

func GatherUrls(url string, fetcher Fetcher) []string {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q\n", url, body)
    }
    return urls
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    fmt.Printf("crawling %q...\n", url)
    if depth <= 0 {
        return
    }
    urls := GatherUrls(url, fetcher)
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        if _, exists := cache[u]; !exists {
            fmt.Printf("about to crawl %q\n", u)
            go Crawl(u, depth - 1, fetcher)
        } else {
            cache[u] = true
        }
    }
}

func main() {
    cache = make(map[string]bool)
    Crawl("https://golang.org/", 4, fetcher)
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}

3 Answers:

Answer 0 (score: 0)

The main() function exits before the goroutines execute. Fix it with a wait group.

There is a data race on cache. Protect it with a mutex, and always set cache[u] = true for a URL you are about to fetch.

var wg sync.WaitGroup
var mu sync.Mutex
var fetched = map[string]bool{}

func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        mu.Lock()
        f := fetched[u]
        fetched[u] = true
        mu.Unlock()
        if !f {
            wg.Add(1)
            go func(u string) {
                defer wg.Done()
                Crawl(u, depth-1, fetcher)
            }(u)
        }
    }
    return
}

playground example
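
For completeness, main then only needs to call Crawl and wait on the group before exiting. A minimal sketch of such a main, assuming the Crawl above and the fetcher variable from the exercise boilerplate:

func main() {
    Crawl("https://golang.org/", 4, fetcher)
    wg.Wait() // block until every goroutine spawned by Crawl has called Done
}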

Wait groups are the idiomatic way to wait for goroutines to finish. If you cannot use sync.WaitGroup for some reason, then reimplement the type using a counter, a mutex, and a channel:

type WaitGroup struct {
    mu   sync.Mutex
    n    int
    done chan struct{}
}

func (wg *WaitGroup) Add(i int) {
    wg.mu.Lock()
    defer wg.mu.Unlock()
    if wg.done == nil {
        wg.done = make(chan struct{})
    }
    wg.n += i
    if wg.n < 0 {
        panic("negative count")
    }
    if wg.n == 0 {
        close(wg.done)
        wg.done = nil
    }
}

func (wg *WaitGroup) Done() {
    wg.Add(-1)
}

func (wg *WaitGroup) Wait() {
    wg.mu.Lock()
    done := wg.done
    wg.mu.Unlock()
    if done != nil {
        <-done
    }
}

playground example
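
This replacement type is used exactly like sync.WaitGroup; only the declaration in the earlier snippet changes, for example:

var wg WaitGroup // instead of: var wg sync.WaitGroup; the Add, Done, and Wait calls stay the same

Internally, Wait blocks by receiving from the done channel, which Add closes once the counter reaches zero, so every waiter is released at the same time.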

Answer 1 (score: 0)

As you can see in this exercise: https://tour.golang.org/concurrency/10, we need to do the following:

  • Fetch URLs in parallel.
  • Don't fetch the same URL twice.
  • Cache the URLs that have already been fetched in a map, but a map on its own is not safe for concurrent use!

So we can take the following steps to accomplish those tasks:

Create a struct to store the fetch result:

type Result struct {
    body string
    urls []string
    err  error
}

Create a struct that stores the already-fetched URLs in a map; we need to use sync.Mutex here, which is not introduced in 'A Tour of Go':

type Cache struct {
    store map[string]bool
    mux   sync.Mutex
}
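
(Not part of this answer's code, just a sketch: a small helper method on Cache would keep the locking in one place. The Crawl below locks inline instead.)

// visit marks url as fetched and reports whether it had already been fetched.
func (c *Cache) visit(url string) bool {
    c.mux.Lock()
    defer c.mux.Unlock()
    seen := c.store[url]
    c.store[url] = true
    return seen
}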

Fetch URLs and page bodies in parallel: add each URL to the cache while it is being fetched, but first we need to guard reads and writes with the mutex so that concurrent access is safe. So we can modify the Crawl function like this:

func Crawl(url string, depth int, fetcher Fetcher) {
    if depth <= 0 {
        return
    }

    ch := make(chan Result)

    go func(url string, res chan Result) {
        body, urls, err := fetcher.Fetch(url)

        if err != nil {
            ch <- Result{body, urls, err}
            return
        }

        var furls []string
        cache.mux.Lock()
        for _, u := range urls {
            if _, exists := cache.store[u]; !exists {
                furls = append(furls, u)
            }
            cache.store[u] = true
        }
        cache.mux.Unlock()

        ch <- Result{body: body, urls: furls, err: err}

    }(url, ch)

    res := <-ch

    if res.err != nil {
        fmt.Println(res.err)
        return
    }

    fmt.Printf("found: %s %q\n", url, res.body)

    for _, u := range res.urls {
        Crawl(u, depth-1, fetcher)
    }
}
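
The package-level cache used above still has to be declared and initialized before Crawl runs. A sketch of that wiring, with names assumed to match the snippets above (the answer's full playground code may differ):

var cache = Cache{store: make(map[string]bool)}

func main() {
    Crawl("https://golang.org/", 4, fetcher)
}

Because this Crawl blocks on the channel receive and then recurses synchronously, main does not need a wait group here.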

You can see the full code and run it on the playground: https://play.golang.org/p/iY9uBXchx3w

Hope this helps.

Answer 2 (score: -1)

Because the main function exits.

You need to add a sync.WaitGroup so that the main function waits until all of the goroutines are done:

package main

import (
    "fmt"
    "sync"
)

var cache map[string]bool

var wg sync.WaitGroup

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

func GatherUrls(url string, fetcher Fetcher, Urls chan []string) {
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Printf("found: %s %q\n", url, body)
    }
    Urls <- urls
    wg.Done()
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // get all urls for depth
    // check if url has been crawled
    //  Y: noop
    //  N: crawl url
    // when depth is 0, stop
    fmt.Printf("crawling %q... %d\n", url, depth)
    if depth <= 0 {
        return
    }
    uc := make(chan []string)
    wg.Add(1)
    go GatherUrls(url, fetcher, uc)
    urls, _ := <-uc
    fmt.Println("urls:", urls)
    for _, u := range urls {
        fmt.Println("currentUrl:", u)
        if _, exists := cache[u]; !exists {
            fmt.Printf("about to crawl %q\n", u)
            wg.Add(1)
            go Crawl(u, depth-1, fetcher)
        } else {
            cache[u] = true
        }
    }
    wg.Done()
}

func main() {
    cache = make(map[string]bool)
    wg.Add(1)
    go Crawl("https://golang.org/", 4, fetcher)
    wg.Wait()
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "https://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "https://golang.org/pkg/",
            "https://golang.org/cmd/",
        },
    },
    "https://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "https://golang.org/",
            "https://golang.org/cmd/",
            "https://golang.org/pkg/fmt/",
            "https://golang.org/pkg/os/",
        },
    },
    "https://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
    "https://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "https://golang.org/",
            "https://golang.org/pkg/",
        },
    },
}