I am writing a Go project: a simple web crawler that collects the links on a website. I wanted to try out concurrency features such as goroutines and channels. However, when I run it, it does not go through; nothing shows up, as if nothing had happened at all. I have no idea what went wrong. Could someone point it out for me?
If I remove the channel logic it works and displays all the crawled links, but what I want is for it to send the links to a buffered channel and then display them before ending the program. The program should be able to crawl to whatever depth is specified in the program; right now the depth is 1.
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "time"

    "golang.org/x/net/html"
)

// Link type to be sent over channel
type Link struct {
    URL string
    ok  bool
}

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }

    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    ch := make(chan *Link, 5)
    crawl(url, 1, ch)

    visited := make(map[string]bool)
    time.Sleep(2 * time.Second)

    for link := range ch {
        if _, ok := visited[link.URL]; !ok {
            visited[link.URL] = true
        }
    }

    close(ch)

    for l := range visited {
        fmt.Println(l)
    }
}

func crawl(url string, n int, ch chan *Link) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()
    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                go crawl(result, nextN, ch)
                ch <- &Link{result, true}
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
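For what it's worth, the most likely reason this first version appears to do nothing is that ch is never closed: crawl runs synchronously before the range loop, so once the buffer of 5 fills up the send blocks with no reader running yet, and even when it doesn't, the for link := range ch loop can never end because close(ch) only comes after it. A range over a channel terminates only when the sender side closes the channel. A minimal, self-contained sketch of that producer/consumer pattern (illustrative only, not taken from the original crawler):

package main

import "fmt"

// produce owns the channel: it sends its values from a goroutine and closes
// the channel when done, which is what lets the consumer's range loop end.
func produce() <-chan string {
    ch := make(chan string, 5)
    go func() {
        defer close(ch) // without this close, the range in main would block forever
        for _, s := range []string{"a", "b", "c"} {
            ch <- s
        }
    }()
    return ch
}

func main() {
    for s := range produce() {
        fmt.Println(s)
    }
}

The updated version below follows the same idea by closing the channel inside newCrawl once crawl returns.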
Update:
After some tinkering I figured out how to change the code to remove the data race, but I still do not know how to avoid crawling previously visited URLs (maybe I should open another question?):
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }

    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    for link := range newCrawl(url, 1) {
        fmt.Println(link)
    }
}

func newCrawl(url string, num int) chan string {
    ch := make(chan string, 20)

    go func() {
        crawl(url, 1, ch)
        close(ch)
    }()

    return ch
}

func crawl(url string, n int, ch chan string) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()
    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                done := make(chan struct{})
                go func() {
                    crawl(result, nextN, ch)
                    close(done)
                }()
                <-done
                ch <- result
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
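On the remaining question of how to avoid crawling previously visited URLs: a common approach (not part of the original post) is to keep the visited set behind a mutex and check-and-mark each URL before crawling it. A minimal sketch, where visitedSet and its Seen method are made-up names used only for illustration:

package main

import (
    "fmt"
    "sync"
)

// visitedSet is an illustrative helper: a URL set that is safe for concurrent use.
type visitedSet struct {
    mu   sync.Mutex
    seen map[string]bool
}

func newVisitedSet() *visitedSet {
    return &visitedSet{seen: make(map[string]bool)}
}

// Seen records url and reports whether it had already been recorded,
// so only the first caller for a given url gets false.
func (v *visitedSet) Seen(url string) bool {
    v.mu.Lock()
    defer v.mu.Unlock()
    if v.seen[url] {
        return true
    }
    v.seen[url] = true
    return false
}

func main() {
    visited := newVisitedSet()
    for _, u := range []string{"http://a", "http://b", "http://a"} {
        if visited.Seen(u) {
            continue // skip URLs that were already crawled
        }
        fmt.Println("crawling", u)
    }
}

In the updated crawler, each result could then be guarded with a check like if visited.Seen(result) { continue } before it is recursed into or sent on the channel.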
Answer 0 (score: 0)
I don't think recursively spawning goroutines is a good idea. It can simply get out of control. I would prefer a flat model like this:
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "sync"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }

    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    wg := NewWorkGroup(1)
    wg.Crawl(url)

    for k, v := range wg.urlMap {
        fmt.Printf("%s: %d\n", k, v)
    }
}

// Link represents a single link and its deph (depth)
type Link struct {
    url  string
    deph uint32
}

// WorkGroup wraps everything into one group
type WorkGroup struct {
    *sync.WaitGroup
    maxDeph uint32
    numW    int
    pool    chan *Worker
    linkQ   chan Link
    urlMap  map[string]uint32
}

type Worker struct {
    result chan []Link
}

func newWorker() *Worker {
    return &Worker{
        result: make(chan []Link),
    }
}

func NewWorkGroup(maxDeph uint32) *WorkGroup {
    numW := int(maxDeph)
    if maxDeph > 10 {
        numW = 10
    }
    return &WorkGroup{
        WaitGroup: new(sync.WaitGroup),
        maxDeph:   maxDeph,
        numW:      numW,
        pool:      make(chan *Worker, numW),
        linkQ:     make(chan Link, 100),
        urlMap:    make(map[string]uint32),
    }
}

// dispatch workers -> filter visited -> send not-visited links to the channel
// pool + dispatcher keep order so workers go level by level
func (wg *WorkGroup) spawnDispatcher() {
    wg.Add(1)
    go func() {
        defer wg.Done()
        defer close(wg.linkQ)
        for w := range wg.pool {
            links := <-w.result
            for i := 0; i < len(links); i++ {
                if _, ok := wg.urlMap[links[i].url]; !ok {
                    wg.urlMap[links[i].url] = links[i].deph

                    // don't process links that reach max deph
                    if links[i].deph < wg.maxDeph {
                        select {
                        case wg.linkQ <- links[i]:
                            // goes well
                            continue
                        default:
                            // channel is too short, protecting against a possible deadlock
                        }
                        // drop the rest of the links
                        break
                    }
                }
            }

            // empty link channel + nothing in process = end
            if len(wg.linkQ) == 0 && len(wg.pool) == 0 {
                return
            }
        }
    }()
}

// Crawl initializes the goroutines and crawls the url
func (wg *WorkGroup) Crawl(url string) {
    defer close(wg.pool)

    wg.spawnCrawlers()
    wg.spawnDispatcher()

    wg.linkQ <- Link{url: url, deph: 0}

    wg.Wait()
}

func (wg *WorkGroup) spawnCrawlers() {
    // custom number of workers, derived from maxDeph
    for i := 0; i < wg.numW; i++ {
        wg.newCrawler()
    }
}

func (wg *WorkGroup) newCrawler() {
    wg.Add(1)
    go func(w *Worker) {
        defer wg.Done()
        defer close(w.result)

        for link := range wg.linkQ {
            wg.pool <- w
            w.result <- getExternalUrls(link)
        }
    }(newWorker())
}

// getExternalUrls is the default crawl function, slightly modified
func getExternalUrls(source Link) []Link {
    resp, err := http.Get(source.url)
    if err != nil {
        log.Printf("Can not reach the site. Error = %v\n", err)
        return nil
    }
    b := resp.Body
    defer b.Close()
    z := html.NewTokenizer(b)
    links := []Link{}
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return links
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            url, ok := getHrefTag(current)
            if ok && strings.HasPrefix(url, "http") {
                links = append(links, Link{url: url, deph: source.deph + 1})
            }
        }
    }
    return links
}

// getHrefTag is the default helper function, unchanged
func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
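A usage sketch for the answer's code, assuming it compiles as posted and the snippet lives in the same package: crawl two levels deep instead of one and print each URL with the depth at which it was first recorded (http://example.com is just a placeholder URL, and the behavior at deeper levels has not been verified here):

wg := NewWorkGroup(2)
wg.Crawl("http://example.com")
for url, depth := range wg.urlMap {
    // urlMap holds each discovered URL and the deph value it was stored with
    fmt.Printf("%s (depth %d)\n", url, depth)
}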