I'd like to know the idiomatic way to solve this (it currently raises a deadlock error). The recursion branches an unknown number of times, so I cannot simply close the channel.
http://play.golang.org/p/avLf_sQJj_
I have made it work by passing a pointer to a number and incrementing it, and I have looked into using sync.WaitGroup, but I don't feel (and I may be wrong) that I came up with an elegant solution. The Go examples I have seen tend to be simple, clever and concise.
This is the last exercise of the Tour of Go, https://tour.golang.org/#73
Does anybody know how a Go programmer would handle this? Any help would be appreciated. I'm trying to learn well from the start.
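To illustrate, here is a stripped-down sketch of the pattern (an assumed reduction, not my actual playground code) that hits the same wall:

package main

import "fmt"

// crawl sends every visited url on out and recurses into the links it finds.
// Nothing ever closes out, because no single goroutine knows when the whole
// tree of recursive branches has finished.
func crawl(url string, depth int, out chan<- string) {
    if depth <= 0 {
        return
    }
    out <- url
    for _, u := range []string{url + "/a", url + "/b"} { // stand-in for fetched links
        go crawl(u, depth-1, out)
    }
}

func main() {
    out := make(chan string)
    go crawl("http://golang.org", 3, out)
    for u := range out { // blocks forever once the last crawler returns
        fmt.Println("found:", u)
    }
}

Once the last goroutine returns, main is still blocked on out and the runtime reports "fatal error: all goroutines are asleep - deadlock!", since nothing is in a position to close the channel.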
Answer 0 (score: 3)
Here is my interpretation of the exercise. There are many like it, but this one is mine. I use sync.WaitGroup and a custom, mutex-protected map to store visited URLs, mainly because Go's standard map type is not thread safe. Additionally, I combine the data and error channels into a single structure, which has a method that reads from those channels. Mostly for separation of concerns and (arguably) to keep things a bit cleaner.
package main

import (
    "fmt"
    "sync"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(wg *sync.WaitGroup, url string, depth int, fetcher Fetcher, cache *UrlCache, results *Results) {
    defer wg.Done()

    if depth <= 0 || !cache.AtomicSet(url) {
        return
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        results.Error <- err
        return
    }

    results.Data <- [2]string{url, body}

    for _, url := range urls {
        wg.Add(1)
        go Crawl(wg, url, depth-1, fetcher, cache, results)
    }
}

func main() {
    var wg sync.WaitGroup

    cache := NewUrlCache()

    results := NewResults()
    defer results.Close()

    wg.Add(1)
    go Crawl(&wg, "http://golang.org/", 4, fetcher, cache, results)
    go results.Read()
    wg.Wait()
}

// Results defines channels which yield results for a single crawled URL.
type Results struct {
    Data  chan [2]string // url + body.
    Error chan error     // Possible fetcher error.
}

func NewResults() *Results {
    return &Results{
        Data:  make(chan [2]string, 1),
        Error: make(chan error, 1),
    }
}

func (r *Results) Close() error {
    close(r.Data)
    close(r.Error)
    return nil
}

// Read reads crawled results or errors, for as long as the channels are open.
func (r *Results) Read() {
    for {
        select {
        case data := <-r.Data:
            fmt.Println(">", data)
        case err := <-r.Error:
            fmt.Println("e", err)
        }
    }
}

// UrlCache defines a cache of URL's we've already visited.
type UrlCache struct {
    sync.Mutex
    data map[string]struct{} // Empty struct occupies 0 bytes, whereas bool takes 1 byte.
}

func NewUrlCache() *UrlCache { return &UrlCache{data: make(map[string]struct{})} }

// AtomicSet sets the given url in the cache and returns false if it already existed.
//
// All within the same locked context. Modifying a map without synchronisation is not safe
// when done from multiple goroutines. Doing an Exists() check and Set() separately will
// create a race condition, so we must combine both in a single operation.
func (c *UrlCache) AtomicSet(url string) bool {
    c.Lock()
    _, ok := c.data[url]
    c.data[url] = struct{}{}
    c.Unlock()

    return !ok
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}
This has not been tested extensively, so there may be optimisations and fixes that can be applied, but it should at least give you some ideas.
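One concrete fix along those lines (my own suggestion, not part of the code above): wg.Wait() only waits for the crawlers, not for the reader, so main can exit before Read has printed the last buffered result or error. A hypothetical drop-in variant of Read and main that drains both channels before exiting:

// Read reads crawled results or errors until both channels have been closed,
// then signals done. (Hypothetical variant of the Read method above.)
func (r *Results) Read(done chan<- struct{}) {
    data, errs := r.Data, r.Error
    for data != nil || errs != nil {
        select {
        case d, ok := <-data:
            if !ok {
                data = nil // channel closed; stop selecting on it
                continue
            }
            fmt.Println(">", d)
        case err, ok := <-errs:
            if !ok {
                errs = nil
                continue
            }
            fmt.Println("e", err)
        }
    }
    close(done)
}

func main() {
    var wg sync.WaitGroup
    cache := NewUrlCache()
    results := NewResults()

    done := make(chan struct{})
    go results.Read(done)

    wg.Add(1)
    go Crawl(&wg, "http://golang.org/", 4, fetcher, cache, results)

    wg.Wait()       // every Crawl call has sent its result or error
    results.Close() // closing the channels lets Read drain and return
    <-done          // wait until everything has been printed
}

Receiving from a nil channel blocks forever, so setting data or errs to nil once it is closed simply removes that case from the select until both are done.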
Answer 1 (score: 2)
Instead of involving sync.WaitGroup, you could extend the result being sent for each parsed URL and include the number of new URLs found. In your main loop you then keep reading results as long as there is something to collect.
In your case the number of URLs found will equal the number of goroutines spawned, but it doesn't necessarily need to. I would personally spawn a more or less fixed number of fetching routines, so you don't open too many HTTP requests (or at least you can control that); the main loop would not change, because it doesn't care how the fetching is executed (see the sketch after the code below). The important fact here is that you need to send either a result or an error for each URL; I have modified the code so it does not spawn new routines when the depth is already 1.
A side effect of this solution is that you can easily print progress in your main loop.
Here is the example on the playground:
http://play.golang.org/p/BRlUc6bojf
package main

import (
    "fmt"
)

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}

type Res struct {
    url   string
    body  string
    found int // Number of new urls found
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher, ch chan Res, errs chan error, visited map[string]bool) {
    body, urls, err := fetcher.Fetch(url)
    visited[url] = true
    if err != nil {
        errs <- err
        return
    }

    newUrls := 0
    if depth > 1 {
        for _, u := range urls {
            if !visited[u] {
                newUrls++
                go Crawl(u, depth-1, fetcher, ch, errs, visited)
            }
        }
    }

    // Send the result along with number of urls to be fetched
    ch <- Res{url, body, newUrls}

    return
}

func main() {
    ch := make(chan Res)
    errs := make(chan error)
    visited := map[string]bool{}
    go Crawl("http://golang.org/", 4, fetcher, ch, errs, visited)
    tocollect := 1
    for n := 0; n < tocollect; n++ {
        select {
        case s := <-ch:
            fmt.Printf("found: %s %q\n", s.url, s.body)
            tocollect += s.found
        case e := <-errs:
            fmt.Println(e)
        }
    }
}

// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
    "http://golang.org/": &fakeResult{
        "The Go Programming Language",
        []string{
            "http://golang.org/pkg/",
            "http://golang.org/cmd/",
        },
    },
    "http://golang.org/pkg/": &fakeResult{
        "Packages",
        []string{
            "http://golang.org/",
            "http://golang.org/cmd/",
            "http://golang.org/pkg/fmt/",
            "http://golang.org/pkg/os/",
        },
    },
    "http://golang.org/pkg/fmt/": &fakeResult{
        "Package fmt",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
    "http://golang.org/pkg/os/": &fakeResult{
        "Package os",
        []string{
            "http://golang.org/",
            "http://golang.org/pkg/",
        },
    },
}
And yes, follow @jimt's advice and make access to the map thread safe.
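For completeness, here is a rough sketch (my own take, not code from this answer) of the "more or less fixed number of fetching routines" idea mentioned above. It is meant as a replacement for the main function in this answer, keeping the Fetcher, fakeFetcher and fetcher definitions, plus a small worker and two helper types. Because only the main loop touches the visited map, the race on the map disappears without needing a lock.

type job struct {
    url   string
    depth int
}

type result struct {
    url  string
    body string
    next []job // links found on this page, with depth-1
    err  error
}

// worker fetches jobs until the work channel is closed.
func worker(work <-chan job, res chan<- result) {
    for j := range work {
        body, urls, err := fetcher.Fetch(j.url)
        r := result{url: j.url, body: body, err: err}
        if err == nil && j.depth > 1 {
            for _, u := range urls {
                r.next = append(r.next, job{u, j.depth - 1})
            }
        }
        res <- r
    }
}

func main() {
    const nWorkers = 3 // upper bound on concurrent fetches
    work := make(chan job)
    res := make(chan result)
    for i := 0; i < nWorkers; i++ {
        go worker(work, res)
    }

    visited := map[string]bool{"http://golang.org/": true}
    pending := []job{{"http://golang.org/", 4}}
    outstanding := 0 // jobs handed to workers but not reported back yet

    for outstanding > 0 || len(pending) > 0 {
        // Only offer work when something is queued; a nil channel
        // disables that select case.
        var send chan<- job
        var next job
        if len(pending) > 0 {
            next, send = pending[0], work
        }
        select {
        case send <- next:
            pending = pending[1:]
            outstanding++
        case r := <-res:
            outstanding--
            if r.err != nil {
                fmt.Println(r.err)
                continue
            }
            fmt.Printf("found: %s %q\n", r.url, r.body)
            for _, j := range r.next {
                if !visited[j.url] {
                    visited[j.url] = true
                    pending = append(pending, j)
                }
            }
        }
    }
    close(work) // all done; let the workers exit
}

The nil-channel trick in the select means a job is only offered to the workers when one is actually queued, so the main loop never blocks handing out work while results are waiting to be read.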
Answer 2 (score: 0)
Here is how I solved the web crawler exercise from the Tour of Go.
To track completion of the recursion in the parallel execution, I used an atomic integer counter to keep track of how many URLs are being crawled in the parallel recursions. In the main function, I loop-wait until the atomic counter decrements back to zero.
To avoid crawling the same URL again, I used a map protected by a mutex to keep track of crawled URLs.
Below are the relevant code snippets.
You can find the entire working code here on Github.
// Safe HashSet Version
type SafeHashSet struct {
    sync.Mutex
    urls map[string]bool // Primarily we want to use this as a hashset, so the value of the map is not significant to us
}

var (
    urlSet     SafeHashSet
    urlCounter int64
)

// Adds an URL to the Set, returns true if new url was added (if not present already)
func (m *SafeHashSet) add(newUrl string) bool {
    m.Lock()
    defer m.Unlock()
    _, ok := m.urls[newUrl]
    if !ok {
        m.urls[newUrl] = true
        return true
    }
    return false
}

// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // Decrement the atomic url counter, when this crawl function exits
    defer atomic.AddInt64(&urlCounter, -1)

    if depth <= 0 {
        return
    }

    // Don't process a url if it is already processed
    isNewUrl := urlSet.add(url)
    if !isNewUrl {
        fmt.Printf("skip: \t%s\n", url)
        return
    }

    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: \t%s %q\n", url, body)

    for _, u := range urls {
        atomic.AddInt64(&urlCounter, 1)
        // Crawl in parallel
        go Crawl(u, depth-1, fetcher)
    }
    return
}

func main() {
    urlSet = SafeHashSet{urls: make(map[string]bool)}

    atomic.AddInt64(&urlCounter, 1)
    go Crawl("https://golang.org/", 4, fetcher)

    for atomic.LoadInt64(&urlCounter) > 0 {
        time.Sleep(100 * time.Microsecond)
    }
    fmt.Println("Exiting")
}