我想在Go by:
中编写一个简单的Web scraper这是我的代码:
package main
import (
"encoding/csv"
"flag"
"fmt"
"github.com/PuerkitoBio/goquery"
"log"
"net/http"
"net/url"
"os"
"strings"
"sync"
)
type Enterprise struct {
name string
tax_code string
group string
capital string
}
var u, f string
var name, tax_code, group, capital string
func init() {
flag.StringVar(&u, "u", "", "Which URL to download from")
flag.StringVar(&f, "f", "", "Path to the csv file to write the output to")
}
func check(e error) {
if e != nil {
panic(e)
}
}
func findHrefs(u string) map[string]string {
resp, err := http.Get(u)
check(err)
doc, err := goquery.NewDocumentFromResponse(resp)
check(err)
e_hrefs := make(map[string]string)
doc.Find("td div a").Each(func(_ int, s *goquery.Selection) {
e_href, _ := s.Attr("href")
if strings.HasPrefix(e_href, "/Thong-tin-doanh-nghiep") && s.Text() != "" {
e_hrefs[e_href] = s.Text()
}
})
return e_hrefs
}
func fetch(url string, name string, file *os.File, wg *sync.WaitGroup, c chan Enterprise) {
defer wg.Done()
log.Println("Fetching URL", url)
resp, err := http.Get(url)
check(err)
doc, err := goquery.NewDocumentFromResponse(resp)
check(err)
e := new(Enterprise)
doc.Find("td").Each(func(_ int, s *goquery.Selection) {
if s.Text() == "Mã số thuế:" {
e.tax_code = s.Next().Text()
}
if s.Text() == "Tên ngành cấp 2:" {
e.group = s.Next().Text()
}
if s.Text() == "Sở hữu vốn:" {
e.capital = s.Next().Text()
}
})
w := csv.NewWriter(file)
w.Write([]string{name, "'" + e.tax_code, e.group, e.capital})
w.Flush()
c <- *e
}
func getDoc(u, f string) {
parsedUrl, err := url.Parse(u)
check(err)
file, err := os.Create(f)
check(err)
defer file.Close()
var wg sync.WaitGroup
c := make(chan Enterprise)
e_hrefs := findHrefs(u)
for e_href, name := range e_hrefs {
wg.Add(1)
go fetch(parsedUrl.Scheme+"://"+parsedUrl.Host+e_href, name, file, &wg, c)
}
wg.Wait()
}
func main() {
flag.Parse()
if u == "" || f == "" {
fmt.Println("-u=<URL to download from> -f=<Path to the CSV file>")
os.Exit(1)
}
getDoc(u, f)
}
问题是在完成所有goroutine之后通道未关闭,我必须按 control + C 来获取我的shell提示符:
2016/03/02 09:34:05 Fetching URL ...
2016/03/02 09:34:05 Fetching URL ...
2016/03/02 09:34:05 Fetching URL ...
^Csignal: interrupt
通过阅读this,我将getDoc
func中的最后一行更改为:
go func() {
wg.Wait()
close(c)
}()
现在我可以在运行时恢复我的shell提示,但是在所有goroutine完成之前通道已关闭,并且没有写入CSV文件。
我哪里出错了?
答案 0 :(得分:4)
对我来说,它看起来并不像你正在从你的频道中读取,并且因为它是一个同步频道(你从未在它上面声明了长度),它会在收到一个值时阻止。因此,您需要通过c
阅读value <- c
,或者您的获取功能只需c <- *e
这导致您的sync.WaitGroup
永远不会wg.Done()
从不递减计数器,这永远不会导致wg.Wait()
停止阻止,这会导致close(c)
永远不会被调用
答案 1 :(得分:0)
我的原始代码是这样的:
e_hrefs := findHrefs(u)
w := csv.NewWriter(file)
for e_href, name := range e_hrefs {
wg.Add(1)
go fetch(parsedUrl.Scheme+"://"+parsedUrl.Host+e_href, name, &wg, c)
e := <-c
w.Write([]string{name, "'" + e.tax_code, e.group, e.capital})
w.Flush()
}
wg.Wait()
你可以看到,它并不是并发。
我刚刚使用range子句修复了频道:
e_hrefs := findHrefs(u)
for e_href, name := range e_hrefs {
wg.Add(1)
go fetch(parsedUrl.Scheme+"://"+parsedUrl.Host+e_href, name, &wg, c)
}
go func() {
wg.Wait()
close(c)
}()
w := csv.NewWriter(file)
for e := range c {
w.Write([]string{e.name, "'" + e.tax_code, e.group, e.capital})
w.Flush()
}