I am writing a Go project: a simple web crawler that collects the links on a website. I wanted to try out concurrency features such as goroutines and channels. However, when I run it, it does not go through; nothing shows up, as if nothing had happened at all. I have no idea what went wrong. Could someone point it out for me?
If I remove the channel logic it works and displays all the crawled links, but what I want is for it to send the links to a buffered channel and then display them before ending the program. The program should be able to crawl to whatever depth is specified in the program; right now the depth is 1.
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "time"

    "golang.org/x/net/html"
)

// Link type to be sent over channel
type Link struct {
    URL string
    ok  bool
}

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }

    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    ch := make(chan *Link, 5)
    crawl(url, 1, ch)

    visited := make(map[string]bool)
    time.Sleep(2 * time.Second)

    for link := range ch {
        if _, ok := visited[link.URL]; !ok {
            visited[link.URL] = true
        }
    }

    close(ch)

    for l := range visited {
        fmt.Println(l)
    }
}

func crawl(url string, n int, ch chan *Link) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()
    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                go crawl(result, nextN, ch)
                ch <- &Link{result, true}
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
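For what it's worth, the most likely reason this first version appears to do nothing is that ch is never closed: crawl runs synchronously before the range loop, so once the buffer of 5 fills up the send blocks with no reader running yet, and even when it doesn't, the for link := range ch loop can never end because close(ch) only comes after it. A range over a channel terminates only when the sender side closes the channel. A minimal, self-contained sketch of that producer/consumer pattern (illustrative only, not taken from the original crawler):

package main

import "fmt"

// produce owns the channel: it sends its values from a goroutine and closes
// the channel when done, which is what lets the consumer's range loop end.
func produce() <-chan string {
    ch := make(chan string, 5)
    go func() {
        defer close(ch) // without this close, the range in main would block forever
        for _, s := range []string{"a", "b", "c"} {
            ch <- s
        }
    }()
    return ch
}

func main() {
    for s := range produce() {
        fmt.Println(s)
    }
}

The updated version below follows the same idea by closing the channel inside newCrawl once crawl returns.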
Update:
After some tinkering I figured out how to change the code to remove the data race, but I still do not know how to avoid crawling previously visited URLs (maybe I should open another question?):
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }

    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    for link := range newCrawl(url, 1) {
        fmt.Println(link)
    }
}

func newCrawl(url string, num int) chan string {
    ch := make(chan string, 20)

    go func() {
        crawl(url, 1, ch)
        close(ch)
    }()

    return ch
}

func crawl(url string, n int, ch chan string) {
    if n < 1 {
        return
    }
    resp, err := http.Get(url)
    if err != nil {
        log.Fatalf("Can not reach the site. Error = %v\n", err)
        os.Exit(1)
    }
    b := resp.Body
    defer b.Close()
    z := html.NewTokenizer(b)
    nextN := n - 1
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            result, ok := getHrefTag(current)
            if !ok {
                continue
            }
            hasProto := strings.HasPrefix(result, "http")
            if hasProto {
                done := make(chan struct{})
                go func() {
                    crawl(result, nextN, ch)
                    close(done)
                }()
                <-done
                ch <- result
            }
        }
    }
}

func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
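On the remaining question of how to avoid crawling previously visited URLs: a common approach (not part of the original post) is to keep the visited set behind a mutex and check-and-mark each URL before crawling it. A minimal sketch, where visitedSet and its Seen method are made-up names used only for illustration:

package main

import (
    "fmt"
    "sync"
)

// visitedSet is an illustrative helper: a URL set that is safe for concurrent use.
type visitedSet struct {
    mu   sync.Mutex
    seen map[string]bool
}

func newVisitedSet() *visitedSet {
    return &visitedSet{seen: make(map[string]bool)}
}

// Seen records url and reports whether it had already been recorded,
// so only the first caller for a given url gets false.
func (v *visitedSet) Seen(url string) bool {
    v.mu.Lock()
    defer v.mu.Unlock()
    if v.seen[url] {
        return true
    }
    v.seen[url] = true
    return false
}

func main() {
    visited := newVisitedSet()
    for _, u := range []string{"http://a", "http://b", "http://a"} {
        if visited.Seen(u) {
            continue // skip URLs that were already crawled
        }
        fmt.Println("crawling", u)
    }
}

In the updated crawler, each result could then be guarded with a check like if visited.Seen(result) { continue } before it is recursed into or sent on the channel.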
Answer 0 (score: 0)
I don't think recursively spawning goroutines is a good idea. It can simply get out of control. I would prefer a flat model like this:
package main

import (
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
    "sync"

    "golang.org/x/net/html"
)

func main() {
    if len(os.Args) != 2 {
        fmt.Println("Usage: crawl [URL].")
    }

    url := os.Args[1]
    if !strings.HasPrefix(url, "http://") {
        url = "http://" + url
    }

    wg := NewWorkGroup(1)
    wg.Crawl(url)

    for k, v := range wg.urlMap {
        fmt.Printf("%s: %d\n", k, v)
    }
}

// Link represents a single link and its deph (depth)
type Link struct {
    url  string
    deph uint32
}

// WorkGroup wraps everything into one group
type WorkGroup struct {
    *sync.WaitGroup
    maxDeph uint32
    numW    int
    pool    chan *Worker
    linkQ   chan Link
    urlMap  map[string]uint32
}

type Worker struct {
    result chan []Link
}

func newWorker() *Worker {
    return &Worker{
        result: make(chan []Link),
    }
}

func NewWorkGroup(maxDeph uint32) *WorkGroup {
    numW := int(maxDeph)
    if maxDeph > 10 {
        numW = 10
    }
    return &WorkGroup{
        WaitGroup: new(sync.WaitGroup),
        maxDeph:   maxDeph,
        numW:      numW,
        pool:      make(chan *Worker, numW),
        linkQ:     make(chan Link, 100),
        urlMap:    make(map[string]uint32),
    }
}

// dispatch workers -> filter visited -> send not-visited links to the channel
// pool + dispatcher keep order so workers go level by level
func (wg *WorkGroup) spawnDispatcher() {
    wg.Add(1)
    go func() {
        defer wg.Done()
        defer close(wg.linkQ)
        for w := range wg.pool {
            links := <-w.result
            for i := 0; i < len(links); i++ {
                if _, ok := wg.urlMap[links[i].url]; !ok {
                    wg.urlMap[links[i].url] = links[i].deph

                    // don't process links that reach max deph
                    if links[i].deph < wg.maxDeph {
                        select {
                        case wg.linkQ <- links[i]:
                            // goes well
                            continue
                        default:
                            // channel is too short, protecting against a possible deadlock
                        }
                        // drop the rest of the links
                        break
                    }
                }
            }

            // empty link channel + nothing in process = end
            if len(wg.linkQ) == 0 && len(wg.pool) == 0 {
                return
            }
        }
    }()
}

// Crawl initializes the goroutines and crawls the url
func (wg *WorkGroup) Crawl(url string) {
    defer close(wg.pool)

    wg.spawnCrawlers()
    wg.spawnDispatcher()

    wg.linkQ <- Link{url: url, deph: 0}

    wg.Wait()
}

func (wg *WorkGroup) spawnCrawlers() {
    // custom number of workers, derived from maxDeph
    for i := 0; i < wg.numW; i++ {
        wg.newCrawler()
    }
}

func (wg *WorkGroup) newCrawler() {
    wg.Add(1)
    go func(w *Worker) {
        defer wg.Done()
        defer close(w.result)

        for link := range wg.linkQ {
            wg.pool <- w
            w.result <- getExternalUrls(link)
        }
    }(newWorker())
}

// getExternalUrls is the default crawl function, slightly modified
func getExternalUrls(source Link) []Link {
    resp, err := http.Get(source.url)
    if err != nil {
        log.Printf("Can not reach the site. Error = %v\n", err)
        return nil
    }
    b := resp.Body
    defer b.Close()
    z := html.NewTokenizer(b)
    links := []Link{}
    for {
        token := z.Next()
        switch token {
        case html.ErrorToken:
            return links
        case html.StartTagToken:
            current := z.Token()
            if current.Data != "a" {
                continue
            }
            url, ok := getHrefTag(current)
            if ok && strings.HasPrefix(url, "http") {
                links = append(links, Link{url: url, deph: source.deph + 1})
            }
        }
    }
    return links
}

// getHrefTag is the default helper function, unchanged
func getHrefTag(token html.Token) (result string, ok bool) {
    for _, a := range token.Attr {
        if a.Key == "href" {
            result = a.Val
            ok = true
            break
        }
    }
    return
}
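A usage sketch for the answer's code, assuming it compiles as posted and the snippet lives in the same package: crawl two levels deep instead of one and print each URL with the depth at which it was first recorded (http://example.com is just a placeholder URL, and the behavior at deeper levels has not been verified here):

wg := NewWorkGroup(2)
wg.Crawl("http://example.com")
for url, depth := range wg.urlMap {
    // urlMap holds each discovered URL and the deph value it was stored with
    fmt.Printf("%s (depth %d)\n", url, depth)
}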