Question

我正在制作一个网络爬虫。我正在通过一个爬虫函数传递url并解析它以获取anchor标记中的所有链接，然后我为所有这些url使用单独的goroutine为每个url调用相同的爬虫函数。
但是，如果在我收到回复之前发送请求并取消请求，则该特定请求的所有内容仍然在运行现在我想要的是当我取消请求时，由于该请求而被调用的所有goroutine都停止了。
请指导。
以下是抓取工具功能的代码。

func crawler(c echo.Context, urlRec string, feed chan string, urlList *[]string, wg *sync.WaitGroup) {
    defer wg.Done()
    URL, _ := url.Parse(urlRec)
    response, err := http.Get(urlRec)
    if err != nil {
        log.Print(err)
        return
    }

    body := response.Body
    defer body.Close()

    tokenizer := html.NewTokenizer(body)
    flag := true
    for flag {
        tokenType := tokenizer.Next()
        switch {
        case tokenType == html.ErrorToken:
            flag = false
            break
        case tokenType == html.StartTagToken:
            token := tokenizer.Token()

            // Check if the token is an <a> tag
            isAnchor := token.Data == "a"
            if !isAnchor {
                continue
            }

            ok, urlHref := getReference(token)
            if !ok {
                continue
            }

            // Make sure the url begines in http**
            hasProto := strings.Index(urlHref, "http") == 0
            if hasProto {
                if !urlInURLList(urlHref, urlList) {
                    if strings.Contains(urlHref, URL.Host) {
                        *urlList = append(*urlList, urlHref)
                        // fmt.Println(urlHref)
                        // c.String(http.StatusOK, urlHref+"\n")Documents
                        if !checkExt(filepath.Ext(urlHref)) {
                            wg.Add(1)
                            go crawler(c, urlHref, feed, urlList, wg)
                        }
                    }
                }
            }
        }
    }
}

以下是我的POST请求处理程序

func scrapePOST(c echo.Context) error {
    var urlList []string
    urlSession := urlFound{}
    var wg sync.WaitGroup
    urlParam := c.FormValue("url")
    feed := make(chan string, 1000)
    wg.Add(1)
    go crawler(c, urlParam, feed, &urlList, &wg)
    wg.Wait()
    var count = 0
    for _, url := range urlList {
        if filepath.Ext(url) == ".jpg" || filepath.Ext(url) == ".jpeg" || filepath.Ext(url) == ".png" {
            urlSession.Images = append(urlSession.Images, url)
        } else if filepath.Ext(url) == ".doc" || filepath.Ext(url) == ".docx" || filepath.Ext(url) == ".pdf" || filepath.Ext(url) == ".ppt" {
            urlSession.Documents = append(urlSession.Documents, url)
        } else {
            urlSession.Links = append(urlSession.Links, url)
        }
        count = count + 1
    }
    urlSession.Count = count
    // jsonResp, _ := json.Marshal(urlSession)
    // fmt.Print(urlSession)
    return c.JSON(http.StatusOK, urlSession)
}

Answer 1

echo上下文公开HTTP请求，该请求的上下文已与服务器请求相关联。只需获取该上下文，并检查它是否取消，和/或将其传递给采用上下文的方法。

ctx := c.Request().Context()
select {
case <-ctx.Done():
    return ctx.Err()
default:
    // Continue handling the request
}

// and pass along to the db or whatever else:
rows, err := db.QueryContext(ctx, ...)

如果客户端中止连接，则会自动取消Request-scoped上下文。

如果您想添加自己的取消条件，（超时或其他），您也可以这样做：

req := c.Request()
ctx, cancel := context.WithCancel(req.Context())
req.WithContext(ctx)
defer cancel()
// do stuff, which may conditionally call cancel() to cancel the context early

取消HTTP请求时关闭所有goroutine

1 个答案: