并发文件解析并插入Elastic Search

时间:2017-09-13 18:33:56

标签: elasticsearch go

我最近在玩 Go 时写了一个小脚本,用来解析日志文件并把解析结果插入到 Elasticsearch 中。对于每个文件,我都启动了一个这样的 goroutine:

var wg := sync.WaitGroup{}
wg.Add(len(files))
for _, file := range files {
    go func(f os.FileInfo){
        defer wg.Done()
        ProcessFile(f.Name(), config.OriginFilePath, config.WorkingFilePath, config.ArchiveFilePath,fmt.Sprintf("http://%v:%v", config.ElasticSearch.Host, config.ElasticSearch.Port),config.ProviderIndex, config.NetworkData)
    }(file)
}
wg.Wait()

在我的 ProcessFile 里,有一个把数据发送到 Elasticsearch 的函数:

// BulkInsert sends lines as one Elasticsearch bulk request
// ("POST /_bulk"). The bulk API expects newline-delimited JSON with a
// trailing newline, hence the final "\n" appended to the join.
// It returns the raw response, or an error if the request failed.
func BulkInsert(lines []string, ES *elastic.Client) (*elastic.Response, error) {
    // PerformRequest already returns (response, error); the previous
    // explicit nil-check on err added nothing, so forward its result.
    return ES.PerformRequest("POST", "/_bulk", url.Values{}, strings.Join(lines, "\n")+"\n")
}

问题在于我并不完全了解 goroutine 的工作原理。按我的理解,发送到 Elasticsearch 的请求会阻塞对应 goroutine 的执行。于是我尝试用同样的方法,再为 Elasticsearch 的批量插入单独起一个 goroutine:

即在函数返回并执行 wg.Wait() 之前,先对 WaitGroup 调用 Add,再执行 go func(){ defer wg.Done(); BulkInsert(elems, ES) }()。但是我发现最终并非所有事件都写入了 Elasticsearch,我怀疑是 goroutine 在批量请求尚未发送/完成之前就返回了。

我的问题是:我对这个问题的处理方法是否正确?还能获得更好的性能吗?

1 个答案:

答案 0(得分:1)

  

我可以获得更好的表现吗?

不好说,这取决于发送方和接收方各自的处理能力。

  

我的问题是,我对这个问题的处理方法是否正确?

下面的示例程序或许能帮助您更好地理解 goroutine 的用法:

package main

import (
    "fmt"
    "log"
    "net/http"
    "sync"
    "time"
)

// main demonstrates three ways of fanning out HTTP GET "jobs" against a
// deliberately slow local server: unbounded goroutines, a hand-rolled
// 3-slot semaphore, and the same pattern factored into parallel().
func main() {

	addr := "127.0.0.1:2074"

	// A local server that simulates a slow receiver: every request is
	// held for one second before the handler returns.
	srv := http.Server{
		Addr: addr,
		Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			log.Println("hit ", r.URL.String())
			<-time.After(time.Second)
			log.Println("done ", r.URL.String())
		}),
	}
	// Start the server in the background; panic only if ListenAndServe
	// reports an error within the first millisecond (e.g. port in use).
	fail(unblock(srv.ListenAndServe))

	jobs := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

	// case 1: one goroutine per job, no concurrency limit.
	// All 10 GET requests are in flight at the same time.
	{
		wg := sync.WaitGroup{}
		wg.Add(len(jobs))
		log.Printf("starting %v jobs\n", len(jobs))
		for _, job := range jobs {
			go func(job int) {
				defer wg.Done()
				// NOTE(review): the http.Get error and response body are
				// ignored — acceptable for a demo, not for real code.
				http.Get(fmt.Sprintf("http://%v/job/%v", addr, job))
			}(job)
		}
		wg.Wait()
		log.Printf("done %v jobs\n", len(jobs))
	}

	log.Println()
	log.Println("=================")
	log.Println()

	// case 2: bounded concurrency via a buffered channel used as a
	// counting semaphore — at most 3 GET requests run at once.
	{
		wg := sync.WaitGroup{}
		wg.Add(len(jobs))
		in := make(chan string)
		limit := make(chan bool, 3)
		log.Printf("starting %v jobs\n", len(jobs))
		// Dispatcher: acquires a semaphore slot, then runs each job in
		// its own goroutine, which releases the slot when done.
		// NOTE(review): `in` is never closed, so this dispatcher
		// goroutine leaks; harmless here only because main exits.
		go func() {
			for url := range in {
				limit <- true
				go func(url string) {
					defer wg.Done()
					http.Get(url)
					<-limit
				}(url)
			}
		}()
		for _, job := range jobs {
			in <- fmt.Sprintf("http://%v/job/%v", addr, job)
		}
		wg.Wait()
		log.Printf("done %v jobs\n", len(jobs))
	}

	log.Println()
	log.Println("=================")
	log.Println()

	// case 2 rewritten: the same semaphore pattern packaged up as
	// parallel(), here allowing 6 concurrent GET requests.
	{
		wait, add := parallel(6)
		log.Printf("starting %v jobs\n", len(jobs))
		for _, job := range jobs {
			// Capture url per iteration so each closure gets its own copy.
			url := fmt.Sprintf("http://%v/job/%v", addr, job)
			add(func() {
				http.Get(url)
			})
		}
		wait()
		log.Printf("done %v jobs\n", len(jobs))
	}
}

// parallel returns a (wait, add) pair implementing a bounded worker
// pool: add(block) schedules block to run in its own goroutine, with at
// most c blocks executing concurrently; wait blocks until every added
// block has finished. add must not be called after wait has returned.
func parallel(c int) (func(), func(block func())) {
	wg := sync.WaitGroup{}
	in := make(chan func())
	limit := make(chan bool, c) // counting semaphore: at most c slots
	go func() {
		for block := range in {
			limit <- true // acquire a slot (blocks while c are running)
			go func(block func()) {
				defer wg.Done()
				block()
				<-limit // release the slot
			}(block)
		}
	}()
	// FIX: the original never closed `in`, so the dispatcher goroutine
	// above leaked forever. Close it once all work has drained; Once
	// keeps repeated wait() calls from double-closing.
	var stop sync.Once
	wait := func() {
		wg.Wait()
		stop.Do(func() { close(in) })
	}
	return wait, func(block func()) {
		wg.Add(1)
		in <- block
	}
}

// unblock runs block in a background goroutine and waits up to one
// millisecond for it to finish. It returns block's error if block
// completed within that window, and nil otherwise (block keeps running
// in the background — used above to start srv.ListenAndServe).
func unblock(block func() error) error {
	// FIX: buffer of 1 so the goroutine can deliver a late result and
	// exit; with the original unbuffered channel, a block finishing
	// after the timeout left the sender goroutine blocked forever.
	w := make(chan error, 1)
	go func() { w <- block() }()
	select {
	case err := <-w:
		return err
	case <-time.After(time.Millisecond):
	}
	return nil
}

// fail aborts the program with a panic when err is non-nil; it is a
// no-op on nil. Used to bail out of startup on a fatal error.
func fail(err error) {
	if err == nil {
		return
	}
	panic(err)
}

输出

$ go run main.go 
2017/09/14 01:30:50 starting 10 jobs
2017/09/14 01:30:50 hit  /job/0
2017/09/14 01:30:50 hit  /job/4
2017/09/14 01:30:50 hit  /job/5
2017/09/14 01:30:50 hit  /job/2
2017/09/14 01:30:50 hit  /job/9
2017/09/14 01:30:50 hit  /job/1
2017/09/14 01:30:50 hit  /job/3
2017/09/14 01:30:50 hit  /job/7
2017/09/14 01:30:50 hit  /job/8
2017/09/14 01:30:50 hit  /job/6
2017/09/14 01:30:51 done  /job/5
2017/09/14 01:30:51 done  /job/4
2017/09/14 01:30:51 done  /job/2
2017/09/14 01:30:51 done  /job/0
2017/09/14 01:30:51 done  /job/6
2017/09/14 01:30:51 done  /job/9
2017/09/14 01:30:51 done  /job/1
2017/09/14 01:30:51 done  /job/3
2017/09/14 01:30:51 done  /job/7
2017/09/14 01:30:51 done  /job/8
2017/09/14 01:30:51 done 10 jobs
2017/09/14 01:30:51 
2017/09/14 01:30:51 =================
2017/09/14 01:30:51 
2017/09/14 01:30:51 starting 10 jobs
2017/09/14 01:30:51 hit  /job/0
2017/09/14 01:30:51 hit  /job/2
2017/09/14 01:30:51 hit  /job/1
2017/09/14 01:30:52 done  /job/2
2017/09/14 01:30:52 done  /job/0
2017/09/14 01:30:52 done  /job/1
2017/09/14 01:30:52 hit  /job/3
2017/09/14 01:30:52 hit  /job/4
2017/09/14 01:30:52 hit  /job/5
2017/09/14 01:30:53 done  /job/3
2017/09/14 01:30:53 done  /job/4
2017/09/14 01:30:53 done  /job/5
2017/09/14 01:30:53 hit  /job/6
2017/09/14 01:30:53 hit  /job/7
2017/09/14 01:30:53 hit  /job/8
2017/09/14 01:30:54 done  /job/6
2017/09/14 01:30:54 done  /job/7
2017/09/14 01:30:54 done  /job/8
2017/09/14 01:30:54 hit  /job/9
2017/09/14 01:30:55 done  /job/9
2017/09/14 01:30:55 done 10 jobs
2017/09/14 01:30:55 
2017/09/14 01:30:55 =================
2017/09/14 01:30:55 
2017/09/14 01:30:55 starting 10 jobs
2017/09/14 01:30:55 hit  /job/0
2017/09/14 01:30:55 hit  /job/1
2017/09/14 01:30:55 hit  /job/4
2017/09/14 01:30:55 hit  /job/2
2017/09/14 01:30:55 hit  /job/3
2017/09/14 01:30:55 hit  /job/5
2017/09/14 01:30:56 done  /job/0
2017/09/14 01:30:56 hit  /job/6
2017/09/14 01:30:56 done  /job/1
2017/09/14 01:30:56 done  /job/2
2017/09/14 01:30:56 done  /job/4
2017/09/14 01:30:56 hit  /job/7
2017/09/14 01:30:56 done  /job/3
2017/09/14 01:30:56 hit  /job/9
2017/09/14 01:30:56 hit  /job/8
2017/09/14 01:30:56 done  /job/5
2017/09/14 01:30:57 done  /job/6
2017/09/14 01:30:57 done  /job/7
2017/09/14 01:30:57 done  /job/9
2017/09/14 01:30:57 done  /job/8
2017/09/14 01:30:57 done 10 jobs