Downloading from multiple sources in parallel using goroutines

Asked: 2015-12-19 16:41:13

Tags: performance go amazon-s3 concurrency parallel-processing

First, I want to say that I have looked at Golang download multiple files in parallel using goroutines and Example for sync.WaitGroup correct?, and I have used them as guides for my code. However, I am not sure it is working for me. I am trying to download files from multiple buckets on AWS. This is what I have (some lines are left blank for security reasons).

package main

import (
    "fmt"
    "os"
    "os/user"
    "path/filepath"
    "sync"
    "time"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/aws/session"
    "github.com/aws/aws-sdk-go/service/s3"
    "github.com/aws/aws-sdk-go/service/s3/s3manager"
)

var (
    //Bucket       = ""       // Download from this bucket
    Prefix         = ""       // Using this key prefix
    LocalDirectory = "s3logs" // Into this directory
)

// create a single session to be used
var sess = session.New()

// used to control concurrency
var wg sync.WaitGroup

func main() {
    start := time.Now()
    //map of buckets to region
    regBuckets := map[string]string{

    }

    // download the files for each bucket
    for region, bucket := range regBuckets {
        fmt.Println(region)
        wg.Add(1)
        go getLogs(region, bucket, LocalDirectory, &wg)
    }
    wg.Wait()
    elapsed := time.Since(start)
    fmt.Printf("\nTime took %s\n", elapsed)

}

// getLogs lists and downloads the log objects from a single bucket in the given region.
func getLogs(region string, bucket string, directory string, wg *sync.WaitGroup) {
    client := s3.New(sess, &aws.Config{Region: aws.String(region)})
    params := &s3.ListObjectsInput{Bucket: &bucket, Prefix: &Prefix}
    manager := s3manager.NewDownloaderWithClient(client, func(d *s3manager.Downloader) {
        d.PartSize = 6 * 1024 * 1024 // 6MB per part
        d.Concurrency = 5
    })
    d := downloader{bucket: bucket, dir: directory, Downloader: manager}
    client.ListObjectsPages(params, d.eachPage)
    wg.Done()
}

// downloader object and methods
type downloader struct {
    *s3manager.Downloader
    bucket, dir string
}

// eachPage is the callback for ListObjectsPages; it downloads every object on the page.
func (d *downloader) eachPage(page *s3.ListObjectsOutput, more bool) bool {
    for _, obj := range page.Contents {
        d.downloadToFile(*obj.Key)
    }
    return true
}

// downloadToFile writes a single object to ~/Desktop/<dir>/<key>, creating directories as needed.
func (d *downloader) downloadToFile(key string) {
    // Create the directories in the path
    // desktop path
    user, errs := user.Current()
    if errs != nil {
        panic(errs)
    }
    homedir := user.HomeDir
    desktop := homedir + "/Desktop/" + d.dir
    file := filepath.Join(desktop, key)
    if err := os.MkdirAll(filepath.Dir(file), 0775); err != nil {
        panic(err)
    }

    // Setup the local file
    fd, err := os.Create(file)
    if err != nil {
        panic(err)
    }
    defer fd.Close()

    // Download the file using the AWS SDK
    fmt.Printf("Downloading s3://%s/%s to %s...\n", d.bucket, key, file)
    params := &s3.GetObjectInput{Bucket: &d.bucket, Key: &key}
    _, e := d.Download(fd, params)
    if e != nil {
        panic(e)
    }
}

In the regBuckets map I put a list of bucket names : regions pairs, and in the for loop that follows it I print the bucket names. So if I have two buckets, I want to download the items from both buckets at the same time. I am testing this with the print statements: I expected to see the first bucket's name and, shortly after, the second bucket's name. However, instead of downloading the files from multiple buckets in parallel, it seems to download them sequentially, e.g. bucket one finishes, then the for loop continues, then bucket two, and so on. I need help making sure I am downloading in parallel, because I have about 10 buckets and speed matters. I am also wondering whether it is because I am using a single session. Any ideas?
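
For reference, below is a stripped-down sketch of the goroutine-per-bucket pattern I am trying to follow, using only the standard library, with hypothetical bucket/region names and a time.Sleep standing in for the actual S3 download. If the goroutines run in parallel, both "start" lines should print almost immediately and the total time should be close to a single sleep rather than the sum:

package main

import (
    "fmt"
    "sync"
    "time"
)

func main() {
    // Hypothetical bucket -> region pairs; stand-ins for the real values.
    regBuckets := map[string]string{
        "bucket-one": "us-east-1",
        "bucket-two": "eu-west-1",
    }

    var wg sync.WaitGroup
    start := time.Now()

    for bucket, region := range regBuckets {
        wg.Add(1)
        go func(bucket, region string) {
            defer wg.Done()
            fmt.Printf("%v start %s (%s)\n", time.Since(start), bucket, region)
            time.Sleep(2 * time.Second) // stands in for the real S3 download
            fmt.Printf("%v done  %s\n", time.Since(start), bucket)
        }(bucket, region)
    }

    wg.Wait()
    fmt.Printf("total: %v\n", time.Since(start))
}

With the real getLogs call in place of the sleep, this is the timing behaviour I am expecting but not seeing.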

0 Answers:

No answers yet