更新

Question

我需要为超过1GB的文件计算sha256校验和（通过块读取文件），目前我正在使用python：

import hashlib
import time

# Reference timestamp for the elapsed-time report printed at the end of the
# script; it is captured at module load, before any hashing work begins.
start_time = time.time()


def sha256sum(filename="big.txt", block_size=2 ** 13):
    """Return the hexadecimal SHA-256 digest of a file.

    The file is opened in binary mode and consumed ``block_size`` bytes at a
    time, so memory use stays constant regardless of file size.

    Args:
        filename: Path of the file to hash.
        block_size: Number of bytes requested per read (8192 by default).

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        while True:
            block = stream.read(block_size)
            # An empty bytes object is how read() reports end of file.
            if block == b'':
                break
            digest.update(block)
    return digest.hexdigest()

# Hash the 1 GB test file, then report the digest and the total wall-clock
# time measured from start_time. print is called in its parenthesized
# single-argument form, which emits exactly the same text on Python 2 and is
# also valid syntax on Python 3.
input_file = '/tmp/1GB.raw'
print('checksum is: %s\n' % sha256sum(input_file))
print('Elapsed time: %s' % str(time.time() - start_time))

我想改用golang试试，以为可以得到更快的结果，但在尝试以下代码后，它的运行速度反而慢了几秒钟：

package main

import (
    "crypto/sha256"
    "fmt"
    "io"
    "math"
    "os"
    "time"
)   

// fileChunk is the number of bytes requested from the file per read.
const fileChunk = 8192

// File returns the lowercase hexadecimal SHA-256 digest of the file at path
// file.
//
// The file is consumed in pieces of at most fileChunk bytes through a single
// reusable buffer. The size reported by Stat right after opening decides how
// many bytes are hashed. File panics if the file cannot be opened, inspected,
// or read in full.
func File(file string) string {
    fh, err := os.Open(file)
    if err != nil {
        panic(err.Error())
    }
    defer fh.Close()

    stat, err := fh.Stat()
    if err != nil {
        panic(err.Error())
    }

    h := sha256.New()
    buf := make([]byte, fileChunk) // allocated once and reused for every chunk

    for remaining := stat.Size(); remaining > 0; {
        // The last chunk may be shorter than fileChunk.
        csize := int(math.Min(fileChunk, float64(remaining)))

        // io.ReadFull keeps reading until the slice is full, so a short read
        // can never feed bytes into the hash that did not come from the file.
        if _, err := io.ReadFull(fh, buf[:csize]); err != nil {
            panic(err.Error())
        }

        // Feed the raw bytes directly; hash.Hash.Write never returns an error.
        h.Write(buf[:csize])
        remaining -= int64(csize)
    }

    return fmt.Sprintf("%x", h.Sum(nil))
}

// main hashes the 1 GB test file with File and prints the checksum followed
// by the wall-clock time the whole run took.
func main() {
    start := time.Now()
    // Use the same path that the Python version reads and that the mkfile
    // command creates (/tmp/1GB.raw), so both programs hash the same file.
    fmt.Printf("checksum is: %s\n", File("/tmp/1GB.raw"))
    elapsed := time.Since(start)
    fmt.Printf("Elapsed time: %s\n", elapsed)
}

如果可能的话，知道如何改进golang代码吗？也许要使用所有的计算机CPU内核，一个用于读取，另一个用于散列，任何想法？

更新

正如建议我使用此代码：

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "io"
    "os"
    "time"
)

// main streams /tmp/1GB.raw through a SHA-256 hash, prints the digest as a
// hexadecimal string, and then prints how long the whole run took. Any failure
// to open or read the file ends the program with a panic.
func main() {
    began := time.Now()

    src, openErr := os.Open("/tmp/1GB.raw")
    if openErr != nil {
        panic(openErr.Error())
    }
    defer src.Close()

    // io.Copy pulls from the file until EOF and pushes every byte into the hash.
    digest := sha256.New()
    if _, copyErr := io.Copy(digest, src); copyErr != nil {
        panic(copyErr.Error())
    }

    sum := digest.Sum(nil)
    fmt.Println(hex.EncodeToString(sum))

    fmt.Printf("Elapsed time: %s\n", time.Since(began))
}

为了测试我正在用这个创建1GB文件：

# mkfile 1G /tmp/1GB.raw

新版本更快了，但快得不多。使用通道（channel）怎么样？能否利用多个CPU/核心来帮助改善？我期待至少有20％的改善，但不幸的是几乎没有任何提升。

python的时间结果

 5.867u 0.250s 0:06.15 99.3%    0+0k 0+0io 0pf+0w

编译（go build）和执行二进制文件后的

时间结果：

 5.687u 0.198s 0:05.93 98.9%    0+0k 0+0io 0pf+0w

还有什么想法？

测试结果

使用@icza在已接受的答案中发布的通道（channel）版本：

Elapsed time: 5.894779733s

使用不带通道的版本：

Elapsed time: 5.823489239s

我原以为使用通道会快一点，但似乎并没有。

我在MacBook Pro OS X Yosemite上运行它。使用go版本：

go version go1.4.1 darwin/amd64

更新2

将runtime.GOMAXPROCS设置为4：

runtime.GOMAXPROCS(4)

让事情变得更快：

Elapsed time: 5.741511748s

更新3

将块大小更改为8192（就像在python版本中一样）会得到预期的结果：

...
for b, hasMore := make([]byte, 8192<<10), true; hasMore; {
...

也仅使用runtime.GOMAXPROCS（2）

Answer 1

你的解决方案非常低效，因为你在每次迭代中都会创建新的缓冲区，你只需使用它们一次就可以把它们扔掉。

此外，您将缓冲区的内容（buf）转换为string，然后将string写入sha256计算器，将其转换回字节：绝对不必要的往返

这是另一个非常快速的解决方案，测试性能：

// Open the input file; `file` must be supplied by the surrounding code.
fh, err := os.Open(file)
if err != nil {
    panic(err.Error())
}   
defer fh.Close()

// Stream the file into the SHA-256 state: io.Copy reads from fh until EOF and
// writes everything it reads to h.
h := sha256.New()
_, err = io.Copy(h, fh)
if err != nil {
    panic(err.Error())
}   

// Print the finished digest as a hexadecimal string.
fmt.Println(hex.EncodeToString(h.Sum(nil)))

一点解释：

io.Copy()是一个函数，它将从Reader读取所有数据（直到达到EOF）并将所有数据写入指定的Writer。由于sha256计算器（hash.Hash）实现了Writer而File（或更确切地说*File）实现了Reader，因此这很简单。

将所有数据写入哈希后，hex.EncodeToString()只会将结果（由hash.Sum(nil)获得）转换为人类可读的十六进制字符串。

最终裁决

程序从硬盘读取1GB数据并用它进行一些计算（计算其SHA-256哈希值）。由于从硬盘读取操作相对较慢，因此与Python解决方案相比，Go版本的性能提升不会很明显。整个运行需要几秒钟，这与从硬盘读取1 GB数据所需的时间处于同一数量级。由于Go和Python解决方案需要大约相同的时间来从磁盘读取数据，因此您不会看到太多不同的结果。

多个Goroutines的性能改进的可能性

通过将文件块读入一个缓冲区，开始计算其SHA-256哈希值，同时读取文件的下一个块，可以提高性能。完成后，将其发送到SHA-256计算器，同时将下一个块读入第一个缓冲区。

但是，由于从磁盘读取数据比计算其SHA-256摘要（或更新摘要计算器的状态）所花费的时间更长，因此您不会看到显着的改进。您的案例中的性能瓶颈始终是将数据读入内存所需的时间。

这是一个使用2个goroutine的完整、可运行的解决方案：其中1个goroutine读取文件的一个块，另一个goroutine同时计算先前读取的块的散列；当一个goroutine读取完成后，它继续去做散列，并允许另一个goroutine并行读取。

阶段之间的正确同步（读取，散列）是通过通道完成的。正如所怀疑的那样，性能增益仅略高于4%（可能因CPU和硬盘速度而异），因为与磁盘读取时间相比，散列计算可忽略不计。如果硬盘的读取速度更快（在SSD上测试），性能增益很可能会更高。

完整的程序：

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "hash"
    "io"
    "os"
    "runtime"
    "time"
)

// file is the path of the input whose SHA-256 digest is printed.
const file = "t:/1GB.raw"

// main prints the SHA-256 digest of file and the elapsed wall-clock time.
//
// Two hashHelper calls cooperate, one in a background goroutine and one on
// the main goroutine. They take turns reading from the shared file and
// writing to the shared hash, with the turns handed over through buffered
// channels. main waits for the background helper to return before it reads
// the final digest. It panics if the file cannot be opened.
func main() {
    runtime.GOMAXPROCS(2) // Important as Go 1.4 uses only 1 by default!

    start := time.Now()

    f, err := os.Open(file)
    if err != nil {
        panic(err)
    }
    defer f.Close()

    h := sha256.New()

    // 2 channels: used to give green light for reading into buffer b1 or b2
    readch1, readch2 := make(chan int, 1), make(chan int, 1)

    // 2 channels: used to give green light for hashing the content of b1 or b2
    hashch1, hashch2 := make(chan int, 1), make(chan int, 1)

    // Start signal: Allow b1 to be read and hashed
    readch1 <- 1
    hashch1 <- 1

    // done is closed once the background helper has returned.
    done := make(chan struct{})
    go func() {
        defer close(done)
        hashHelper(f, h, readch1, readch2, hashch1, hashch2)
    }()

    hashHelper(f, h, readch2, readch1, hashch2, hashch1)

    // When the helper on this goroutine is the first to see EOF, the
    // background helper still has one last (empty) Write on h ahead of it.
    // Wait for it so that Sum never runs concurrently with that Write.
    <-done

    fmt.Println(hex.EncodeToString(h.Sum(nil)))

    fmt.Printf("Elapsed time: %s\n", time.Since(start))
}

// hashHelper repeatedly reads a chunk of f into its own 64 KiB buffer and
// then writes that chunk to h, until a read reports io.EOF.
//
// Each step is gated by a channel: the helper receives from mayRead before
// reading and sends on readDone afterwards, then receives from mayHash before
// writing to h and sends on hashDone afterwards. The chunk read together with
// io.EOF is still written to h before the helper returns. Any other read
// error, or an error from h.Write, causes a panic.
func hashHelper(f *os.File, h hash.Hash, mayRead <-chan int, readDone chan<- int, mayHash <-chan int, hashDone chan<- int) {
    buf := make([]byte, 64<<10)

    for more := true; more; {
        // Reading phase: wait for the turn, read, hand the turn over.
        <-mayRead
        n, readErr := f.Read(buf)
        switch {
        case readErr == io.EOF:
            more = false
        case readErr != nil:
            panic(readErr)
        }
        readDone <- 1

        // Hashing phase: wait for the turn, hash what was read, hand it over.
        <-mayHash
        if _, writeErr := h.Write(buf[:n]); writeErr != nil {
            panic(writeErr)
        }
        hashDone <- 1
    }
}

备注：

在我的解决方案中，我只使用了2个goroutines。没有必要使用更多，因为如前所述，磁盘读取速度是已经最大限度使用的瓶颈，因为2个goroutine可以随时执行读取。

有关同步的说明：2个goroutine并行运行。每个goroutine都可以随时使用其本地缓冲区b。对共享File和共享Hash的访问通过通道进行同步：在任何给定时间只允许1个goroutine使用Hash，并且在任何给定时间只允许1个goroutine使用（读取）File。

Answer 2

对于还不了解的人，我认为这会有所帮助。

https://blog.golang.org/pipelines

在本页末尾，有一个goroutines的md5文件解决方案。

我自己在“~”目录上试了一下。使用goroutines花费1.7秒，不使用goroutines花费2.8秒。

这里是没有goroutines的时间。而且我不知道如何计算使用goroutines时的使用时间，因为所有这些都是同时运行的。 time use 2.805522165s time read file 759.476091ms time md5 1.710393575s time sort 17.355134ms

为什么这个Go代码的速度与Python相当（并且速度不是很快）？

更新

测试结果

更新2

更新3

2 个答案:

最终裁决

多个Goroutines的性能改进的可能性