这是我的第一个Go程序。我正在学习该语言,但是要理解所有概念有些困难,因此为了练习,我编写了代码来检测同一文件。这是一个简单的程序,可以递归检查目录中的重复文件。
但是:
如何在目录文件中检测重复文件
问题不是递归目录。问题是如何比较
答案 0 :(得分:1)
您可以获取每个文件正文的哈希值,然后在字典/映射中比较哈希值。
package main
import (
"crypto/md5"
"fmt"
"io"
"io/ioutil"
"log"
"os"
)
func main() {
contentHashes := make(map[string]string)
if err := readDir("./", contentHashes); err != nil {
log.Fatal(err)
}
}
func readDir(dirName string, contentHashes map[string]string) (err error) {
filesInfos, err := ioutil.ReadDir(dirName)
if err != nil {
return
}
for _, fi := range filesInfos {
if fi.IsDir() {
err := readDir(dirName+fi.Name()+"/", contentHashes)
if err != nil {
return err
}
} else {
// The important bits for this question
location := dirName + fi.Name()
// open the file
f, err := os.Open(location)
if err != nil {
return err
}
h := md5.New()
// copy the file body into the hash function
if _, err := io.Copy(h, f); err != nil {
return err
}
// Check if a file body with the same hash already exists
key := fmt.Sprintf("%x", h.Sum(nil))
if val, exists := contentHashes[key]; exists {
fmt.Println("Duplicate found", val, location)
} else {
contentHashes[key] = location
}
}
}
return
}
答案 1 :(得分:0)
使用sha256比较文件
示例:
package main
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"os"
"path/filepath"
"sync"
"flag"
"runtime"
"io"
)
var dir string
var workers int
type Result struct {
file string
sha256 [32]byte
}
func worker(input chan string, results chan<- *Result, wg *sync.WaitGroup) {
for file := range input {
var h = sha256.New()
var sum [32]byte
f, err := os.Open(file)
if err != nil {
fmt.Fprintln(os.Stderr, err)
continue
}
if _, err = io.Copy(h, f); err != nil {
fmt.Fprintln(os.Stderr, err)
f.Close()
continue
}
f.Close()
copy(sum[:], h.Sum(nil))
results <- &Result{
file: file,
sha256: sum,
}
}
wg.Done()
}
func search(input chan string) {
filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if err != nil {
fmt.Fprintln(os.Stderr, err)
} else if info.Mode().IsRegular() {
input <- path
}
return nil
})
close(input)
}
func main() {
flag.StringVar(&dir, "dir", ".", "directory to search")
flag.IntVar(&workers, "workers", runtime.NumCPU(), "number of workers")
flag.Parse()
fmt.Printf("Searching in %s using %d workers...\n", dir, workers)
input := make(chan string)
results := make(chan *Result)
wg := sync.WaitGroup{}
wg.Add(workers)
for i := 0; i < workers; i++ {
go worker(input, results, &wg)
}
go search(input)
go func() {
wg.Wait()
close(results)
}()
counter := make(map[[32]byte][]string)
for result := range results {
counter[result.sha256] = append(counter[result.sha256], result.file)
}
for sha, files := range counter {
if len(files) > 1 {
fmt.Printf("Found %d duplicates for %s: \n", len(files), hex.EncodeToString(sha[:]))
for _, f := range files {
fmt.Println("-> ", f)
}
}
}
}