在目录中查找重复的文件

时间:2018-10-05 17:16:39

标签: file go

这是我的第一个 Go 程序。我正在学习这门语言,但要理解所有概念有些困难,因此为了练习,我编写了一段代码来检测内容相同的文件。这是一个简单的程序,可以递归检查目录中的重复文件。

但是:

如何在目录文件中检测重复文件

问题不在于如何递归遍历目录,而在于如何比较文件以判断它们是否重复。

2 个答案:

答案 0 :(得分:1)

您可以获取每个文件正文的哈希值,然后在字典/映射中比较哈希值。

package main

import (
    "crypto/md5"
    "fmt"
    "io"
    "io/ioutil"
    "log"
    "os"
)

// main scans the current directory tree for files with identical contents and
// terminates the program with a logged error if the scan fails.
func main() {
    // Hex-encoded content digest -> path of the first file seen with it.
    seen := map[string]string{}

    err := readDir("./", seen)
    if err != nil {
        log.Fatal(err)
    }
}

// readDir recursively walks dirName and reports every file whose contents
// match a file seen earlier in the walk.
//
// dirName must end with a path separator (for example "./" or "some/dir/")
// because entry names are appended to it directly. contentHashes maps the
// hex-encoded MD5 digest of a file body to the path of the first file found
// with that body; it is updated in place. Each duplicate is printed to
// standard output together with the path of the first occurrence.
//
// The first error encountered while listing a directory, opening a file or
// reading a file aborts the walk and is returned.
func readDir(dirName string, contentHashes map[string]string) error {
    filesInfos, err := ioutil.ReadDir(dirName)
    if err != nil {
        return err
    }
    for _, fi := range filesInfos {
        location := dirName + fi.Name()
        if fi.IsDir() {
            if err := readDir(location+"/", contentHashes); err != nil {
                return err
            }
            continue
        }

        key, err := md5Hex(location)
        if err != nil {
            return err
        }
        // Check if a file body with the same hash already exists.
        if val, exists := contentHashes[key]; exists {
            fmt.Println("Duplicate found", val, location)
        } else {
            contentHashes[key] = location
        }
    }
    return nil
}

// md5Hex returns the hex-encoded MD5 digest of the contents of the file at
// path.
func md5Hex(path string) (string, error) {
    f, err := os.Open(path)
    if err != nil {
        return "", err
    }
    // Release the descriptor on every return path so that walking a large
    // tree does not exhaust the per-process open-file limit.
    defer f.Close()

    h := md5.New()
    // Stream the file body into the hash instead of loading it into memory.
    if _, err := io.Copy(h, f); err != nil {
        return "", err
    }
    return fmt.Sprintf("%x", h.Sum(nil)), nil
}

答案 1 :(得分:0)

使用sha256比较文件

示例:

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "fmt"
    "os"
    "path/filepath"
    "sync"
    "flag"
    "runtime"
    "io"
)

// Command-line settings, populated by the flag package in main.
var (
    // dir is the root directory that search walks.
    dir string
    // workers is the number of hashing goroutines main starts.
    workers int
)

// Result pairs a file path with the SHA-256 digest computed over that file's
// contents. Workers send one Result per successfully hashed file.
type Result struct {
    // file is the path of the hashed file, as received on the input channel.
    file   string
    // sha256 is the SHA-256 digest of the file's contents.
    sha256 [32]byte
}

// worker receives file paths from input until the channel is closed, hashes
// each file with SHA-256 and sends a Result for it on results. A file that
// cannot be opened or read is reported on standard error and skipped.
// wg.Done is called once when the worker returns.
func worker(input chan string, results chan<- *Result, wg *sync.WaitGroup) {
    defer wg.Done()

    for path := range input {
        digest, err := sha256File(path)
        if err != nil {
            fmt.Fprintln(os.Stderr, err)
            continue
        }
        results <- &Result{file: path, sha256: digest}
    }
}

// sha256File returns the SHA-256 digest of the contents of the file at path.
func sha256File(path string) ([32]byte, error) {
    var digest [32]byte

    f, err := os.Open(path)
    if err != nil {
        return digest, err
    }
    defer f.Close()

    hasher := sha256.New()
    if _, err := io.Copy(hasher, f); err != nil {
        return digest, err
    }
    copy(digest[:], hasher.Sum(nil))
    return digest, nil
}

// search walks the tree rooted at the package-level dir and sends the path of
// every regular file on input. Errors passed to the walk callback are printed
// to standard error and are not treated as fatal. input is closed once the
// walk has finished, which tells the workers that no more paths are coming.
func search(input chan string) {
    defer close(input)

    visit := func(path string, info os.FileInfo, err error) error {
        if err != nil {
            fmt.Fprintln(os.Stderr, err)
            return nil
        }
        if info.Mode().IsRegular() {
            input <- path
        }
        return nil
    }
    filepath.Walk(dir, visit)
}

// main parses the -dir and -workers flags, hashes every regular file under
// dir using a pool of worker goroutines, and prints each group of files that
// share the same SHA-256 digest.
//
// The program exits with status 2 if -workers is less than 1.
func main() {
    flag.StringVar(&dir, "dir", ".", "directory to search")
    flag.IntVar(&workers, "workers", runtime.NumCPU(), "number of workers")
    flag.Parse()

    // With zero workers nothing drains the input channel, so every file would
    // be silently skipped; a negative count makes WaitGroup.Add panic.
    if workers < 1 {
        fmt.Fprintf(os.Stderr, "invalid -workers value %d: must be at least 1\n", workers)
        os.Exit(2)
    }

    fmt.Printf("Searching in %s using %d workers...\n", dir, workers)

    input := make(chan string)
    results := make(chan *Result)

    wg := sync.WaitGroup{}
    wg.Add(workers)

    for i := 0; i < workers; i++ {
        go worker(input, results, &wg)
    }

    go search(input)
    // Close results only after every worker has returned, so the collection
    // loop below terminates once all files have been hashed.
    go func() {
        wg.Wait()
        close(results)
    }()

    // Group file paths by content digest.
    counter := make(map[[32]byte][]string)
    for result := range results {
        counter[result.sha256] = append(counter[result.sha256], result.file)
    }

    // Report only digests shared by more than one file.
    for sha, files := range counter {
        if len(files) > 1 {
            fmt.Printf("Found %d duplicates for %s: \n", len(files), hex.EncodeToString(sha[:]))
            for _, f := range files {
                fmt.Println("-> ", f)
            }
        }
    }
}