Poor performance with CSV and maps in Go

Date: 2014-11-24 11:48:43

Tags: performance csv go

I need to write a Go script that opens a large CSV file and creates new, separate CSV files based on the value of the first field of each row.

The CSV file looks like this:

"country", "otherfield", "otherfield1", "otherfield2", "etc"
"AT", "otherfield", "otherfield1", "otherfield2", "etc"
"AT", "otherfield", "otherfield1", "otherfield2", "etc"
"DE", "otherfield", "otherfield1", "otherfield2", "etc"
"DE", "otherfield", "otherfield1", "otherfield2", "etc"

So what I want to do is create one file named after each distinct value of the first field (e.g. AT.csv) containing all the rows that start with that value; for the sample above, AT.csv would hold the header row plus both "AT" rows.

Here is the script I have written so far:

package main

import (
    "encoding/csv"
    "fmt"
    "os"
)

func main() {

    // contentCreated := make(chan map[string]string)

    createContent("union_exp.csv")

}

func createContent(csvfilename string) {

    keys := ""

    content := make(map[string]string)

    csvfile, err := os.Open(csvfilename)

    if err != nil {
        fmt.Println(err)
    }

    defer csvfile.Close()

    reader := csv.NewReader(csvfile)

    reader.FieldsPerRecord = -1

    rawCSVdata, err := reader.ReadAll()

    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    for i, each := range rawCSVdata {

        if i == 0 {
            keys = "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
        } else {

            stringtoadd := "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"

            if i%10000 == 0 {
                fmt.Println(i)
            }

            exists := Exists(content, each[0])
            if !exists {
                content[each[0]] = keys
            }

            content[each[0]] += stringtoadd

            createFile(each[0], content[each[0]])

        }
    }

}

func createFile(name, content string) {

    f, _ := os.Create(name + ".csv")
    f.WriteString(content)
    f.Close()
}

func Exists(content map[string]string, name string) bool {
    _, exists := content[name]
    return exists
}

The problem I am running into is that performance is very slow. I even have a similar script written in PHP that does the same thing faster, which obviously makes me think there must be something wrong with my Go script.

Can someone help me understand what is wrong with it?

Thanks!

2 Answers:

Answer 0 (score: 2)

You are (unnecessarily) loading the complete CSV file into memory at once, and you rewrite each output file every time its content changes.

Try the following approach instead:

package main

import (
    "encoding/csv"
    "fmt"
    "io"
    "os"
    "sync"
)

func main() {

    input, err := os.Open("union_exp.csv")
    if err != nil {
        fmt.Println("Error while opening CSV file.")
        return
    }
    defer input.Close()

    reader := csv.NewReader(input)
    reader.FieldsPerRecord = -1
    files := make(map[string]chan []string)

    keys, err := reader.Read()
    if err != nil {
        fmt.Println("Error while reading CSV file.")
        return
    }

    wg := &sync.WaitGroup{}

    var line []string
    for line, err = reader.Read(); err == nil; line, err = reader.Read() {

        ch, ok := files[line[0]]
        if !ok {
            // First time this key is seen: start a writer goroutine for it
            // and send the header row before any data rows.
            ch = make(chan []string, 8)
            wg.Add(1)
            go fileWriter(line[0], ch, wg)
            ch <- keys
            files[line[0]] = ch
        }
        ch <- line // send every row, including the first one for a new key

    }
    if err.Error() != "EOF" {
        fmt.Println("Error while reading CSV file.")
        return
    }

    for _, ch := range files {
        close(ch)
    }
    wg.Wait()

    fmt.Println("Done!")
}

func fileWriter(fileName string, ch chan []string, wg *sync.WaitGroup) {
    defer wg.Done()

    file, err := os.Create("x" + fileName + ".csv")
    if err != nil {
        fmt.Println("Error while creating output file.")
        os.Exit(1) // Kill the whole app
    }
    defer file.Close()

    writer := csv.NewWriter(file)
    defer writer.Flush()

    for line := range ch {
        writer.Write(line)
    }

}

Answer 1 (score: 1)

I second @plusmid's answer - the vast majority of your program's time is spent opening, writing, and closing files.

So, first of all, fix this bug and write the content only once per key:

package main

import (
    "encoding/csv"
    "fmt"
    "os"
)

func main() {

    // contentCreated := make(chan map[string]string)

    createContent("union_exp.csv")

}

func createContent(csvfilename string) {

    keys := ""

    content := make(map[string]string)

    csvfile, err := os.Open(csvfilename)

    if err != nil {
        fmt.Println(err)
    }

    defer csvfile.Close()

    reader := csv.NewReader(csvfile)

    reader.FieldsPerRecord = -1

    rawCSVdata, err := reader.ReadAll()

    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }

    for i, each := range rawCSVdata {

        if i == 0 {
            keys = "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"
        } else {

            stringtoadd := "\"" + each[0] + "\",\"" + each[1] + "\",\"" + each[2] + "\",\"" + each[3] + "\",\"" + each[4] + "\"\n"

            if i%10000 == 0 {
                fmt.Println(i)
            }

            exists := Exists(content, each[0])
            if !exists {
                content[each[0]] = keys
            }

            content[each[0]] += stringtoadd
        }
    }

    for key, content := range content {
        createFile(key, content)
    }
}

func createFile(name, content string) {

    f, _ := os.Create(name + ".csv")
    f.WriteString(content)
    f.Close()
}

func Exists(content map[string]string, name string) bool {
    _, exists := content[name]
    return exists
}

On a 25k-row CSV this brought the runtime from 50 seconds down to 5 seconds for me.

Next, consider using goroutines to process the file in parallel; at the moment you are only using a single core. There are also smaller issues, such as building large strings with repeated + concatenation, which re-copies the accumulated string on every append; collecting the output in a bytes.Buffer is usually much cheaper. There is plenty of room left to optimize this code.
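
To make the string-concatenation point concrete, here is a minimal sketch (my own illustration, not code from either answer): it groups rows per country in bytes.Buffer-backed csv.Writers and writes each file exactly once, so appending a row does not re-copy everything already accumulated for that key:

package main

import (
    "bytes"
    "encoding/csv"
    "fmt"
    "os"
)

// Minimal sketch: read the CSV once, group rows by the first field into
// per-key buffers, and write each <key>.csv exactly once at the end.
func main() {

    in, err := os.Open("union_exp.csv")
    if err != nil {
        fmt.Println(err)
        return
    }
    defer in.Close()

    reader := csv.NewReader(in)
    reader.FieldsPerRecord = -1

    rows, err := reader.ReadAll()
    if err != nil || len(rows) == 0 {
        fmt.Println("could not read CSV:", err)
        return
    }
    header, data := rows[0], rows[1:]

    bufs := make(map[string]*bytes.Buffer)
    writers := make(map[string]*csv.Writer)

    for _, row := range data {
        key := row[0]
        w, ok := writers[key]
        if !ok {
            buf := &bytes.Buffer{}
            w = csv.NewWriter(buf)
            w.Write(header) // header goes in once per output file
            bufs[key], writers[key] = buf, w
        }
        w.Write(row) // appending to a buffer does not re-copy earlier rows
    }

    for key, w := range writers {
        w.Flush()
        f, err := os.Create(key + ".csv")
        if err != nil {
            fmt.Println(err)
            continue
        }
        bufs[key].WriteTo(f)
        f.Close()
    }
}

Note that encoding/csv only quotes fields where necessary, so the output quoting may differ slightly from the hand-built strings in the original script.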