Go 1.0.2: how to improve speed when compressing with zlib

Date: 2014-08-15 20:24:20

Tags: go zlib

I am writing a small program in Go (v1.0.2). It loads lines into memory so that I can later look them up by an index (the first column of the line). To save space, I compress each line with zlib, and lines are grouped into blocks.

Items are loaded from a file. An item is represented by one line, and the line contains several attributes separated by tabs.

It works correctly, but it is currently really slow: it takes almost 50 seconds just to load a 40 MB file! (Note that if I disable the "compress" part, it only takes 0.87 seconds.) I am obviously doing something wrong, but I cannot find what. Note that I am a beginner in this language.

Note: I am stuck with Go 1.0.2 and cannot upgrade.

package main

import (
    "bytes"
    "compress/zlib"
    "encoding/json"
    "flag"
    "fmt"
    "gotwcc/mylib" // custom helper package, used below as "mybufio"; presumably a line scanner, since bufio.Scanner does not exist in Go 1.0.2
    "io"
    "net/http"
    "os"
)

// Block holds the (optionally zlib-compressed) bytes of one group of rows.
type Block struct {
    data []byte
}

// Row locates a single line inside its block; offset and len refer to the
// uncompressed block contents.
type Row struct {
    offset, len uint32
    block       *Block
}

type Cache struct {
    blocks          []Block
    indexes         map[string]*Row
    enable_compress bool
    maxRowGroup     uint
}

func (this *Cache) dump() {
    for key, value := range this.indexes {
        fmt.Printf("[%s] = \"%s\"\n", key, value.block)
    }
}

// search returns the raw line stored for item_id, or a placeholder if unknown.
func (this *Cache) search(item_id string) string {
    row := this.indexes[item_id]
    if row == nil {
        return "(Not found)"
    }
    block := this.uncompress(row.block.data)
    slice := block[row.offset : row.offset+row.len]
    return string(slice)
}

// compress zlib-deflates data; a new zlib.Writer is allocated on every call.
func (this *Cache) compress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }

    var b bytes.Buffer
    w := zlib.NewWriter(&b)
    w.Write(data)
    w.Close()
    return b.Bytes()
}

// uncompress inflates a block previously produced by compress.
func (this *Cache) uncompress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }

    var res bytes.Buffer
    b := bytes.NewReader(data)
    r, err := zlib.NewReader(b)
    if err != nil {
        panic(err)
    }
    io.Copy(&res, r)

    r.Close()
    return res.Bytes()

}

// loadFile reads the file line by line, groups maxRowGroup lines into one
// block, compresses each block and indexes every row by its first
// (tab-separated) column.
func (this *Cache) loadFile(s string) {
    type TempRowBuf struct {
        item_id     []byte
        offset, len uint32
    }

    file, err := os.Open(s)
    if err != nil {
        panic(err.Error())
    }
    defer file.Close()

    scanner := mybufio.NewScanner(file)
    scanner.Split(mybufio.ScanLines)

    var tmp_buf bytes.Buffer
    var buffer bytes.Buffer
    var tmp_list []TempRowBuf
    this.indexes = make(map[string]*Row)

    var offset uint32 = 0
    nb := this.maxRowGroup
    for scanner.Scan() {
        nb--
        // Copy the scanner's bytes into tmp_buf, then peel off the first
        // tab-separated column to use as the item id.
        tmp_buf.Reset()
        tmp_buf.Write(scanner.Bytes())
        line := tmp_buf.Bytes()
        item_id, _ := tmp_buf.ReadBytes('\t')
        item_id = item_id[0 : len(item_id)-1] // drop the trailing '\t'
        size := uint32(len(line))
        buffer.Write(line)
        tmp_list = append(tmp_list, TempRowBuf{item_id, offset, size})
        offset += size
        if nb <= 0 {
            compressed := this.compress(buffer.Bytes())
            buff := make([]byte, len(compressed))
            copy(buff, compressed)
            var block *Block = &Block{buff}
            for _, tmp := range tmp_list {
                this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
            }
            nb = this.maxRowGroup
            offset = 0
            tmp_list = nil
            buffer.Reset()
        }
    }
    if len(tmp_list) > 0 { // flush the last, incomplete group of rows
        compressed := this.compress(buffer.Bytes())
        buff := make([]byte, len(compressed))
        copy(buff, compressed)
        var block *Block = &Block{buff}
        for _, tmp := range tmp_list {
            this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
        }
    }
}

func wsCacheHandler(cache *Cache, writer http.ResponseWriter, request *http.Request) {
    var value map[string]string = make(map[string]string)

    item_id := request.FormValue("item_id")
    value["item_id"] = item_id
    value["raw"] = cache.search(item_id)
    jsonResp, err := json.Marshal(value)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Fprintf(writer, "%s", string(jsonResp))
    }
}

func main() {
    filename := flag.String("data", "default.txt", "The data filename")
    no_http := flag.Bool("no-http", false, "Do not start an http server")
    dumpMap := flag.Bool("dump", false, "If we should dump the map to stdout")
    noCompression := flag.Bool("no-compress", false, "Disable compression")
    maxRowGroup := flag.Uint("max-row-group", 100, "How much line to group when doing compression")

    flag.Parse()
    var cache Cache
    cache.enable_compress = !*noCompression
    cache.maxRowGroup = *maxRowGroup

    cache.loadFile(*filename)

    if *dumpMap {
        cache.dump()
        fmt.Println(cache.search("100001"))
        fmt.Println(cache.search("100002"))
        fmt.Println(cache.search("100003"))
        fmt.Println(cache.search("100004"))
        fmt.Println(cache.search("100005"))
        fmt.Println(cache.search("100006"))
        fmt.Println(cache.search("100007"))
        fmt.Println(cache.search("100008"))
        fmt.Println(cache.search("100009"))
        fmt.Println(cache.search("100010"))

    }

    if !*no_http {
        http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
            wsCacheHandler(&cache, writer, request)
        })
        fmt.Println("Cache loaded, now listening on port 8585...")
        http.ListenAndServe(":8585", nil)
    }
}

Here is the test file I used (I am not pasting the 40 MB file here :p):

data.txt:

100001  bar
100002  foo
100003  bob
100004  nuts
100005  gogopowran
100006  green
100007  test
100008  alongwordwithlotofletters
100009  
100010  space space space

I launch my application like this:

time ./mybin -data=data.txt -no-http -no-compress => ok (0.6 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=100 => slow (12.1 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=1000 => still slow (10.9 sec to load)
time ./mybin -data=data.txt -no-http -max-row-group=10000 => still slow (10.6 sec to load)

Edit: applied go fmt to the code. Added an option to choose the size of the groups. Tested 3 different sizes.
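
Before changing anything, it is worth confirming where the time actually goes. Below is a minimal, standalone sketch (not part of the program above; all names and sizes in it are made up for illustration) that compares compressing the same data as one large block versus many small blocks, and writes a CPU profile with runtime/pprof, which is available in Go 1.0.x:

package main

import (
    "bytes"
    "compress/zlib"
    "fmt"
    "os"
    "runtime/pprof"
    "time"
)

// compressOnce compresses data with a fresh zlib.Writer, like Cache.compress does.
func compressOnce(data []byte) int {
    var b bytes.Buffer
    w := zlib.NewWriter(&b)
    w.Write(data)
    w.Close()
    return b.Len()
}

func main() {
    // Write a CPU profile so the hot spots can be inspected with "go tool pprof".
    f, err := os.Create("cpu.prof")
    if err != nil {
        panic(err)
    }
    pprof.StartCPUProfile(f)
    defer pprof.StopCPUProfile()

    // Fake input: 100000 tab-separated rows, a few MB in total.
    row := []byte("100001\tsome tab separated attributes here\n")
    var whole bytes.Buffer
    for i := 0; i < 100000; i++ {
        whole.Write(row)
    }
    data := whole.Bytes()

    // One big block: a single zlib.Writer for everything.
    start := time.Now()
    compressOnce(data)
    fmt.Println("one block:      ", time.Since(start))

    // Many small blocks: a new zlib.Writer per 4 KB chunk (the tail is ignored,
    // which is fine for a rough comparison).
    start = time.Now()
    for off := 0; off+4096 <= len(data); off += 4096 {
        compressOnce(data[off : off+4096])
    }
    fmt.Println("many 4KB blocks:", time.Since(start))
}

The per-call setup cost of zlib.NewWriter is likely what multiplies when the blocks are small, which matches the timings above improving slightly as -max-row-group grows.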

1 answer:

Answer 0 (score: -1)

Compressing each row individually is going to be very slow, and it also gives relatively inefficient compression to boot.

Is there a reason you are not simply compressing the entire file? Or at least one "block" at a time?
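
One cheap thing to try in that direction, while keeping the existing block layout, is to trade compression ratio for speed: zlib.NewWriterLevel with zlib.BestSpeed exists in Go 1.0.2. This is only a sketch of what Cache.compress could look like with that change (the fastCompress name and the approach are illustrative, not from the question or the answer):

// Sketch: same as Cache.compress, but with the fastest zlib level instead of
// the default one. Assumes the Cache type from the question above.
func (this *Cache) fastCompress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }
    var b bytes.Buffer
    w, err := zlib.NewWriterLevel(&b, zlib.BestSpeed) // level 1 instead of the default
    if err != nil {
        panic(err)
    }
    w.Write(data)
    w.Close()
    return b.Bytes()
}

Combined with larger blocks (a bigger -max-row-group, or compressing the whole file once as the answer suggests), this reduces both the number of zlib.Writer allocations and the work done per byte.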