I'm writing a small program in Go (v1.0.2). I'm trying to load rows into memory so that I can later look them up by an index (the first column of each row). To save space, I compress the rows with zlib; rows are grouped into blocks, and each block is compressed as a whole.
Items are loaded from a file. An item is represented by one line, which holds a number of tab-separated attributes.
It works correctly, but right now it is really slow: it takes almost 50 seconds just to load a 40 MB file! (Note that if I disable the "compress" part, it only takes 0.87 seconds.) I'm obviously doing something wrong, but I can't figure out what. Please note that I'm a beginner in this language.
Note: I'm stuck with Go 1.0.2 and cannot upgrade.
package main
import (
    "bytes"
    "compress/zlib"
    "encoding/json"
    "flag"
    "fmt"
"gotwcc/mylib"
"io"
"net/http"
"os"
)
type Block struct {
    data []byte
}
type Row struct {
    offset, len uint32
    block       *Block
}
type Cache struct {
    blocks          []Block
    indexes         map[string]*Row
    enable_compress bool
    maxRowGroup     uint
}
func (this *Cache) dump() {
    for key, value := range this.indexes {
        fmt.Printf("[%s] = \"%s\"\n", key, value.block)
    }
}
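// search returns the row stored under item_id, decompressing its whole block first.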
func (this *Cache) search(item_id string) string {
    row := this.indexes[item_id]
    if row == nil {
        return "(Not found)"
    }
    block := this.uncompress(row.block.data)
    slice := block[row.offset : row.offset+row.len]
    return string(slice)
}
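// compress deflates a whole block of rows; a new zlib.Writer is allocated on every call.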
func (this *Cache) compress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }
    var b bytes.Buffer
    w := zlib.NewWriter(&b)
    w.Write(data)
    w.Close()
    return b.Bytes()
}
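// uncompress inflates a previously compressed block back into a plain byte slice.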
func (this *Cache) uncompress(data []byte) []byte {
    if !this.enable_compress {
        return data
    }
    var res bytes.Buffer
    b := bytes.NewReader(data)
    r, err := zlib.NewReader(b)
    if err != nil {
        panic(err)
    }
    io.Copy(&res, r)
    r.Close()
    return res.Bytes()
}
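// loadFile reads the data file line by line, groups maxRowGroup rows per block,
// compresses each block and indexes every row by its first tab-separated column.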
func (this *Cache) loadFile(s string) {
    type TempRowBuf struct {
        item_id     []byte
        offset, len uint32
    }
    file, err := os.Open(s)
    if err != nil {
        panic(err.Error())
    }
    defer file.Close()
    scanner := mybufio.NewScanner(file)
    scanner.Split(mybufio.ScanLines)
    var tmp_buf bytes.Buffer
    var buffer bytes.Buffer
    var tmp_list []TempRowBuf
    this.indexes = make(map[string]*Row)
    var offset uint32 = 0
    nb := this.maxRowGroup
    for scanner.Scan() {
        nb--
        tmp_buf.Reset()
        tmp_buf.Write(scanner.Bytes())
        line := tmp_buf.Bytes()
        item_id, _ := tmp_buf.ReadBytes('\t')
        item_id = item_id[0 : len(item_id)-1]
        size := uint32(len(line))
        buffer.Write(line)
        tmp_list = append(tmp_list, TempRowBuf{item_id, offset, size})
        offset += size
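        // once maxRowGroup rows have accumulated, compress them into a single block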
        if nb <= 0 {
            compressed := this.compress(buffer.Bytes())
            buff := make([]byte, len(compressed))
            copy(buff, compressed)
            var block *Block = &Block{buff}
            for _, tmp := range tmp_list {
                this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
            }
            nb = this.maxRowGroup
            offset = 0
            tmp_list = nil
            buffer.Reset()
        }
    }
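    // compress and index whatever rows are left after the last full group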
    if nb > 0 {
        compressed := this.compress(buffer.Bytes())
        buff := make([]byte, len(compressed))
        copy(buff, compressed)
        var block *Block = &Block{buff}
        for _, tmp := range tmp_list {
            this.indexes[string(tmp.item_id)] = &Row{tmp.offset, tmp.len, block}
        }
    }
}
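// wsCacheHandler answers an HTTP request with a JSON object holding the raw row for the requested item_id.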
func wsCacheHandler(cache *Cache, writer http.ResponseWriter, request *http.Request) {
    var value map[string]string = make(map[string]string)
    item_id := request.FormValue("item_id")
    value["item_id"] = item_id
    value["raw"] = cache.search(item_id)
    jsonResp, err := json.Marshal(value)
    if err != nil {
        fmt.Println("error:", err)
    } else {
        fmt.Fprintf(writer, "%s", string(jsonResp))
    }
}
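// main parses the command-line flags, loads the data file into the cache and optionally serves lookups over HTTP.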
func main() {
    filename := flag.String("data", "default.txt", "The data filename")
    no_http := flag.Bool("no-http", false, "Do not start an http server")
    dumpMap := flag.Bool("dump", false, "If we should dump the map to stdout")
    noCompression := flag.Bool("no-compress", false, "Disable compression")
    maxRowGroup := flag.Uint("max-row-group", 100, "How many lines to group when compressing")
    flag.Parse()
    var cache Cache
    cache.enable_compress = !*noCompression
    cache.maxRowGroup = *maxRowGroup
    cache.loadFile(*filename)
    if *dumpMap {
        cache.dump()
        fmt.Println(cache.search("100001"))
        fmt.Println(cache.search("100002"))
        fmt.Println(cache.search("100003"))
        fmt.Println(cache.search("100004"))
        fmt.Println(cache.search("100005"))
        fmt.Println(cache.search("100006"))
        fmt.Println(cache.search("100007"))
        fmt.Println(cache.search("100008"))
        fmt.Println(cache.search("100009"))
        fmt.Println(cache.search("100010"))
    }
    if !*no_http {
        http.HandleFunc("/", func(writer http.ResponseWriter, request *http.Request) {
            wsCacheHandler(&cache, writer, request)
        })
        fmt.Println("Cache loaded, now listening on port 8585...")
        http.ListenAndServe(":8585", nil)
    }
}
Here is the test file I used (I'm not pasting the 40 MB file here :p):
data.txt:
100001 bar
100002 foo
100003 bob
100004 nuts
100005 gogopowran
100006 green
100007 test
100008 alongwordwithlotofletters
100009
100010 space space space
I launch my application like this:
time ./mybin -data=data.txt -no-http -no-compress => ok (0.6 sec to load)
time ./mybin -data=data.txt -no-http --max-row-group=100 => slow (12.1 sec to load)
time ./mybin -data=data.txt -no-http --max-row-group=1000 => still slow (10.9 sec to load)
time ./mybin -data=data.txt -no-http --max-row-group=10000 => still slow (10.6 sec to load)
Edit: ran go fmt on the code. Added an option to choose the size of the row groups. Tested 3 different sizes.
Answer 0 (score: -1)
Compressing each row individually is only ever going to be slow, and it gives relatively poor compression to begin with.
Is there a reason you aren't simply compressing the whole file? Or at least one "block" at a time?
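A minimal sketch of what this answer is pointing at, assuming the whole payload (or at least a large chunk of it) can sit in memory at once: run a single zlib pass over one big buffer instead of creating a fresh zlib.Writer for many small groups of rows. The function name compressAll is illustrative and not part of the original program; it only relies on the "bytes" and "compress/zlib" packages already imported above.
func compressAll(data []byte) ([]byte, error) {
    // One zlib.Writer is created for the whole payload, so its fixed setup
    // cost (internal deflate state and buffers) is paid once instead of once per group.
    var out bytes.Buffer
    w := zlib.NewWriter(&out)
    if _, err := w.Write(data); err != nil {
        w.Close()
        return nil, err
    }
    if err := w.Close(); err != nil {
        return nil, err
    }
    return out.Bytes(), nil
}
With the original design, a similar effect can be approximated by passing a much larger -max-row-group value, so that each zlib.Writer is amortised over more data, at the cost of decompressing a bigger block on every search.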