我写了一个基于Huffman算法的压缩器来压缩文本:
# compressor Huffman quijote
from collections import Counter
import math
import pickle
import re
quijote = open("quijote.txt", encoding="utf8")
num_lletres = 0
llistaQ = []
for linia in quijote:
for lletra in linia:
llistaQ.append(lletra)
num_lletres = num_lletres+1
c = Counter(llistaQ)
c_ordenat = c.most_common()
c_final=c_ordenat[::-1]
for i,j in enumerate(c_final):
c_final[i]=list(c_final[i])
diccionari=[]
for i,j in c_final:
diccionari.append([i,None])
while len(c_final) > 1:
petit1=c_final[0]
petit2=c_final[1]
c_final.append([petit1[0]+petit2[0],petit1[1]+petit2[1]])
for i in petit1[0]:
for pos,x in enumerate(diccionari):
if x[0]==i:
val_antic=diccionari[pos][1]
diccionari[pos].pop(1)
if val_antic==None:
diccionari[pos].insert(1,"1")
else:
diccionari[pos].insert(1,"1"+val_antic)
for i in petit2[0]:
for pos,x in enumerate(diccionari):
if x[0]==i:
val_antic=diccionari[pos][1]
diccionari[pos].pop(1)
if val_antic==None:
diccionari[pos].insert(1,"0")
else:
diccionari[pos].insert(1,"0"+val_antic)
del c_final[0]
del c_final[0]
c_final.sort(key = lambda f: f[1])
s=0
entropia=0
llarg=0
llistat=c_ordenat[::-1]
for i,j in llistat:
freq=float(j)/float(num_lletres)
s=freq*math.log(freq,2.0)
entropia=entropia+s
for pos,z in enumerate(diccionari):
if z[0]==i:
llarg=llarg+(len(z[1])*freq)
entropia=-(entropia)
with open("taula_Huffman.txt", "wb") as taula_final:
pickle.dump(diccionari, taula_final)
cadena=''
with open("quijote.txt", encoding="utf8") as entrada, open('Huffman sortida', 'wb') as sortida:
for line in entrada:
for x, y in diccionari:
line = line.replace(x, y)
cadena=cadena+line
cadena=str(1)+cadena
bits=re.findall('........',cadena)
for i in bits:
sortida.write(bytes([int(i,2)]))
ll=len(cadena)
sob=ll%8
a=len(cadena)/8
inta=(int(a))
nc=''
for n in range(1,sob+1):
nc=nc+cadena[(inta*8)+n-1]
penultim=nc+str(1)*(8-sob)
ultim=str('{0:08b}'.format(sob))
sortida.write(bytes([int(penultim,2)]))
sortida.write(bytes([int(ultim,2)]))
print("entropia=",entropia)
print("expected length=",llarg)
quijote.close()
解压缩器:
#Descompressor Huffman
import binascii
import pickle
with open("Huffman sortida", "rb") as entrada, open('quijote descomprimit.txt','w',encoding='utf-8') as sortida, open("taula_Huffman.txt", "rb") as diccionari:
byte = entrada.read()
hexadecimal = binascii.hexlify(byte).decode()
binary=bin(int(hexadecimal, 16))[2:].zfill(8)
cadena=binary[1:]
ultim=cadena[(len(cadena)-8):(len(cadena))]
cadena=cadena[:(len(cadena)-8)]
penultim=cadena[(len(cadena)-8):(len(cadena))]
cadena=cadena[:(len(cadena)-8)]
ultim=int(ultim,2)
afegit=penultim[:ultim]
cadena=cadena+afegit
dic = pickle.load(diccionari)
dicc={}
for i in dic:
dicc[i[0]] = i[1]
dicci = {v: k for k, v in dicc.items()}
temporal=''
text=''
for i in cadena:
temporal+=i
if temporal in dicci:
text+=dicci[temporal]
temporal=''
sortida.write(text)
问题是解压缩的文件除了数字之外没问题,我的意思是,在原始文件中最后文件中的字母有相同的字母,但原始文件中的数字是最终文件中的数字apears一系列 1111 Csicauicau,11dunrdunr dunrdunrdunrdunr 111 1111dunrdunr dunrdunrdunrdunr 等等。
所以我发现失败的压缩器部分是替换指令(第74-75行):
for line in entrada:
for x, y in diccionari:
line = line.replace(x, y)
cadena=cadena+line
我用以下内容替换了这四行:
for line in entrada:
for ch in line:
for x, y in diccionari:
if ch==x:
cadena=cadena+y
break
问题在于这会使程序变得如此缓慢(在我的电脑上运行约40秒)。第一个选项是在我的电脑上大约9秒钟。
有没有办法在 内的 内更快地执行 ?或者,我的.replace(x,y)是否有解决方案,但数字失败?