我正在为我的霍夫曼压缩尝试构建频率表。我知道如何制作压缩树,但如果我必须在汇编程序中构建代码,我就完全迷失了。任何人都可以帮助我构建汇编语言中霍夫曼压缩的频率表吗?
答案 0 :(得分:2)
好的,亲爱的H.N.,霍夫曼算法有一种用数组而不是树的简陋实现。我知道它有效,因为我曾用它完成过古老的Pascal作业(虽然不像Thomas Kilian那么老),它非常简单:
步骤1:统计要压缩的文件中的所有字符,将字符及其计数器放入数组中并按降序(DESCENDING)排序(最常见的字符排在最前,最不常见的排在最后)。例如:
第2步:以统一的方式为每个字符分配霍夫曼代码:每个字符的代码由一个1位加上若干个0组成,0的个数取决于字符在数组中的位置:第一个字符获得代码10,第二个字符获得代码100,第三个获得1000,依此类推。如果我们要使用树,下一张图片会显示等效代码:
请注意,前一个树只是数组的实现,这就是我们可以使用数组生成霍夫曼代码的原因。不要紧张,我们不使用树木,只是为了说明我们的观点。
其余的就是老生常谈了:把这些1和0转换成字节内的位,并将字节存储在生成的(压缩率并不太高的)文件中。
非常重要:将霍夫曼代码及其在文件开头所代表的字符保存为字符串。例如:C = 10,E = 100,R = 1000,A = 10000。我们需要它来解压缩我们的文件。
现在来看解压缩。让我们看看“ARE”这个词是如何压缩的:
ARE = 100001000100
解码统一的霍夫曼代码很容易,因为第1位是分隔符,它们标记每个霍夫曼代码的开头和它们代表的字符。算法很简单:
记住,这是一个很差的Huffman实现,但它确实有效。压缩级别取决于字符频率。
我不确定我是否解释了一切。有问题吗?
编辑:哦,是的!差点忘了:最后一个霍夫曼代码可能不会填满最后一个字节,所以用1位填充它,没有零。我们将忽略连续1,因为我们知道连续1只是填充符(有效代码是1后跟零,而不是1后跟1)。
压缩程序的8086汇编代码
下一个代码是用EMU8086制作的。首先,询问用户要压缩的文件的名称和压缩文件的名称,然后计算填充频率数组的所有字符,按降序排列数组,将字符数组保存在压缩文件中,以及最后,从源文件中读取每个字符并将霍夫曼代码存储在另一个文件中。压缩级别可以是好的也可以是坏的,具体取决于字符频率:如果许多字符重复相同的次数,压缩将是不好的(文件可能比原始文件大),如果大多数字符重复不同的次数,压缩比较好(比原来小)。 警告:这是一个8086程序,它无法处理大于64 Kb(65535字节)的文件,因为计数器是16位(0..65535),它可能压缩更大的文件,只要没有一个字节重复超过65535次(这很难知道),也许将来我会发布64位版本。
.model small
.stack 100h
;-----------------------------------------
.data
;SYMBOL TABLE. CHAR_ARRAY HOLDS THE 256 BYTE VALUES AND FREQ_ARRAY
;HOLDS ONE 16-BIT OCCURRENCE COUNTER PER BYTE VALUE.
char_array db 256 dup(?)
freq_array dw 256 dup(0)

;SOURCE FILE: PROMPT, KEYBOARD INPUT BUFFER AND FILE HANDLE.
;BUFFER LAYOUT: BYTE 0 = CAPACITY (99, SO 98 CHARACTERS + CHR(13)),
;BYTE 1 = NUMBER OF CHARACTERS TYPED, BYTES 2.. = THE TEXT ITSELF.
msj1 db 13,10,'Enter name of file to compress: $'
filenameS db 99, ?, 99 dup(0)
filehandlerS dw ?

;DESTINATION FILE: SAME LAYOUT AS THE SOURCE FILE ENTRIES.
msj2 db 13,10,'Enter name of compressed file : $'
filenameD db 99, ?, 99 dup(0)
filehandlerD dw ?

msj3 db 13,10,'The file was compressed.$'

the_char db ?                   ;LAST BYTE READ FROM THE SOURCE FILE.
i dw ?                          ;OUTER INDEX OF THE SORT.
j dw ?                          ;INNER INDEX OF THE SORT.

;MASK0[ N ] CLEARS BIT N WHEN COMBINED WITH "AND".
mask0 db 11111110b, 11111101b, 11111011b, 11110111b
      db 11101111b, 11011111b, 10111111b, 01111111b
;MASK1[ N ] SETS BIT N WHEN COMBINED WITH "OR".
mask1 db 00000001b, 00000010b, 00000100b, 00001000b
      db 00010000b, 00100000b, 01000000b, 10000000b

the_byte db ?                   ;OUTPUT BYTE BEING FILLED WITH HUFFMAN BITS.
;-----------------------------------------
.code
start:
;PROGRAM ENTRY POINT. MAKE DS ADDRESS THE DATA SEGMENT BEFORE ANY
;VARIABLE IS TOUCHED.
    mov ax, @data
    mov ds, ax

;ASK FOR BOTH FILE NAMES.
    call get_source_file        ;NAME OF THE FILE TO COMPRESS.
    call get_destination_file   ;NAME OF THE COMPRESSED FILE.

;BUILD THE SYMBOL TABLE AND ORDER IT BY FREQUENCY.
    call fill_chars             ;CHAR_ARRAY = 0,1,2,...,255.
    call count_chars            ;FREQ_ARRAY = OCCURRENCES OF EACH BYTE.
    call bubble_sort_descending ;MOST FREQUENT BYTE FIRST.

;ENCODE THE SOURCE FILE INTO THE DESTINATION FILE.
    call huffman_file

;PAUSE UNTIL A KEY IS PRESSED (INPUT WITHOUT ECHO).
    mov ah, 7
    int 21h

;TERMINATE WITH EXIT CODE 0.
    mov ah, 4ch
    mov al, 0
    int 21h
;-----------------------------------------
get_source_file proc
;ASKS FOR THE NAME OF THE FILE TO COMPRESS AND LEAVES IT AS A
;ZERO-TERMINATED STRING STARTING AT FILENAMES + 2.
;MODIFIES: AX, CX, DX, SI.

;SHOW THE PROMPT.
    mov ah, 9
    mov dx, offset msj1
    int 21h

;READ A LINE FROM THE KEYBOARD INTO THE INPUT BUFFER.
    mov dx, offset filenameS
    mov ah, 0Ah
    int 21h

;THE TYPED TEXT ENDS WITH CHR(13), BUT THE FILE SERVICES NEED THE
;NAME TO END WITH CHR(0). BYTE 1 OF THE BUFFER IS THE TYPED LENGTH,
;SO THE CHR(13) IS LENGTH + 1 BYTES PAST THAT LENGTH BYTE.
    lea si, filenameS + 1       ;SI -> LENGTH BYTE.
    xor cx, cx
    mov cl, [ si ]              ;CX = TYPED LENGTH.
    inc cx                      ;SKIP THE LENGTH BYTE ITSELF.
    add si, cx                  ;SI -> CHR(13).
    mov byte ptr [ si ], 0      ;OVERWRITE CHR(13) WITH THE TERMINATOR.
    ret
get_source_file endp
;-----------------------------------------
get_destination_file proc
;ASKS FOR THE NAME OF THE COMPRESSED FILE AND LEAVES IT AS A
;ZERO-TERMINATED STRING STARTING AT FILENAMED + 2.
;MODIFIES: AX, CX, DX, SI.

;SHOW THE PROMPT.
    mov ah, 9
    mov dx, offset msj2
    int 21h

;READ A LINE FROM THE KEYBOARD INTO THE INPUT BUFFER.
    mov dx, offset filenameD
    mov ah, 0Ah
    int 21h

;THE TYPED TEXT ENDS WITH CHR(13), BUT THE FILE SERVICES NEED THE
;NAME TO END WITH CHR(0). BYTE 1 OF THE BUFFER IS THE TYPED LENGTH,
;SO THE CHR(13) IS LENGTH + 1 BYTES PAST THAT LENGTH BYTE.
    lea si, filenameD + 1       ;SI -> LENGTH BYTE.
    xor cx, cx
    mov cl, [ si ]              ;CX = TYPED LENGTH.
    inc cx                      ;SKIP THE LENGTH BYTE ITSELF.
    add si, cx                  ;SI -> CHR(13).
    mov byte ptr [ si ], 0      ;OVERWRITE CHR(13) WITH THE TERMINATOR.
    ret
get_destination_file endp
;-----------------------------------------
;INITIALIZES CHAR_ARRAY SO THAT CHAR_ARRAY[ N ] = N FOR N = 0..255.
;MODIFIES: CX, SI.
fill_chars proc
    lea si, char_array          ;SI -> NEXT SLOT TO FILL.
    xor cx, cx                  ;CL = VALUE TO STORE, STARTING AT 0.
store_code:
    mov [ si ], cl
    inc si
    inc cx
    cmp cx, 256
    jne store_code              ;STOP ONCE ALL 256 SLOTS ARE WRITTEN.
    ret
fill_chars endp
;-----------------------------------------
;READS THE SOURCE FILE BYTE BY BYTE AND INCREASES THE COUNTER OF
;EACH BYTE VALUE IN FREQ_ARRAY. THE BYTE ITSELF IS THE INDEX OF ITS
;OWN COUNTER, EXAMPLE: THE COUNTER FOR 'A' IS ENTRY 65 OF FREQ_ARRAY.
;
;INPUT   : FILENAMES + 2 = ZERO-TERMINATED NAME OF THE SOURCE FILE.
;OUTPUT  : FREQ_ARRAY UPDATED. IF THE FILE CANNOT BE OPENED THE
;          ARRAY IS LEFT UNTOUCHED AND THE PROCEDURE JUST RETURNS.
;MODIFIES: AX, BX, CX, DX, SI, FILEHANDLERS, THE_CHAR.
count_chars proc
;OPEN FILE.
    mov ah, 3dh                 ;SERVICE TO OPEN FILE.
    mov al, 0                   ;OPEN AS READ ONLY.
    mov dx, offset filenameS + 2
    int 21h
    jc  count_done              ;CARRY SET = OPEN FAILED. AX HOLDS AN
                                ;ERROR CODE, NOT A HANDLE, SO DO NOT
                                ;READ FROM IT NOR CLOSE IT.
    mov filehandlerS, ax        ;NECESSARY FOR OPERATIONS ON FILE.
;COUNT CHARACTERS.
reading:
;READ ONE CHAR FROM FILE.
    mov ah, 3fh                 ;SERVICE TO READ FROM FILE.
    mov bx, filehandlerS
    mov cx, 1                   ;HOW MANY BYTES TO READ.
    mov dx, offset the_char     ;WHERE TO STORE THE READ BYTES.
    int 21h
    jc  end_reading             ;CARRY SET = READ ERROR. AX IS THEN A
                                ;NON-ZERO ERROR CODE, SO WITHOUT THIS
                                ;CHECK THE LOOP WOULD NEVER FINISH.
;CHECK END OF FILE.
    cmp ax, 0
    je  end_reading             ;IF READ ZERO BYTES, FINISH.
;LOCATE THE COUNTER OF THE BYTE JUST READ.
    mov si, offset freq_array
    mov al, the_char            ;USE CHAR AS INDEX OF ITS OWN COUNTER.
    mov ah, 0                   ;CLEAR AH TO USE AX.
    shl ax, 1                   ;AX * 2, BECAUSE EVERY COUNTER IS 2 BYTES.
    add si, ax                  ;SI POINTS TO THE COUNTER.
;INCREASE THE COUNTER, BUT HOLD IT AT 65535 INSTEAD OF LETTING IT
;WRAP AROUND TO 0: A WRAPPED COUNTER WOULD MAKE A VERY FREQUENT
;BYTE LOOK RARE WHEN THE ARRAYS ARE SORTED.
    cmp word ptr [ si ], 0FFFFh
    je  reading
    inc word ptr [ si ]
    jmp reading
end_reading:
;CLOSE FILE.
    mov ah, 3eh                 ;SERVICE TO CLOSE FILE.
    mov bx, filehandlerS
    int 21h
count_done:
    ret
count_chars endp
;-----------------------------------------
;ORDERS FREQ_ARRAY FROM HIGHEST TO LOWEST COUNTER AND APPLIES THE
;SAME EXCHANGES TO CHAR_ARRAY, SO EVERY BYTE VALUE STAYS PAIRED
;WITH ITS OWN COUNTER.
;for ( i = 0; i < 255; i++ )
;  for ( j = i+1; j <= 255; j++ )
;    if ( freq[i] < freq[j] )   // SMALLER ONE MOVES TOWARDS THE END.
;      exchange freq[i],freq[j] and char[i],char[j]
;MODIFIES: AX, CX, SI, DI, I, J.
bubble_sort_descending proc
    mov i, 0
outer_pass:
    mov ax, i
    inc ax
    mov j, ax                   ;J STARTS AT I + 1.
inner_pass:
;LOAD BOTH COUNTERS. INDEXES ARE DOUBLED BECAUSE COUNTERS ARE WORDS.
    mov si, i
    shl si, 1
    mov di, j
    shl di, 1
    mov ax, freq_array[ si ]    ;AX = FREQ[ I ].
    mov cx, freq_array[ di ]    ;CX = FREQ[ J ].
    cmp ax, cx
    jae keep_order              ;FREQ[ I ] >= FREQ[ J ]: ALREADY IN ORDER.
;EXCHANGE THE TWO COUNTERS.
    mov freq_array[ si ], cx
    mov freq_array[ di ], ax
;EXCHANGE THE TWO BYTE VALUES. NO DOUBLING: ONE BYTE PER ENTRY.
    mov si, i
    mov di, j
    mov al, char_array[ si ]    ;AL = CHAR[ I ].
    mov ah, char_array[ di ]    ;AH = CHAR[ J ].
    mov char_array[ si ], ah
    mov char_array[ di ], al
keep_order:
    inc j
    cmp j, 255
    jbe inner_pass              ;J RUNS UP TO AND INCLUDING 255.
    inc i
    cmp i, 255
    jb  outer_pass              ;I RUNS UP TO AND INCLUDING 254.
    ret
bubble_sort_descending endp
;-----------------------------------------
;READS EACH CHARACTER FROM THE SOURCE FILE AND STORES THE HUFFMAN
;CODE OF EACH CHARACTER IN THE DESTINATION FILE. THE DESTINATION
;FILE STARTS WITH THE SORTED CHAR_ARRAY, FOLLOWED BY THE CODE BITS.
;
;INPUT   : FILENAMES + 2, FILENAMED + 2 = ZERO-TERMINATED FILE NAMES.
;          CHAR_ARRAY ALREADY SORTED BY DESCENDING FREQUENCY.
;OUTPUT  : ON SUCCESS THE DESTINATION FILE IS WRITTEN AND MSJ3 IS
;          DISPLAYED. IF THE SOURCE CANNOT BE OPENED, THE DESTINATION
;          CANNOT BE CREATED, OR A READ FAILS, EVERY FILE THAT WAS
;          OPENED IS CLOSED AND THE PROCEDURE RETURNS WITHOUT
;          DISPLAYING THE SUCCESS MESSAGE.
;MODIFIES: AX, BX, CX, DX, SI, BP, FILEHANDLERS, FILEHANDLERD,
;          THE_CHAR, THE_BYTE.
huffman_file proc
;OPEN SOURCE FILE.
    mov ah, 3dh                 ;SERVICE TO OPEN FILE.
    mov al, 0                   ;OPEN AS READ ONLY.
    mov dx, offset filenameS + 2
    int 21h
    jc  huffman_done            ;CARRY SET = OPEN FAILED, NOTHING TO CLOSE.
    mov filehandlerS, ax        ;NECESSARY FOR OPERATIONS ON FILE.
;CREATE DESTINATION FILE.
    mov ah, 3ch                 ;SERVICE TO CREATE FILE.
    mov cx, 0                   ;NO ATTRIBUTES.
    mov dx, offset filenameD + 2
    int 21h
    jc  close_source            ;CARRY SET = CREATE FAILED, ONLY THE
                                ;SOURCE FILE IS OPEN.
    mov filehandlerD, ax        ;NECESSARY FOR OPERATIONS ON FILE.
    call save_chars             ;SAVE ARRAY OF CHARS AT FILE'S BEGIN-
                                ;NING, NECESSARY TO DECOMPRESS.
;CODIFY CHARACTERS.
    mov bp, 0                   ;START SAVING BITS IN BIT 0.
codifying:
;READ ONE CHAR FROM SOURCE FILE.
    mov ah, 3fh                 ;SERVICE TO READ FROM FILE.
    mov bx, filehandlerS
    mov cx, 1                   ;HOW MANY BYTES TO READ.
    mov dx, offset the_char     ;WHERE TO STORE THE READ BYTES.
    int 21h
    jc  read_failed             ;CARRY SET = READ ERROR. AX IS THEN A
                                ;NON-ZERO ERROR CODE, SO THE END-OF-FILE
                                ;TEST BELOW WOULD NEVER STOP THE LOOP.
;CHECK END OF FILE.
    cmp ax, 0
    je  end_codifying           ;IF READ ZERO BYTES, FINISH.
    call huffman_char           ;CODIFY CHAR, SAVE CODE TO DESTINATION FILE.
    jmp codifying               ;REPEAT PROCESS FOR NEXT CHAR.
end_codifying:
    call last_byte              ;IF LAST BYTE WAS NOT FULL, FILL IT.
;CLOSE FILES.
    mov ah, 3eh
    mov bx, filehandlerD
    int 21h
    mov ah, 3eh
    mov bx, filehandlerS
    int 21h
;REPORT SUCCESS.
    mov ah, 9
    mov dx, offset msj3
    int 21h
    ret
;FAILURE PATHS: RELEASE WHATEVER WAS OPENED, NO SUCCESS MESSAGE.
read_failed:
    mov ah, 3eh
    mov bx, filehandlerD
    int 21h
close_source:
    mov ah, 3eh
    mov bx, filehandlerS
    int 21h
huffman_done:
    ret
huffman_file endp
;-----------------------------------------
;WRITES THE 256 BYTES OF CHAR_ARRAY TO THE DESTINATION FILE. THE
;DECOMPRESSOR NEEDS THEM: THE ARRAY IS STORED IN SORTED ORDER, SO
;THE POSITION OF A BYTE VALUE TELLS ITS CODE (FIRST ENTRY IS 10,
;SECOND IS 100, AND SO ON) AND THE CODES THEMSELVES NEED NOT BE
;STORED. STORING THE CODES WOULD COST FAR MORE: THE 256 CODES HOLD
;1 + 2 + ... + 256 = (256*(256+1))/2 = 32,896 ZEROES PLUS 256 ONES,
;THAT IS 33,152 BITS, AGAINST ONLY 256 BYTES FOR THE ARRAY.
;INPUT   : FILEHANDLERD = HANDLE OF THE DESTINATION FILE.
;MODIFIES: AX, BX, CX, DX.
save_chars proc
    mov bx, filehandlerD        ;FILE TO WRITE TO.
    mov cx, 256                 ;NUMBER OF BYTES TO WRITE.
    lea dx, char_array          ;DATA TO WRITE.
    mov ah, 40h                 ;SERVICE TO WRITE ON FILE.
    int 21h
    ret
save_chars endp
;-----------------------------------------
;APPENDS THE CODE OF "THE_CHAR" TO THE OUTPUT BIT STREAM. A CODE IS
;ONE 1 BIT FOLLOWED BY (POSITION OF THE CHAR IN CHAR_ARRAY) + 1
;ZERO BITS. BITS ARE PLACED IN THE_BYTE STARTING AT BIT NUMBER BP;
;EVERY TIME BIT 7 HAS BEEN USED THE BYTE IS WRITTEN TO THE
;DESTINATION FILE AND BP GOES BACK TO 0.
;EXAMPLE: STARTING WITH BP = 0, THE CODE "1000" LEAVES BP = 4, WHILE
;THE 9-BIT CODE "100000000" WRITES ONE FULL BYTE AND LEAVES BP = 1.
;INPUT   : THE_CHAR = BYTE TO CODIFY, BP = NEXT FREE BIT (0..7).
;OUTPUT  : BP = NEXT FREE BIT AFTER THE CODE.
;MODIFIES: AX, CX, SI, THE_BYTE (PLUS BX, DX WHEN A BYTE IS WRITTEN).
huffman_char proc
;LEADING 1 BIT.
    mov si, bp
    mov al, mask1[ si ]         ;AL = MASK THAT TURNS ON BIT BP.
    or  the_byte, al
    inc bp
    cmp bp, 8
    jb  leading_bit_done        ;BP STILL 0..7: BYTE NOT FULL YET.
    call write_byte             ;BYTE FULL: SAVE IT, BP BECOMES 0.
leading_bit_done:
;NUMBER OF ZERO BITS = POSITION OF THE CHAR + 1.
    mov al, the_char
    call find_char              ;CX = POSITION OF THE CHAR.
    inc cx
emit_zero:
    mov si, bp
    mov al, mask0[ si ]         ;AL = MASK THAT TURNS OFF BIT BP.
    and the_byte, al
    inc bp
    cmp bp, 8
    jb  zero_bit_done           ;BP STILL 0..7: BYTE NOT FULL YET.
    push cx                     ;WRITE_BYTE OVERWRITES CX.
    call write_byte             ;BYTE FULL: SAVE IT, BP BECOMES 0.
    pop cx
zero_bit_done:
    loop emit_zero              ;ONE ZERO DONE, REPEAT WHILE CX > 0.
    ret
huffman_char endp
;-----------------------------------------
;SAVES THE_BYTE (A BYTE COMPLETELY FILLED WITH CODE BITS) TO THE
;DESTINATION FILE AND RESTARTS THE BIT POSITION.
;INPUT   : FILEHANDLERD = HANDLE OF THE DESTINATION FILE.
;OUTPUT  : BP = 0.
;MODIFIES: AX, BX, CX, DX, BP.
write_byte proc
    mov bx, filehandlerD        ;FILE TO WRITE TO.
    mov cx, 1                   ;NUMBER OF BYTES TO WRITE.
    lea dx, the_byte            ;DATA TO WRITE.
    mov ah, 40h                 ;SERVICE TO WRITE ON FILE.
    int 21h
    mov bp, 0                   ;NEXT BYTE STARTS AT BIT 0.
    ret
write_byte endp
;-----------------------------------------
;LINEAR SEARCH OF A BYTE VALUE IN CHAR_ARRAY.
;PARAMETER : AL = CHAR TO FIND.
;RETURNS   : CX = CHAR POSITION IN ARRAY (0..255), OR 256 IF THE
;            WHOLE ARRAY WAS SCANNED WITHOUT A MATCH.
;MODIFIES  : CX, SI.
find_char proc
    lea si, char_array          ;SI = POINTER TO CURRENT ENTRY.
    xor cx, cx                  ;CX = INDEX OF CURRENT ENTRY.
probe:
    cmp [ si ], al
    je  located                 ;CURRENT ENTRY IS THE CHAR: FINISH.
    inc si                      ;ADVANCE POINTER.
    inc cx                      ;ADVANCE INDEX.
    cmp cx, 256
    jb  probe                   ;REPEAT WHILE INDEX IS 0..255.
located:
    ret
find_char endp
;-----------------------------------------
;FLUSHES A PARTIALLY FILLED OUTPUT BYTE: THE UNUSED BITS (BP..7) ARE
;SET TO 1 AND THE BYTE IS WRITTEN TO THE DESTINATION FILE. WHEN
;BP = 0 THERE ARE NO PENDING BITS AND NOTHING IS WRITTEN.
;INPUT   : BP = NEXT FREE BIT (0..7).
;MODIFIES: AX, SI, THE_BYTE (PLUS BX, CX, DX, BP WHEN A BYTE IS WRITTEN).
last_byte proc
    test bp, bp
    jz  padding_done            ;NO PENDING BITS: NOTHING TO FLUSH.
pad_bit:
    mov si, bp
    mov al, mask1[ si ]         ;AL = MASK THAT TURNS ON BIT BP.
    or  the_byte, al
    inc bp
    cmp bp, 8
    jb  pad_bit                 ;KEEP GOING UNTIL BIT 7 IS SET.
    call write_byte             ;BYTE IS FULL. SAVE IT TO FILE.
padding_done:
    ret
last_byte endp
;-----------------------------------------
end start