自己在Python中实现编码和解码base64文件

时间:2015-03-14 15:23:55

标签: python encoding utf-8 base64 decoding

我自己实现base64编码时遇到问题。我已经实现了以下代码。我认为它只适用于带有英文字母的文本文件。例如,pdf文件被编码和解码,它有不同的单个字符。

def base64Encode(data):
    alphabet = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P","Q","R","S","T","U","V","W","X","Y","Z","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9","+","/"]
    bit_str = ""      
    base64_str = ""

    for char in data:
        bin_char = bin(char).lstrip("0b")
        bin_char = bin_char.zfill(8)
        bit_str += bin_char 

    brackets = [bit_str[x:x+6] for x in range(0,len(bit_str),6)]

    for bracket in brackets:
        if(len(bracket) < 6):
            bracket = bracket + (6-len(bracket))*"0" 
        base64_str += alphabet[int(bracket,2)]

    # print(brackets[-4:])
    #if(bracket[-1:)
    #print(len(base64_str))
    #if(len(base64_str) != 76):
    #    base64_str += "="

    return base64_str

def base64Decode(text):
        alphabet = ["A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9","+","/"]
        bit_str = ""
        text_str = ""

        for char in text:
            if char in alphabet:
                bin_char = bin(alphabet.index(char)).lstrip("0b")
                bin_char = bin_char.zfill(6)
                bit_str += bin_char

        brackets = [bit_str[x:x+8] for x in range(0,len(bit_str),8)]

        for bracket in brackets:
            text_str += chr(int(bracket,2))

        return text_str.encode("UTF-8")

w = open("encode.txt", "w") 
with open("bla.txt", "rb") as f:
    byte = f.read(57)
    while byte:
        w.write(base64Encode(byte))
        w.write("\n")
        byte = f.read(57)
    w.close()
f.close()

w = open("decode.txt", "wb") 
with open("encode.txt", "r") as f:
    byte = f.read(77)
    while byte:
        w.write(base64Decode(byte))
        byte = f.read(77)
    w.close()
f.close()

在我看来,这行“return text_str.encode(”UTF-8“)”应该没有解码为UTF-8。但是,如果只留下“return text_str”,则会收到错误:TypeError:'str'不支持缓冲区接口。

bla.txt:

Phil Mercer reports on Cyclone Pam which has ravaged the Pacific nation of Vanuatu. Video courtesy of YouTube/Isso Nihmei at 350.org

Save the Children's Vanuatu country director Tom Skirrow said on Saturday: "The scene here this morning is complete devastation - houses are destroyed, trees are down, roads are blocked and people are wandering the streets looking for help.

ĄŚĆŹŻÓ

encode.txt

UGhpbCBNZXJjZXIgcmVwb3J0cyBvbiBDeWNsb25lIFBhbSB3aGljaCBoYXMgcmF2YWdlZCB0aGUg
UGFjaWZpYyBuYXRpb24gb2YgVmFudWF0dS4gVmlkZW8gY291cnRlc3kgb2YgWW91VHViZS9Jc3Nv
IE5paG1laSBhdCAzNTAub3JnDQoNClNhdmUgdGhlIENoaWxkcmVuJ3MgVmFudWF0dSBjb3VudHJ5
IGRpcmVjdG9yIFRvbSBTa2lycm93IHNhaWQgb24gU2F0dXJkYXk6ICJUaGUgc2NlbmUgaGVyZSB0
aGlzIG1vcm5pbmcgaXMgY29tcGxldGUgZGV2YXN0YXRpb24gLSBob3VzZXMgYXJlIGRlc3Ryb3ll
ZCwgdHJlZXMgYXJlIGRvd24sIHJvYWRzIGFyZSBibG9ja2VkIGFuZCBwZW9wbGUgYXJlIHdhbmRl
cmluZyB0aGUgc3RyZWV0cyBsb29raW5nIGZvciBoZWxwLg0KDQrEhMWaxIbFucW7w5M

decode.txt

Phil Mercer reports on Cyclone Pam which has ravaged the Pacific nation of Vanuatu. Video courtesy of YouTube/Isso Nihmei at 350.org

Save the Children's Vanuatu country director Tom Skirrow said on Saturday: "The scene here this morning is complete devastation - houses are destroyed, trees are down, roads are blocked and people are wandering the streets looking for help.

ÄÅÄŹŻÃ

由页面编码的相同文本:http://www.motobit.com/util/base64-decoder-encoder.asp

UGhpbCBNZXJjZXIgcmVwb3J0cyBvbiBDeWNsb25lIFBhbSB3aGljaCBoYXMgcmF2YWdlZCB0aGUg
UGFjaWZpYyBuYXRpb24gb2YgVmFudWF0dS4gVmlkZW8gY291cnRlc3kgb2YgWW91VHViZS9Jc3Nv
IE5paG1laSBhdCAzNTAub3JnDQoNClNhdmUgdGhlIENoaWxkcmVuJ3MgVmFudWF0dSBjb3VudHJ5
IGRpcmVjdG9yIFRvbSBTa2lycm93IHNhaWQgb24gU2F0dXJkYXk6ICJUaGUgc2NlbmUgaGVyZSB0
aGlzIG1vcm5pbmcgaXMgY29tcGxldGUgZGV2YXN0YXRpb24gLSBob3VzZXMgYXJlIGRlc3Ryb3ll
ZCwgdHJlZXMgYXJlIGRvd24sIHJvYWRzIGFyZSBibG9ja2VkIGFuZCBwZW9wbGUgYXJlIHdhbmRl
cmluZyB0aGUgc3RyZWV0cyBsb29raW5nIGZvciBoZWxwLg0KDQrEhMWaxIbFucW7w5M=

除了“=”之外,它是相同的,由于文件开头的错误而忽略了实现。

以PDF格式提供原始文件样本:

%PDF-1.5
%µµµµ
1 0 obj
<</Type/Catalog/Pages 2 0 R/Lang(pl-PL) /StructTreeRoot 8 0 R/MarkInfo<</Marked true>>>>
endobj
2 0 obj
<</Type/Pages/Count 1/Kids[ 3 0 R] >>
endobj
3 0 obj
<</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/MediaBox[ 0 0 595.32 841.92] /Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>>
endobj
4 0 obj
<</Filter/FlateDecode/Length 110>>
stream
xœUÌ­
€@ྰï0QËÝ®Èiž?(†kb°hòý«ZD˜4ßÀΨ*;…¡xº  ¨#“íªFrÄI!w…˜2ËQ81®D<™ÇS=Ó’léŠ82µ·>^åŒÊO-  >[´SÀ 
endstream
endobj
5 0 obj
<</Type/Font/Subtype/TrueType/Name/F1/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 6 0 R/FirstChar 32/LastChar 97/Widths 15 0 R>>
endobj
6 0 obj
<</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[ -503 -250 1240 750] /FontFile2 16 0 R>>
endobj
7 0 obj

执行脚本后:

%PDF-1.5
%µµµµ
1 0 obj
<</Type/Catalog/Pages 2 0 R/Lang(pl-PL) /StructTreeRoot 8 0 R/MarkInfo<</Marked true>>>>
endobj
2 0 obj
<</Type/Pages/Count 1/Kids[ 3 0 R] >>
endobj
3 0 obj
<</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/MediaBox[ 0 0 595.32 841.92] /Contents 4 0 R/Group<</Type/Group/S/Transparency/CS/DeviceRGB>>/Tabs/S/StructParents 0>>
endobj
4 0 obj
<</Filter/FlateDecode/Length 110>>
stream
xUÌ­
@ྰï0QËÝ®Èi?(kb°hòý«ZD4ßÀΨ*;¡xº  ¨#íªFrÄI!w2ËQ81®D<ÇS=Ólé82µ·>^åÊO-  >[´SÀ 
endstream
endobj
5 0 obj
<</Type/Font/Subtype/TrueType/Name/F1/BaseFont/ABCDEE+Calibri/Encoding/WinAnsiEncoding/FontDescriptor 6 0 R/FirstChar 32/LastChar 97/Widths 15 0 R>>
endobj
6 0 obj
<</Type/FontDescriptor/FontName/ABCDEE+Calibri/Flags 32/ItalicAngle 0/Ascent 750/Descent -250/CapHeight 750/AvgWidth 521/MaxWidth 1743/FontWeight 400/XHeight 250/StemV 52/FontBBox[ -503 -250 1240 750] /FontFile2 16 0 R>>
endobj
7 0 obj

差异在例如第15行和第16行的开头。

我的目标是加载文件并在base64中对其进行编码,然后解码并获取相同的文件。适合使用。 我认为错误在数据读取或写入或编码中。有什么建议?

3 个答案:

答案 0 :(得分:0)

我的第一个建议是排除故障:确定您是否未能正确编码或解码,或两者兼而有之。使用工作实用程序和您的应用程序对文件进行编码并进行比较。使用您的应用程序和工作实用程序解码正确编码的文件并进行比较。 第二个建议:将数据作为单个字节处理,而不是可能被解释为UTF-8的文本。

以二进制模式打开PDF文件。有关如何执行此操作,请参阅Reading binary file in Python and looping over each byte。将原始字节传递给base64Encode。不要使用bin函数将字符串转换为二进制文件。

答案 1 :(得分:0)

我能够完成这项任务。替换.encode(“latin-1”)上的.encode(“UTF-8”)行,它至少适用于pdf文件。

答案 2 :(得分:0)

我已经修改了原始代码。这适用于文本,PNG和PDF,我还没有尝试过其他文件类型,但我希望它可以在它们上使用。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 16 07:38:19 2019

@author: tracyanne
"""
import os

class Base64():

    def __init__(self):

        ## We only need to do this once
        self.b64 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P","Q","R","S","T","U","V","W","X","Y","Z","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9","+","/"]


    def Encode(self, data):
        alphabet = self.b64
        bit_str = ""
        base64_str = ""

        for char in data:
            bin_char = bin(char).lstrip("0b")
            bin_char = bin_char.zfill(8)
            bit_str += bin_char

        brackets = [bit_str[x:x+6] for x in range(0,len(bit_str),6)]

        for bracket in brackets:
            if(len(bracket) < 6):
                bracket = bracket + (6-len(bracket))*"0"
            base64_str += alphabet[int(bracket,2)]

        ##Add padding characters to maintain compatibility with forced padding
        padding_indicator = len(base64_str) % 4
        if padding_indicator == 3:
            base64_str += "="
        elif  padding_indicator == 2:
            base64_str += "=="

        return base64_str

    def Decode(self, text, eof):
        alphabet = self.b64
        bit_str = ""
        text_str = ""

        for char in text:
            if char in alphabet:
                bin_char = bin(alphabet.index(char)).lstrip("0b")
                bin_char = bin_char.zfill(6)
                bit_str += bin_char

        brackets = [bit_str[x:x+8] for x in range(0,len(bit_str),8)]

        for bracket in brackets:
            ## When eof ignore last value in brackets to remove \x00
            if eof and brackets[len(brackets) -1] == bracket:
                pass
            else:
                text_str += chr(int(bracket,2))

        ## encode string as Latin-1 == ISO-8859-1
        return text_str.encode("ISO-8859-1")

    def base64Encode(self, inFile, outFile):
        w = open(outFile, "w")
        with open(inFile, "rb") as f:
            byte = f.read(57)
            while byte:
                w.write(self.Encode(byte))
                w.write("\n")
                byte = f.read(57)
            w.close()
        f.close()

    def base64Decode(self, inFile, outFile):
        ## Get size of input file for later comparison
        fsize = os.path.getsize(inFile)
        incsize = 0
        eof = False

        w = open(outFile, "wb")
        with open(inFile, "r") as f:
            byte = f.read(77)
            while byte:
                ## keep current dataread and if current data read ==
                ## input file size set eof True
                incsize += len(byte)
                if fsize - incsize == 0:
                    eof = True
                ## Pass in eof to Decode
                w.write(base64.base64Decode(byte, eof))
                byte = f.read(77)
            w.close()
        f.close()