在 Python 中创建阿拉伯语文本文件时出错

时间:2017-05-29 09:44:01

标签: python unicode arabic

我编写了一个python代码来读取特定文件夹下的文件中的阿拉伯文本并删除不需要的符号(例如(,),$,...等),然后将所有阿拉伯语明文保存到另一个文件夹中。

问题是代码使用以下文本创建文件:

\'c3\'cb\'d1\'ca\f1 \f0\'dd\'ed\'c7\f1 \f0\'e1\'c3\'e4\f1 \f0\'c8\'d8\'e1\'e5\'c7

而不是阿拉伯语形式的文本。

我怎样才能只以阿拉伯语形式读取和写入文本?

代码

import sys, os
import codecs
import unicodedata
from nltk.tokenize import word_tokenize

# Create the output folder for the cleaned files if it does not exist yet.
# NOTE(review): `corpus_clean` is not defined anywhere in the visible code;
# it has to be assigned the output folder path before this line runs.
if not os.path.exists(corpus_clean):
    os.makedirs(corpus_clean)

# Python 2 only idiom: `reload` is a builtin there, and reloading `sys` is
# done so that `sys.setdefaultencoding` can be called on the next statement.
reload(sys)

# Switch the interpreter-wide default string encoding to UTF-8.
# NOTE(review): code further down (e.g. `unicode(word, errors='ignore')`)
# appears to rely on this default - confirm before removing the hack.
sys.setdefaultencoding('utf8')
def readUnicodeDataFrom(inputDir):
    """Read a UTF-8 text file and return its tokens.

    The file content is decoded by ``codecs.open``, normalized to NFKD,
    re-encoded as UTF-8 and split into tokens with ``word_tokenize``.

    Args:
        inputDir: Path of the file to read (a file path, despite the name).

    Returns:
        The list of tokens returned by ``word_tokenize``.
    """
    # A context manager guarantees the handle is closed, even on error;
    # a bare ``unicode_input.close`` (no parentheses) never closes anything.
    with codecs.open(inputDir, encoding='utf8', mode='r') as unicode_input:
        unicode_data = unicode_input.read()

    # NOTE(review): the text is deliberately re-encoded to UTF-8 bytes so the
    # returned token type stays what the rest of the script expects; dropping
    # the encode() would require the writer to accept text strings.
    norm_data = unicodedata.normalize('NFKD', unicode_data).encode('utf8', 'ignore')
    return word_tokenize(norm_data)

def removeCharFrom(mylist):
    """Return the tokens of *mylist* that are neither numbers nor symbols.

    A token is dropped when it consists only of digits, when it is a digit
    string preceded by ``-``, or when it equals one of the punctuation
    tokens listed in ``to_remove``.

    Args:
        mylist: List of string tokens. It is not modified.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    # A set gives O(1) membership tests for every token.
    to_remove = frozenset(['*','#', '/', '.', '...', ':','-','_',',',';','<','>','|','\\','0','1','2','3','4','5','6','7','8','9','?',')','(','%','&','+','$','^','!','"','[',']'])

    def is_number(token):
        # Slicing instead of indexing so an empty token cannot raise IndexError.
        return token.isdigit() or (token[:1] == '-' and token[1:].isdigit())

    # Build a new list in one pass: removing items from a list while
    # iterating over it skips the element that follows each removal.
    return [token for token in mylist
            if not is_number(token) and token not in to_remove]


def writeUnicodeDataTo(outputDir, listOfWords):
    """Write every word of *listOfWords* on its own line as UTF-8 text.

    Args:
        outputDir: Path of the file to create or overwrite (a file path,
            despite the name).
        listOfWords: Iterable of words. Both UTF-8 byte strings and text
            strings are accepted; undecodable bytes are ignored.
    """
    # The context manager flushes and closes the file even if a write fails.
    with codecs.open(outputDir, encoding='utf8', mode='w') as unicode_output:
        for word in listOfWords:
            # Decode explicitly as UTF-8 instead of relying on the
            # interpreter-wide default encoding.
            if isinstance(word, bytes):
                word = word.decode('utf8', 'ignore')
            unicode_output.write(word + u'\n')

if __name__ == '__main__':
    # Walk the input folder, clean every file and write the result under the
    # same file name in the output folder.
    # NOTE(review): `yourpath` (input folder) and `corpus_clean` (output
    # folder) are not defined in the visible code; set them before running.
    i = 1
    for root, dirs, files in os.walk(yourpath, topdown=False):
        for name in files:
            # Skip the macOS Finder metadata file.
            if name == '.DS_Store':
                continue
            # The reader opens and closes the file itself, so no extra
            # (previously leaked) file handle is opened here.
            norm_words = readUnicodeDataFrom(os.path.join(root, name))
            uniq_words = removeCharFrom(norm_words)
            writeUnicodeDataTo(os.path.join(corpus_clean, name), uniq_words)
            print ('The file number: '+str(i)+'\n\n')
            i += 1
            # Show the cleaned words (not the first few unfiltered tokens).
            for word in uniq_words:
                print(u'' + word)

3 个答案:

答案 0 :(得分:1)

norm_data = unicodedata.normalize('NFKD', unicode_data).encode('utf8', 'ignore')

更改为

norm_data = unicodedata.normalize('NFKD', unicode_data)

word = unicode(word, errors='ignore')
unicode_output.write(word+'\n')#.encode('utf8', 'ignore'))

更改为

unicode_output.write(word+'\n')

原因

问题是你不应该在codecs.open(inputDir, encoding='utf8', mode='r')之后再次编码。 codecs已经为你做了。

使用unicode_output.write(word+'\n')写入文件时出现同样的问题。

答案 1 :(得分:0)

也许您应该以二进制模式读取和写入文件:

...
unicode_input = codecs.open(inputDir, encoding='utf8', mode='rb')
...
unicode_output = codecs.open(outputDir, encoding='utf8', mode='wb')

答案 2 :(得分:0)

您好,以下代码有帮助吗?

import io

# Characters to strip from every line of the input file.
unwanted = u'!@##$%^**()'
# Mapping each code point to None makes translate() delete that character.
delete_table = dict((ord(ch), None) for ch in unwanted)

new_text = []
# io.open decodes the file as UTF-8, so every line is a text string.
with io.open('arabic.txt', 'r', encoding='utf-8') as in_file:
    for each_line in in_file:
        # translate() returns a new string; keep the result, one entry per line.
        new_text.append(each_line.translate(delete_table))

# Append the cleaned lines to the output file, encoded as UTF-8 on write.
with io.open('new_arabic.txt', 'a', encoding='utf-8') as out_file:
    for line in new_text:
        out_file.write(line)