您好,我正在使用 Keras 做语言翻译(Language translation)。
我有一个英语文本(English text)文件和一个印地语文本(Hindi text)文件。
我遇到了 "UnicodeDecodeError" 错误。
我认为这可能是因为它无法将非 Unicode 文本转换为 Unicode。
请告诉我该如何解决。GitHub 链接如下:
https://github.com/shashankg7/Seq2Seq/tree/master/seq2seq
代码段:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import codecs
import pdb
import numpy as np
#from utils import preprocess_text, text2seq_generator
def preprocess_text(file_path_src, file_path_tar, max_feats, encoding="utf-8"):
    """Read the source and target text files into lists of stripped lines.

    Both files are decoded with an explicit ``encoding`` instead of the
    platform default, so non-ASCII text is read the same way on every
    operating system, and both file handles are closed once read.

    Args:
        file_path_src: Path to the source text file; each line becomes one
            entry of ``sents_src``.
        file_path_tar: Path to the target text file; each line becomes one
            entry of ``sents_tar``.
        max_feats: Not used by the code in this block.
        encoding: Text encoding used to decode both files. Defaults to
            UTF-8 (assumed to match the corpus files -- TODO confirm).

    Returns:
        None as written. NOTE(review): the caller unpacks four values from
        this function, so the vocabulary construction and the ``return``
        statement appear to be missing from this excerpt.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If a file is not valid text in ``encoding``.
    """
    # Imported locally so this block does not rely on a module-level import.
    from collections import defaultdict

    # Counters created here but not filled in by the code in this block.
    vocab = defaultdict(int)
    freq_src = defaultdict(int)
    freq_tar = defaultdict(int)

    # Trailing whitespace (including the newline) is stripped from each line.
    with open(file_path_src, encoding=encoding) as f_src:
        sents_src = [line.rstrip() for line in f_src]
    with open(file_path_tar, encoding=encoding) as f_tar:
        sents_tar = [line.rstrip() for line in f_tar]
def preprocess(self):
    """Run ``preprocess_text`` and store its four results on the instance.

    Reads ``self.path_src``, ``self.path_tar`` and ``self.max_feat``, and
    writes ``self.vocab_src``, ``self.vocab_tar``, ``self.sents_src`` and
    ``self.sents_tar``.
    """
    # Preprocessing source and target text sequence files
    # The targets are parenthesized so the assignment can span several
    # lines; a bare line break after "=" is a SyntaxError.
    (
        self.vocab_src,
        self.vocab_tar,
        self.sents_src,
        self.sents_tar,
    ) = preprocess_text(self.path_src, self.path_tar, self.max_feat)
if __name__ == "__main__":
    # Corpus file paths, passed to the preprocessor in this order.
    first_corpus_path = 'C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.hi'
    second_corpus_path = 'C:\\Users\\anagha\\Desktop\\Language-Translation\\Seq2Seq-master\\Seq2Seq-master\\seq2seq\\training.hi-en.en'

    pre = preprocess(first_corpus_path, second_corpus_path, 5500, 15)
    pre.preprocess()

    # One epoch: print the X half of every batch the generator yields.
    for epoch in range(1):
        print("epoch no %d" % epoch)
        for X, Y in pre.gen_batch():
            print(X)
错误:
Using TensorFlow backend.
Traceback (most recent call last):
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2898, in run_code
self.showtraceback()
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1807, in showtraceback
self.showsyntaxerror(filename)
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 1864, in showsyntaxerror
stb = self.SyntaxTB.structured_traceback(etype, value, [])
File "C:\Users\anagha\Anaconda3\lib\site-packages\IPython\core\ultratb.py", line 1441, in structured_traceback
newtext = ulinecache.getline(value.filename, value.lineno)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 16, in getline
lines = getlines(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 47, in getlines
return updatecache(filename, module_globals)
File "C:\Users\anagha\Anaconda3\lib\linecache.py", line 137, in updatecache
lines = fp.readlines()
File "C:\Users\anagha\Anaconda3\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 7588: invalid start byte