这是我的代码:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import time
import os
print u'read data ...'
train_data = pd.read_csv('Train.csv', index_col='SentenceId', delimiter='\t', encoding='utf-8')
test_data = pd.read_csv('Test.csv', index_col='SentenceId', delimiter='\t', encoding='utf-8')
train_label = pd.read_csv('Label.csv', index_col='SentenceId', delimiter='\t', encoding='utf-8')
addition_data = pd.read_csv('addition_data.csv', header=None, encoding='utf-8')[0]
train_data.dropna(inplace=True) # drop some empty sentences
...
def findall(sub_string, string):
start = 0
idxs = []
while True:
idx = string[start:].find(sub_string)
if idx == -1:
return idxs
else:
idxs.append(start + idx)
start += idx + len(sub_string)
tags = {'pos':1, 'neu':2, 'neg':3}
def label2tag(i):
s = train_data.loc[i]['Content']
r = np.array([0]*len(s))
try:
l = train_label.loc[[i]].as_matrix()
except:
return r
for i in l:
for j in findall(i[0], s):
r[j:j+len(i[0])] = tags[i[1]]
return r
print u'translating target into tags ...'
train_data['label'] = map(label2tag, tqdm(iter(train_data.index)))
这是我得到的错误的追溯:
Traceback (most recent call last):
File "shibie.py", line 88, in <module>
train_data['label'] = map(label2tag, tqdm(iter(train_data.index)))
File "shibie.py", line 83, in label2tag
for j in findall(i[0], s):
File "shibie.py", line 66, in findall
idx = string[start:].find(sub_string)
TypeError: coercing to Unicode: need string or buffer, float found
上面的代码在我自己的PC上运行,但是在我学校的Ubuntu中,它会产生很多错误。我不知道这是不是因为我文件中的空格,但是我看到我的文件里面没有空格。