我有一个这样的txt文件:
`Empty DataFrame
Columns: [0, 1, 2, 3, 4]
Index: []
Empty DataFrame
Columns: [0, 1, 2, 3, 4]
Index: []
0 1 2 \
46 RNA/4v6p.csv,46AA/U/551 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
47 RNA/4v6p.csv,46AA/G/550 RNA/4v6p.csv,46AA/C/34 RNA/4v6p.csv,46WW_cis
48 RNA/4v6p.csv,46AA/A/553 RNA/4v6p.csv,46AA/U/30 RNA/4v6p.csv,46WW_cis
49 RNA/4v6p.csv,46AA/U/552 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
50 RNA/4v6p.csv,46AA/U/1199 RNA/4v6p.csv,46AA/G/1058 RNA/4v6p.csv,46WW_cis
3 4
46 NaN NaN
47 NaN NaN
48 NaN NaN
49 NaN NaN
50 NaN NaN`
我想把它读成一个有 3 列的数组。目前我尝试使用 pd.read_csv(self.filename,delim_whitespace=True),但在读取 Empty DataFrame 部分时会出现很多错误。如何让程序忽略这部分?
编辑:最理想的解决方案是让我的文件中根本不出现空的 DataFrame。该文件是在许多文件中搜索后得到的结果,其中一些文件是空的。我以为自己已经通过捕获异常过滤掉了空文件,这样对空文件的搜索结果就不会被存入结果中。看来我的做法有问题。有人可以帮我纠正吗?
from numpy import numpy.mean as nm
def find_same_direction_chain(self, results):
separation= lambda x: pd.Series([i for i in x.split('/')])
left_chain=self.data[0].apply(separation)
right_chain=self.data[1].apply(separation)
i=1
try:
while i<len(self.data[:])-5:
if nm(left_chain[2][i:i+3])>=nm(left_chain[2][i+2:i+5]) and nm(right_chain[2][i:i+3])>=nm(right_chain[2][i+2:i+5]) and len(self.data[:])>0:
if nm(left_chain[2][i+2:i+5])>=nm(left_chain[2][i+4:i+7]) and nm(right_chain[2][i+2:i+5])>=nm(right_chain[2][i+4:i+7]):
results.chains.append(str(self.filename+", "+str(i)+self.data[0:3][i:i+5]))
else: pass
i+=1
except ValueError:
results.bin.append(self.filename)
except TypeError:
results.data_structure_error.append(self.filename)
答案 0 :(得分:1)
您可以使用:
import io

import pandas as pd

# Sample input used for testing; it mirrors the text file from the question.
# NOTE: this is a non-raw literal, so the backslash that ends the "0 1 2 \"
# line escapes the newline: that header fragment and the following "46 ..."
# row are parsed as ONE seven-field line.  The "shift first row" step below
# relies on this; in a real file the two lines stay separate.
temp=u"""Empty DataFrame
Columns: [0, 1, 2, 3, 4]
Index: []
Empty DataFrame
Columns: [0, 1, 2, 3, 4]
Index: []
0 1 2 \
46 RNA/4v6p.csv,46AA/U/551 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
47 RNA/4v6p.csv,46AA/G/550 RNA/4v6p.csv,46AA/C/34 RNA/4v6p.csv,46WW_cis
48 RNA/4v6p.csv,46AA/A/553 RNA/4v6p.csv,46AA/U/30 RNA/4v6p.csv,46WW_cis
49 RNA/4v6p.csv,46AA/U/552 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
50 RNA/4v6p.csv,46AA/U/1199 RNA/4v6p.csv,46AA/G/1058 RNA/4v6p.csv,46WW_cis
3 4
46 NaN NaN
47 NaN NaN
48 NaN NaN
49 NaN NaN
50 NaN NaN"""

# after testing replace io.StringIO(temp) with the filename
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True;
# names=range(7) reserves seven columns for the longest (merged) line.
df = pd.read_csv(io.StringIO(temp), sep=r"\s+", names=range(7))
# remove rows with NaN in columns 0 - 3
df = df.dropna(subset=[0, 1, 2, 3])
# remove rows where first column contains text 'Columns'
# (.copy() so the row assignment below writes to an independent frame)
df = df[~df.iloc[:, 0].str.contains('Columns')].copy()
# shift first row: drop the leading "0 1 2" header fragment of the merged line
df.iloc[0, :] = df.iloc[0, :].shift(-3)
# set first column to index
df = df.set_index(df.iloc[:, 0])
# remove unnecessary columns
df = df.drop([0, 4, 5, 6], axis=1)
print(df)
1 2 3
0
46 RNA/4v6p.csv,46AA/U/551 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
47 RNA/4v6p.csv,46AA/G/550 RNA/4v6p.csv,46AA/C/34 RNA/4v6p.csv,46WW_cis
48 RNA/4v6p.csv,46AA/A/553 RNA/4v6p.csv,46AA/U/30 RNA/4v6p.csv,46WW_cis
49 RNA/4v6p.csv,46AA/U/552 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
50 RNA/4v6p.csv,46AA/U/1199 RNA/4v6p.csv,46AA/G/1058 RNA/4v6p.csv,46WW_cis
或者使用 read_csv 的 skiprows 参数的解决方案:
# after testing replace io.StringIO(temp) with the filename
# skiprows=6 skips the six leading "Empty DataFrame / Columns / Index" lines.
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
# dtype=str keeps column 0 textual: once the text lines are skipped it would
# otherwise be inferred as integers, and writing the shifted string value
# back into it below would be an incompatible-dtype assignment.
df = pd.read_csv(io.StringIO(temp), sep=r"\s+", names=range(7), skiprows=6, dtype=str)
# remove rows with NaN in columns 0 - 3
df = df.dropna(subset=[0, 1, 2, 3])
# shift first row: drop the leading "0 1 2" header fragment of the merged line
df.iloc[0, :] = df.iloc[0, :].shift(-3)
# set first column to index
df = df.set_index(df.iloc[:, 0])
# remove unnecessary columns
df = df.drop([0, 4, 5, 6], axis=1)
print(df)
1 2 3
0
46 RNA/4v6p.csv,46AA/U/551 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
47 RNA/4v6p.csv,46AA/G/550 RNA/4v6p.csv,46AA/C/34 RNA/4v6p.csv,46WW_cis
48 RNA/4v6p.csv,46AA/A/553 RNA/4v6p.csv,46AA/U/30 RNA/4v6p.csv,46WW_cis
49 RNA/4v6p.csv,46AA/U/552 RNA/4v6p.csv,46AA/A/33 RNA/4v6p.csv,46WW_cis
50 RNA/4v6p.csv,46AA/U/1199 RNA/4v6p.csv,46AA/G/1058 RNA/4v6p.csv,46WW_cis
编辑:
您可以尝试更改(我没有样本数据,因此未经测试):
results.chains.append(str(self.filename+", "+str(i)+self.data[0:3][i:i+5]))
为:
# Evaluate the slice once and only record it when it actually contains rows,
# so that empty frames are never appended to results.chains.
window = self.data[0:3][i:i+5]
if len(window) > 0:
    results.chains.append(str(self.filename+", "+str(i)+window))