我想读取“Mathscore”列中的数据,并把其中的值转换为数值,结果仍保存在“Mathscore”列中。
对应关系:Strong = 1,Weak = 0
我尝试使用For循环通过以下函数执行此操作,但我无法运行代码。我试图分配数据的方式不正确吗?
谢谢!
import pandas as pd
# Sample input: ten students, each labelled 'Strong' or 'Weak' in math.
data = {
    'Id_Student': list(range(1, 11)),
    'Mathscore': [
        'Strong', 'Weak', 'Weak', 'Strong', 'Strong',
        'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
    ],
}
df = pd.DataFrame(data)
# Bare expression: displays the frame when run in a notebook/REPL cell.
df
# # Strong = 1 and Weak =0
##def tran_mathscore(x): if x == 'Strong': return 1 if x == 'Weak': return 0
##
##df['Trans_MathScore'] = df['Mathscore'].apply(tran_mathscore)
##df
##df.Mathscore[0]=["Weak"]
##print(df.columns)
##
##
##print(df.Mathscore)
def tran_mathscore(frame=None):
    """Recode the ``Mathscore`` column in place: 'Strong' -> 1, 'Weak' -> 0.

    The previous loop iterated over the column's *values* and then used each
    value ('Strong'/'Weak') as a row label in a chained assignment, so no
    existing row was ever updated. Recoding the whole column in one
    assignment avoids both the wrong lookup and chained indexing.

    Args:
        frame: DataFrame containing a ``Mathscore`` column. When omitted,
            the module-level ``df`` is used, so ``tran_mathscore()`` still
            works as before.

    Returns:
        None. The frame is modified in place.
    """
    target = df if frame is None else frame
    codes = {'Strong': 1, 'Weak': 0}
    # Values other than 'Strong'/'Weak' are passed through unchanged, which
    # mirrors the original if/elif that simply skipped them.
    target['Mathscore'] = target['Mathscore'].map(
        lambda label: codes.get(label, label)
    )
tran_mathscore()
答案 0 :(得分:3)
您可以将数据转换为分类(category)类型:
current_key current_value
或映射它:
#!/usr/bin/env python3
import sys
import heapq
import os
import tempfile
class Partitions(list):
    """List of partition file names built by draining ``stream``.

    Lines from ``stream`` are counted in an in-memory dict. Whenever a new
    distinct line arrives while the dict already holds
    ``max_lines_per_partition`` distinct lines, the dict is flushed to a new
    file in ``tmpdir`` (one ``"<line> <count>"`` record per row, sorted by
    line) and the file name is appended to this list.
    """

    def __init__(self, stream, max_lines_per_partition, tmpdir):
        """Consume ``stream`` and write the partition files.

        Args:
            stream: Iterable of text lines; a trailing newline is stripped.
            max_lines_per_partition: Number of distinct lines held in memory
                before a partition is flushed.
            tmpdir: Directory in which the partition files are created.
        """
        super().__init__()
        self.tmpdir = tmpdir
        m = {}
        for line in stream:
            line = line.rstrip('\n')
            if line not in m:
                # Flush only a non-empty dict: an empty partition file would
                # have no first record for the merge step to read.
                if m and len(m) == max_lines_per_partition:
                    self.save(m)
                    m.clear()
                m[line] = 0
            m[line] += 1
        if m:
            self.save(m)

    def save(self, m):
        """Write the counts in ``m`` to a new partition file, sorted by key.

        Args:
            m: Mapping of line text to occurrence count.
        """
        new_partition_fname = os.path.join(
            self.tmpdir, 'part{}'.format(len(self))
        )
        # The context manager closes the file even if a write fails.
        with open(new_partition_fname, 'w') as f:
            f.writelines(
                '{} {}\n'.format(key, m[key]) for key in sorted(m)
            )
        self.append(new_partition_fname)
class PartitionEntryIterator:
    """Cursor over one partition file of ``"<key> <count>"`` records.

    After construction, and after each successful ``next()``, ``key`` and
    ``count`` hold the current record. Instances order by ``key`` so they
    can be stored in a ``heapq`` heap.
    """

    def __init__(self, fname):
        """Open ``fname`` and load its first record, if any."""
        self.fname = fname
        self.f = open(fname, 'r')
        self.next()

    def next(self):
        """Advance to the next record.

        Returns:
            True if a record was loaded into ``key``/``count``; False at end
            of file (the file is then closed and ``key``/``count`` keep
            their previous values).
        """
        if self.f.closed:
            return False
        line = self.f.readline()
        if not line:
            # Release the handle as soon as the partition is exhausted.
            self.f.close()
            return False
        # Split on the LAST single space only: the key itself may be empty
        # or contain/end with whitespace, which a whitespace-agnostic
        # rsplit() would drop or mangle.
        key, count = line.rstrip('\n').rsplit(' ', 1)
        self.key = key
        self.count = int(count)
        return True

    def __del__(self):
        # ``f`` is missing if open() raised inside __init__.
        f = getattr(self, 'f', None)
        if f is not None:
            f.close()

    def __lt__(self, other):
        return self.key < other.key
def count_distinct_lines(lines, max_lines_per_partition):
    """Yield ``(line, count)`` for each distinct line, in sorted key order.

    The input is first split into sorted partition files inside a temporary
    directory, then the partitions are merged with a heap so that equal keys
    from different partitions arrive consecutively and their counts can be
    summed.

    Args:
        lines: Iterable of text lines.
        max_lines_per_partition: Distinct lines kept in memory per partition.

    Yields:
        ``(key, count)`` tuples; nothing at all for empty input.
    """
    # Everything stays inside the ``with``: the partition files live in the
    # temporary directory and must exist until the merge is finished.
    with tempfile.TemporaryDirectory() as tmpdir:
        h = []
        for fname in Partitions(lines, max_lines_per_partition, tmpdir):
            heapq.heappush(h, PartitionEntryIterator(fname))
        if not h:
            # Empty input produces no partitions; previously h[0] raised
            # IndexError here.
            return
        key = h[0].key
        count = 0
        while h:
            x = heapq.heappop(h)
            if key == x.key:
                count += x.count
            else:
                yield (key, count)
                key, count = x.key, x.count
            # Re-queue the cursor only while its partition has records left.
            if x.next():
                heapq.heappush(h, x)
        yield (key, count)
if __name__ == '__main__':
    # Reads lines from stdin and prints "<line>: <count>" per distinct line.
    if len(sys.argv) != 2:
        print(
            'Usage:\n\t' + sys.argv[0] + ' <max-lines-per-partition>',
            file=sys.stderr,
        )
        # sys.exit is always available; the bare ``exit`` helper is only
        # injected by the ``site`` module.
        sys.exit(1)
    for key, count in count_distinct_lines(sys.stdin, int(sys.argv[1])):
        print(key, count, sep=': ')
PS:我更喜欢第一个选项,原因见下面的示例:
In [23]: df['Mathscore'] = df.Mathscore.astype('category').cat.rename_categories(['1','0'])
In [24]: df
Out[24]:
Id_Student Mathscore
0 1 1
1 2 0
2 3 0
3 4 1
4 5 1
5 6 0
6 7 1
7 8 1
8 9 0
9 10 1
In [25]: df.dtypes
Out[25]:
Id_Student int64
Mathscore category
dtype: object
category dtype 使用更少的内存
答案 1 :(得分:1)
您可以使用:
# Swap the text labels for the string digits '1' and '0'; the column still
# holds strings afterwards, not integers.
df['Mathscore'] = df['Mathscore'].str.replace('Strong','1')
df['Mathscore'] = df['Mathscore'].str.replace('Weak','0')
返回:
In [1]: df
Out[1]:
Id_Student Mathscore
0 1 1
1 2 0
2 3 0
3 4 1
4 5 1
5 6 0
6 7 1
7 8 1
8 9 0
9 10 1