使用For循环更改DataFrame列(Pandas)中的数据

时间:2016-09-07 20:32:22

标签: python pandas dataframe

我正在尝试从“Mathscore”获取数据并将值转换为数值,所有这些都在“Mathscore”下。

strong = 1 弱= 0

我尝试使用For循环通过以下函数执行此操作,但我无法运行代码。我试图分配数据的方式不正确吗?

谢谢!

import pandas as pd

data = {'Id_Student' : [1,2,3,4,5,6,7,8,9,10],'Mathscore' :['Strong','Weak','Weak','Strong','Strong','Weak','Strong','Strong','Weak','Strong']}

df = pd.DataFrame(data)
df

# # Strong = 1 and Weak =0

##def tran_mathscore(x): if x == 'Strong': return 1 if x == 'Weak': return 0
##
##df['Trans_MathScore'] = df['Mathscore'].apply(tran_mathscore)
##df


##df.Mathscore[0]=["Weak"]

##print(df.columns)
##
##
##print(df.Mathscore)

def tran_mathscore():
    for i in df.Mathscore:
        if i == "Strong":
        df.Mathscore[i]= ['1']

    elif i == "Weak":
        df.Mathscore[i]= ['0']


tran_mathscore()

2 个答案:

答案 0 :(得分:3)

您可以categorize您的数据:

current_key current_value

或映射它:

#!/usr/bin/env python3

import sys
import heapq
import os
import tempfile

class Partitions(list):
    def __init__(self, stream, max_lines_per_partition, tmpdir):
        self.tmpdir=tmpdir
        m = {}
        for line in stream:
            line = line.rstrip('\n')
            if not line in m:
                if len(m) == max_lines_per_partition:
                    self.save(m)
                    m.clear()
                m[line] = 0
            m[line] += 1

        if len(m) > 0:
            self.save(m)
        del m

    def save(self, m):
        i = len(self)
        new_partition_fname = '{}/part{}'.format(self.tmpdir, i)
        self.append(new_partition_fname)
        f = open(new_partition_fname, 'w')
        for key in sorted(m.keys()):
            f.write('{} {}\n'.format(key, m[key]))
        f.close()


class PartitionEntryIterator:
    def __init__(self, fname):
        self.fname = fname
        self.f = open(fname, 'r')
        self.next()

    def next(self):
        line = self.f.readline()
        if len(line) != 0:
            self.key, self.count = line.rsplit(maxsplit=1)
            self.count = int(self.count.rstrip('\n'))
            return True
        else:
            return False

    def __del__(self):
        self.f.close()

    def __lt__(self, other):
        return self.key < other.key

def count_distinct_lines(lines, max_lines_per_partition):
    with tempfile.TemporaryDirectory() as tmpdir:
        h = []
        for fname in Partitions(lines, max_lines_per_partition, tmpdir):
            x = PartitionEntryIterator(fname)
            heapq.heappush(h, x)

        key = h[0].key
        count = 0
        while not len(h) == 0:
            x = heapq.heappop(h)
            if key == x.key:
                count += x.count
            else:
                yield (key, count)
                key, count = x.key, x.count
            if x.next():
                heapq.heappush(h, x)

        yield (key, count)

if __name__ == '__main__':

    if len(sys.argv) != 2:
        print('Usage:\n\t' + sys.argv[0] + ' <max-lines-per-partition>')
        exit(1)

    for key, count in count_distinct_lines(sys.stdin, int(sys.argv[1])):
        print(key, count, sep=': ')

PS我更喜欢第一个选项,因为In [23]: df['Mathscore'] = df.Mathscore.astype('category').cat.rename_categories(['1','0']) In [24]: df Out[24]: Id_Student Mathscore 0 1 1 1 2 0 2 3 0 3 4 1 4 5 1 5 6 0 6 7 1 7 8 1 8 9 0 9 10 1 In [25]: df.dtypes Out[25]: Id_Student int64 Mathscore category dtype: object dtype使用更少的内存

答案 1 :(得分:1)

您可以使用:

df['Mathscore'] = df['Mathscore'].str.replace('Strong','1')
df['Mathscore'] = df['Mathscore'].str.replace('Weak','0')

返回:

In [1]: df

Out[1]:

   Id_Student Mathscore
0           1         1
1           2         0
2           3         0
3           4         1
4           5         1
5           6         0
6           7         1
7           8         1
8           9         0
9          10         1