我试图根据某些条件隐藏数据

时间:2019-11-20 12:31:47

标签: python pandas

我想对单个列的数据进行隐藏(掩码)处理:如果某个值出现在 exceptionList 中,就应当跳过它并继续处理下一个值;但不知为何,我无法完成隐藏,代码还抛出了下面的错误

   if(x in exceptionList):
 ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

这是我的代码

# Sample records: one [name, phone number, address] list per entry.
data = [
    ['NISAMANEE ROWELL', '9198762345', '98 Oxford Ave.Elk Grove Village, IL 60007'],
    ['ALICE BAISDEN', '8756342865', '94 Valley Rd.Miami Gardens, FL 33056'],
    ['MARC COGNETTI', '9198762345', '221 Summer CircleGreer, SC 29650'],
    ['JOHNS HOPKINS HEALTHCARE', '9654987642', '8522 Pendergast AvenueVilla Park, IL 60181'],
]
df = pd.DataFrame(data, columns=['Name', 'Number', 'Address'])
df


def title_format(inp):
    """Return ``inp`` with every value title-cased via the pandas ``.str`` accessor."""
    titled = inp.str.title()
    return titled
def new(x):
    """Mask values of ``x`` that are not in the exception list (as posted; raises).

    ``x`` is title-cased through ``title_format`` and printed.  The call at
    the end of this snippet passes a whole column
    (``df['Name'].astype(str)``); with such an argument the ``if`` test below
    raises the ``ValueError`` quoted at the top of the question.
    """
    #x = input('Enter your column name')
    #x = x.title()
    x = title_format(x)
    print(x)
    exc_list=['Mackesson Inc','Care','Healthcare','Henery Schien','Besse','LLC','CandP','INC','LTD','PHARMACY','PHARMACEUTICAL','HOSPITAL','COMPANY','ELECTRONICS','APP','VOLUNTEERS','SPECIALITIES','APPLIANCE','EXPRESS','MAGAZINE','SUPPLY','ENDOSCOPY','NETWandK','SCHOOL','AT&T','SOLUTIONS','SANITATION','SYSTEMS','COMPOUNDING','CLINIC','UTILITIES','DEPARTMENT','CREATIVE','PIN','employment','consultant','units','label','machine','anesthesia','services','medical','community','plaza','tech','bipolar','brand','commerce','testing','inspection','killer','plus','electric','division','diagnostic','materials','imaging','international','district','chamber','city','products','essentials','life','scissand','leasing','units','health','healthcare','surgical','enterprises','print','radiology','water','screens','telecom']
    # Title-case the exception entries the same way the input was title-cased.
    exceptionList = [z.title() for z in exc_list]
    # NOTE: ``x`` is a whole Series here, so this membership test ends up asking
    # for the truth value of a Series, which is what raises
    # "The truth value of a Series is ambiguous".
    if(x in exceptionList):
        return x
    else:
        # Intended to swap the value for a run of 3 to 7 'X' characters.
        return x.str.replace(x, 'X' * random.randrange(3, 8))

#new(df.Name.astype(str))
# Pass the whole 'Name' column, cast to str, to new() in a single call.
new(df['Name'].astype(str))

2 个答案:

答案 0 :(得分:1)

根据我的理解,我修改了你代码中的几行:

import pandas as pd
import random

# Sample records: one [name, phone number, address] list per entry.
# The last name is exactly 'Healthcare', which is one of the exception entries.
data = [
    ['NISAMANEE ROWELL', '9198762345', '98 Oxford Ave.Elk Grove Village, IL 60007'],
    ['ALICE BAISDEN', '8756342865', '94 Valley Rd.Miami Gardens, FL 33056'],
    ['MARC COGNETTI', '9198762345', '221 Summer CircleGreer, SC 29650'],
    ['Healthcare', '9654987642', '8522 Pendergast AvenueVilla Park, IL 60181'],
]
df = pd.DataFrame(data, columns=['Name', 'Number', 'Address'])


def title_format(inp):
    """Return ``inp`` with every value title-cased via the pandas ``.str`` accessor."""
    titled = inp.str.title()
    return titled
def new(x):
    """Overwrite exception-list names in the module-level ``df`` with X's.

    ``x`` is title-cased through ``title_format`` and printed.  For every
    position whose title-cased value equals a title-cased exception entry,
    the 'Name' cell of the global ``df`` is replaced by a run of 3 to 7 'X'
    characters.  Nothing is returned.
    """
    titled = title_format(x)
    print(titled)
    exc_list = [
        'Mackesson Inc', 'Care', 'Healthcare', 'Henery Schien', 'Besse', 'LLC',
        'CandP', 'INC', 'LTD', 'PHARMACY', 'PHARMACEUTICAL', 'HOSPITAL',
        'COMPANY', 'ELECTRONICS', 'APP', 'VOLUNTEERS', 'SPECIALITIES',
        'APPLIANCE', 'EXPRESS', 'MAGAZINE', 'SUPPLY', 'ENDOSCOPY', 'NETWandK',
        'SCHOOL', 'AT&T', 'SOLUTIONS', 'SANITATION', 'SYSTEMS', 'COMPOUNDING',
        'CLINIC', 'UTILITIES', 'DEPARTMENT', 'CREATIVE', 'PIN', 'employment',
        'consultant', 'units', 'label', 'machine', 'anesthesia', 'services',
        'medical', 'community', 'plaza', 'tech', 'bipolar', 'brand',
        'commerce', 'testing', 'inspection', 'killer', 'plus', 'electric',
        'division', 'diagnostic', 'materials', 'imaging', 'international',
        'district', 'chamber', 'city', 'products', 'essentials', 'life',
        'scissand', 'leasing', 'units', 'health', 'healthcare', 'surgical',
        'enterprises', 'print', 'radiology', 'water', 'screens', 'telecom',
    ]
    exceptionList = [entry.title() for entry in exc_list]
    # Positional list of booleans: True where the title-cased name is an exception entry.
    match = titled.isin(exceptionList).tolist()
    # One independently sized mask per matching row.
    masks = ['X' * random.randrange(3, 8) for _ in range(match.count(True))]
    df.loc[match, 'Name'] = masks

#new(df.Name.astype(str))
# new() writes the masked names into df itself; its return value is not used.
new(df['Name'].astype(str))

df
Out[1]:

        Name            Number      Address
0   NISAMANEE ROWELL    9198762345  98 Oxford Ave.Elk Grove Village, IL 60007
1   ALICE BAISDEN       8756342865  94 Valley Rd.Miami Gardens, FL 33056
2   MARC COGNETTI       9198762345  221 Summer CircleGreer, SC 29650
3   XXXXXXX             9654987642  8522 Pendergast AvenueVilla Park, IL 60181

执行此操作的最佳方法

# Title-case both the exception entries and the names before comparing them.
exc_list = [entry.title() for entry in exc_list]
df['Name'] = df['Name'].map(str.title)
# Helper column: True where the whole name equals an exception entry.
df['match'] = df['Name'].isin(exc_list)
# Give every matching row its own run of 3 to 7 'X' characters.
df.loc[df['match'], 'Name'] = ['X' * random.randrange(3, 8) for _ in range(int(df['match'].sum()))]

隐藏前 3 个字符

# Title-case both the exception entries and the names before comparing them.
exc_list = [entry.title() for entry in exc_list]
df['Name'] = df['Name'].map(str.title)
# Helper column: True where the whole name equals an exception entry.
df['match'] = df['Name'].isin(exc_list)

# Replace the first three characters of each matching name with 'XXX'.
# For names of three characters or fewer, s[3:] is empty, so the result is plain 'XXX'.
df.loc[df['match'], 'Name'] = 'XXX' + df.loc[df['match'], 'Name'].str[3:]
# The helper column is no longer needed once the names are masked.
df = df.drop(['match'], axis = 1)
df

隐藏整行

# Title-case both the exception entries and the names before comparing them.
exc_list = [x.title() for x in exc_list]
df['Name'] = df['Name'].map(str.title)
# Helper column: True where the whole name equals an exception entry.
df['match'] = [nn in exc_list for nn in df['Name']]

# One 'XXX' placeholder per column of df (the helper 'match' column included).
hide_row = {c:'XXX' for c in df.columns}
# Keep the non-matching rows and outer-merge them with an all-'XXX' frame that
# has one row per matching row.
# NOTE(review): no `on=` or index options are passed, so merge should join on
# the shared column names and return a fresh index; confirm against the pandas docs.
df[df['match'] != True].merge(pd.DataFrame(hide_row, index = df[df['match'] == True].index), how = 'outer')
简短说明
# Step 1. Select the rows that did NOT match the exception list.

df[df['match'] != True]

Out[3]:
    Name                Number      Address                                    match
0   Nisamanee Rowell    9198762345  98 Oxford Ave.Elk Grove Village, IL 60007   False
1   Alice Baisden       8756342865  94 Valley Rd.Miami Gardens, FL 33056        False
2   Marc Cognetti       9198762345  221 Summer CircleGreer, SC 29650            False


# Step 2. The opposite filter selects the rows that DID match.

df[df['match'] == True]

Out[4]:
    Name        Number      Address                                     match
3   Healthcare  9654987642  8522 Pendergast AvenueVilla Park, IL 60181  True

# Step 3. Only the index of the Step 2 rows is used: build a new DataFrame with
# that index in which every column holds 'XXX'.

hide_row = {c:'XXX' for c in df.columns}
pd.DataFrame(hide_row, index = df[df['match'] == True].index)

Out[5]:
    Name    Number  Address match
3   XXX     XXX     XXX     XXX

# Step 4. Finally, outer-merge the DataFrames from Step 1 and Step 3.
# NOTE(review): no `on=` or index options are passed, so this merge should join on
# the shared column names rather than on the index; confirm against the pandas docs.

df[df['match'] != True].merge(pd.DataFrame(hide_row, index = df[df['match'] == True].index), how = 'outer')

答案 1 :(得分:0)

只需对您的代码稍作改动即可;请注意,这并不是最优的写法,但可以正常工作。

# Sample records: one [name, phone number, address] list per entry.
# The last name is exactly 'Healthcare', which is one of the exception entries.
data = [
    ['NISAMANEE ROWELL', '9198762345', '98 Oxford Ave.Elk Grove Village, IL 60007'],
    ['ALICE BAISDEN', '8756342865', '94 Valley Rd.Miami Gardens, FL 33056'],
    ['MARC COGNETTI', '9198762345', '221 Summer CircleGreer, SC 29650'],
    ['Healthcare', '9654987642', '8522 Pendergast AvenueVilla Park, IL 60181'],
]
df = pd.DataFrame(data, columns=['Name', 'Number', 'Address'])
df


def title_format(inp):
    """Return the string ``inp`` converted to title case."""
    titled = inp.title()
    return titled
# Title-cased exception entries, built once at import time rather than on every
# call (new() runs once per row under Series.apply).  A frozenset also removes
# the duplicate entries and makes each membership test a hash lookup.
_EXCEPTION_TITLES = frozenset(
    entry.title()
    for entry in [
        'Mackesson Inc', 'Care', 'Healthcare', 'Henery Schien', 'Besse', 'LLC',
        'CandP', 'INC', 'LTD', 'PHARMACY', 'PHARMACEUTICAL', 'HOSPITAL',
        'COMPANY', 'ELECTRONICS', 'APP', 'VOLUNTEERS', 'SPECIALITIES',
        'APPLIANCE', 'EXPRESS', 'MAGAZINE', 'SUPPLY', 'ENDOSCOPY', 'NETWandK',
        'SCHOOL', 'AT&T', 'SOLUTIONS', 'SANITATION', 'SYSTEMS', 'COMPOUNDING',
        'CLINIC', 'UTILITIES', 'DEPARTMENT', 'CREATIVE', 'PIN', 'employment',
        'consultant', 'units', 'label', 'machine', 'anesthesia', 'services',
        'medical', 'community', 'plaza', 'tech', 'bipolar', 'brand',
        'commerce', 'testing', 'inspection', 'killer', 'plus', 'electric',
        'division', 'diagnostic', 'materials', 'imaging', 'international',
        'district', 'chamber', 'city', 'products', 'essentials', 'life',
        'scissand', 'leasing', 'units', 'health', 'healthcare', 'surgical',
        'enterprises', 'print', 'radiology', 'water', 'screens', 'telecom',
    ]
)


def new(x):
    """Return the title-cased name if it is an exception entry, else a mask of X's.

    Args:
        x: A single name string; the snippet applies this function to each
            value of the 'Name' column.

    Returns:
        The title-cased ``x`` when it equals a title-cased exception entry,
        otherwise a string of 3 to 7 'X' characters.
    """
    x = title_format(x)
    print(x)
    if x in _EXCEPTION_TITLES:
        return x
    # The whole value is hidden, so the mask itself is the result.
    return 'X' * random.randrange(3, 8)

#new(df.Name.astype(str))
# Call new() on each 'Name' value and store the results back in the column.
df['Name'] = df['Name'].apply(new)