我有一个大型 DataFrame(使用 Python pandas 的 read_csv 读取 CSV 文件得到)。我想检查某一列的值中是否包含数字,如果包含,就删除整行。我的做法如下:
# Registry of "remove this row" predicates, shaped as
#     { file name : { column name : [ ( removeFunc, kwargs ), ... ] } }
# rowsToRemoveIfFuncIsTrue treats the File_Name_All entry as applying to every
# fileType, and calls each predicate as removeFunc( val, kwargs ).
# NOTE(review): presumably this is the config returned by
# getCfgDictFromMeta( 'rowsToRemoveIfFuncIsTrue' ) -- confirm.
#
# Built from a list of pairs rather than a dict literal: a dict literal only
# keeps its written order on Python 3.7+, so OrderedDict( { ... } ) does not
# guarantee the order shown here on older interpreters.
_rowsToRemoveIfFuncIsTrue = OrderedDict( [
    ( File_Name_All, {
        'SRC_DEAL_REF_STR' : [ ( removeFunc_noDigit, {} ) ],
    } ),
    ( File_Name.TCP_PRELIM_ADJUSTMENTS, {
        'FACILITY_ID' : [ ( removeFunc_hasDuplicatesInColumn, {} ) ],
    } ),
] )
@Validator
def rowsToRemoveIfFuncIsTrue( colName, val, fileType, rowsToRemoveIfFuncIsTrueCfg = None ):
    ''' Run the remove-predicates configured for colName against val.

    colName  : name of the column val belongs to; must not be blank.
    val      : value handed to every matching predicate.
    fileType : file type val comes from; must not be blank.
    rowsToRemoveIfFuncIsTrueCfg : optional OrderedDict shaped as
        { file name : { column name : [ ( removeFunc, kwargs ), ... ] } }.
        When falsy, item [ 1 ] of
        getCfgDictFromMeta( 'rowsToRemoveIfFuncIsTrue' ) is used instead.

    Returns a ValidatorResponse:
      * INVALID_INPUT when colName or fileType is blank, or when the config
        is empty or not an OrderedDict;
      * FAILED, with rule_name set to the predicate's name, as soon as one
        predicate returns a truthy value (i.e. the row should be removed);
      * SUCCESS when no matching predicate fires.
    '''
    if _isBlank( colName ):
        return ValidatorResponse( rule_decision = Rule_Decision.INVALID_INPUT, rule_return_message = 'colName cannot be Empty' )
    if _isBlank( fileType ):
        return ValidatorResponse( rule_decision = Rule_Decision.INVALID_INPUT, rule_return_message = 'fileType %s is not a valid file for rule : rowsToRemoveIfFuncIsTrue' % fileType )

    cfg = rowsToRemoveIfFuncIsTrueCfg or getCfgDictFromMeta( 'rowsToRemoveIfFuncIsTrue' )[ 1 ]
    if not cfg or not isinstance( cfg, OrderedDict ):
        return ValidatorResponse( rule_decision = Rule_Decision.INVALID_INPUT, rule_return_message = 'Input Function is not a valid type for rule : rowsToRemoveIfFuncIsTrue' )

    # items() / __name__ behave the same as iteritems() / func_name on
    # Python 2 and, unlike them, also exist on Python 3.
    for cfgFileName, colCfg in cfg.items():
        # Only the catch-all entry and the entry for this file type apply.
        if cfgFileName != File_Name_All and cfgFileName != fileType:
            continue
        for cfgColName, removeFuncs in colCfg.items():
            if cfgColName != colName:
                continue
            for removeFunc, kwargs in removeFuncs:
                # kwargs is passed positionally as a single dict, not unpacked.
                if removeFunc( val, kwargs ):
                    ruleName = getattr( removeFunc, '__name__', repr( removeFunc ) )
                    return ValidatorResponse( rule_decision = Rule_Decision.FAILED, rule_name = ruleName )
    return ValidatorResponse( rule_decision = Rule_Decision.SUCCESS )
用于检查值中是否包含数字的函数如下:
def removeFunc_noDigit( val, kwargs = def_kwargs ):
    ''' Remove-predicate: return whatever hasDigit reports for val.

    val    : value to inspect.
    kwargs : options dict forwarded to hasDigit; when empty or None,
             def_kwargs is forwarded instead.

    NOTE(review): despite the "noDigit" name, the result is hasDigit's own
    return value, so the row is presumably flagged when val contains a
    digit -- confirm against hasDigit.
    '''
    # The caller's kwargs used to be ignored (def_kwargs was always passed).
    # Falling back on an empty dict keeps existing config entries, which pass
    # {}, behaving exactly as before.
    return hasDigit( val, kwargs = kwargs if kwargs else def_kwargs )
现在,我还想检查某一列中是否存在重复值,如果存在,就删除对应的整行。我的做法如下:
# Keys of every value already passed to removeFunc_hasDuplicatesInColumn.
# NOTE(review): this is module-level state that is never cleared here, so it
# is shared by every column / file checked in the same process -- confirm the
# caller resets it (or only uses this rule once) where that matters.
seen = set()


def removeFunc_hasDuplicatesInColumn( val, kwargs = def_kwargs ):
    ''' Remove-predicate: return True when val has been seen before.

    val    : value to check; the first occurrence is recorded in the
             module-level seen set and yields False, later occurrences
             yield True.
    kwargs : accepted for the common predicate signature; not used.

    Unhashable values (e.g. a pandas Series, whose hash raises TypeError
    "'Series' objects are mutable, thus they cannot be hashed") are tracked
    through a hashable stand-in key instead of crashing the set lookup.
    NOTE(review): if val really is a whole column, de-duplicating upstream
    with DataFrame.duplicated / drop_duplicates is likely the better fix.
    '''
    try:
        hash( val )
    except TypeError:
        # Prefer tolist() when available: repr() of a long Series / ndarray is
        # truncated, so different contents could otherwise share one key.
        toList = getattr( val, 'tolist', None )
        plain = toList() if callable( toList ) else val
        # Tagged tuple so the stand-in cannot collide with a real string value.
        key = ( '__unhashable__', repr( plain ) )
    else:
        key = val

    if key in seen:
        return True
    seen.add( key )
    return False
但是,我收到以下错误:
“TypeError: 'Series' objects are mutable, thus they cannot be hashed”('Series' 对象是可变的,因此无法被哈希)。我应该如何检查某一列中是否存在重复值?