更新了更新:
我做了以下工作: 1.如果if-elif结构与if-elif-else替换(见下文)。 2.将dec评估为一个字符串(即dec =='1'而不是dec == 1)
if len(SframeDup.index) > 0 and dec == '1':
SframeDup.to_csv('NWEA CSVs/Students/StudentDuplicates.csv', sep=',')
print ("%d instances of repeated student IDs detected." % len(SframeDup.index))
print ("See StudentDuplicates.csv for duplicates.")
print ("\nThis program will now stop.")
raise SystemExit
#quit() and exit() work too, but only in the editor
#doing this in Ipython Notebook will restart the kernal and require
#re-running and re-compiling preceeding code
elif len(SframeDup.index) >0 and dec == '2':
print ("%d instances of repeated student IDs detected." % len(SframeDup.index))
print ("See StudentDuplicates.csv for duplicates.")
Sframe['dup_check_1'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = False)
Sframe['dup_check_2'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = True)
Sframe = Sframe[(Sframe['dup_check_1'] == False) & (Sframe['dup_check_2'] == False)]
del Sframe['dup_check_1'], Sframe['dup_check_2']
else:
print ("No duplicates found. Oh yeah!")
更新
虽然我已尽力“继续”,但我想尽可能地记录下来。我正在粘贴2套代码;第一次尝试使用if-elif但未能使Sframe摆脱重复。第二个成功地省略了重复,但为了这样做,我不得不摆脱if-elif。
import pandas as pd
import numpy as np
import glob
import csv
import os
import sys
path = r'NWEA CSVs/Students/Raw'
allFiles = glob.glob(path + "/*.csv")
Sframe = pd.DataFrame()
list = []
for file in allFiles:
sdf = pd.read_csv(file,index_col=None, header=0)
list.append(sdf)
Sframe = pd.concat(list,ignore_index=False)
Sframe.to_csv('NWEA CSVs/Students/OutStudents.csv', sep=',')
Sframe["TermSchoolStudent"]=Sframe["TermName"]+Sframe["SchoolName"]+\
Sframe["StudentID"].map(str)
SframeDup = Sframe[Sframe.duplicated("TermSchoolStudent") == True]
if len(SframeDup.index) > 0:
SframeDup.to_csv('NWEA CSVs/Students/StudentDuplicates.csv', sep=',')
print ("%d instances of repeated student IDs detected." % len(SframeDup.index))
print ("See StudentDuplicates.csv for duplicates.")
print ("Enter 1 to make corrections and rerun program. \
\nEnter 2 to proceed without repeated student IDs.")
dec = input("-->")
if dec == 1:
print ("This program will now stop.")
print ("See StudentDuplicates.csv for duplicates.")
raise SystemExit
elif dec == 2:
Sframe['dup_check_1'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = False)
Sframe['dup_check_2'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = True)
Sframe = Sframe[(Sframe['dup_check_1'] == False) & (Sframe['dup_check_2'] == False)]
del Sframe['dup_check_1'], Sframe['dup_check_2']
print (len(Sframe))
输出:2840
import pandas as pd
import numpy as np
import glob
import csv
import os
import sys
path = r'NWEA CSVs/Students/Raw'
allFiles = glob.glob(path + "/*.csv")
Sframe = pd.DataFrame()
list = []
for file in allFiles:
sdf = pd.read_csv(file,index_col=None, header=0)
list.append(sdf)
Sframe = pd.concat(list,ignore_index=False)
Sframe.to_csv('NWEA CSVs/Students/OutStudents.csv', sep=',')
Sframe["TermSchoolStudent"]=Sframe["TermName"]+Sframe["SchoolName"]+\
Sframe["StudentID"].map(str)
SframeDup = Sframe[Sframe.duplicated("TermSchoolStudent") == True]
if len(SframeDup.index) > 0:
SframeDup.to_csv('NWEA CSVs/Students/StudentDuplicates.csv', sep=',')
print ("%d instances of repeated student IDs detected." % len(SframeDup.index))
print ("See StudentDuplicates.csv for duplicates.")
Sframe['dup_check_1'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = False)
Sframe['dup_check_2'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = True)
Sframe = Sframe[(Sframe['dup_check_1'] == False) & (Sframe['dup_check_2'] == False)]
del Sframe['dup_check_1'], Sframe['dup_check_2']
print (len(Sframe))
输出:2834
**
** 我认为这是一个简单的问题,答案对我来说并不是一个新的程序员。基本上,我有一个数据帧(Sframe),我的程序检查它是否重复。如果用户指示程序应该在没有重复项的情况下继续进行,则从数据帧中删除重复项(及其唯一值),并且应使Sframe等于Sframe并删除重复项(因此用修改后的Sframe替换原始Sframe)。之后,在主程序中,如果用户如上所述选择“2”,则Sframe应为修改版本。否则,如果首先没有检测到重复项(并且从未输入用户输入),则应使用原始Sframe。
我的代码看起来像这样:
Import Pandas as pd
Sframe = pd.DataFrame()
这里,代码检查重复项。如果它们存在,则以下运行。 如果它们不存在,则跳过以下内容,并按原始定义使用Sframe。
这是假设检测到重复项的代码:
dec = input("-->")
if dec == 1:
print ("This program will now stop.")
print ("this_file.csv to resolve a problem.")
raise SystemExit
elif dec == 2:
# add "Repeated" field to student with duplicates table. Values="NaN"
SframeDup["Repeated"]="NaN"
# New table joins (left, inner) Sframe with duplicates table (SframeDup) to
# identify all rows of duplicates (including the unique values that had
# duplicates)
SframeWDup=pd.merge(Sframe, SframeDup, on='identifier', how='left')
# Eliminate all repeating rows, including originals as pulled during left join
SframeWODup=SframeWDup[SframeWDup.Repeated_y!="NaN"]
# So here, in my mind, I should be able to just do this and the rest of
# the code should treat replace Sframe with SframeWODup (without the found
# duplicates)...
Sframe = SframeWODup
但它不起作用。我知道这一点是因为当我选择len(Sframe)
以消除重复项(及其唯一的原始值)后检查2
时,我得到的数字与处理重复项之前的数字相同。
提前感谢您的帮助。如果不清楚,我将很乐意澄清。
更新: Sframe.Types TermName对象
区域名对象
SchoolName对象
StudentLastName对象
StudentFirstName对象
StudentMI对象
StudentID对象
StudentDateOfBirth对象
StudentEthnicGroup对象
StudentGender对象
成绩对象
TermSchoolStudent对象
dtype:object
Sframe.head()返回以下链接中图像中的表: https://drive.google.com/file/d/0B1cr7dwUpr_JR3d0YzlwLWFwQU0/view?usp=sharing
答案 0 :(得分:0)
试试Sframe = SframeWODup.copy()
更新:
你能用这段代码来达到预期的效果吗?
# Made-up data
Sframe = pd.DataFrame({'TermName': ['Fall', 'Fall', 'Fall', 'Fall'],
'DistrictName': ['Downtown', 'Downtown', 'Downtown', 'Downtown'],
'SchoolName': ['Seattle Central', 'Ballard', 'Ballard', 'Ballard'],
'StudentLastName': ['Doe', 'Doe', 'Doe', 'Doe'],
'StudentFirstName': ['John', 'Jane', 'Jane', 'Jane'],
'StudentMI': ['X', 'X', 'X', 'X'],
'StudentID': ['1234', '9876', '9876', '9876'],
'StudentDateOfBirth': ['2000-01-01', '2001-01-01', '2001-01-01', '2001-01-01'],
'StudentEthnicGroup': ['Asian American', 'White', 'White', 'White'],
'StudentGender': ['M', 'F', 'F', 'F'],
'Grade': ['10th', '9th', '9th', '9th'],
'TermSchoolStudent': ['Z', 'Z', 'Z', 'Z']})
# Remove duplicates based upon StudentID, in-place (i.e., modify object 'Sframe').
# UPDATE: I read that you want duplicates completely removed from data frame.
# Sframe.drop_duplicates(cols = ['StudentID'], take_last = False, inplace = True)
Sframe['dup_check_1'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = False)
Sframe['dup_check_2'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = True)
Sframe = Sframe[(Sframe['dup_check_1'] == False) & (Sframe['dup_check_2'] == False)]
del Sframe['dup_check_1'], Sframe['dup_check_2']
答案 1 :(得分:0)
我做了以下工作并且工作正常:1。如果if-elif结构使用if-elif-else替换(见下文)。 2.将dec评为字符串(即dec ==' 1'而不是dec == 1)
if len(SframeDup.index) > 0 and dec == '1':
SframeDup.to_csv('NWEA CSVs/Students/StudentDuplicates.csv', sep=',')
print ("%d instances of repeated student IDs detected." % len(SframeDup.index))
print ("See StudentDuplicates.csv for duplicates.")
print ("\nThis program will now stop.")
raise SystemExit
#quit() and exit() work too, but only in the editor
#doing this in Ipython Notebook will restart the kernal and require
#re-running and re-compiling preceeding code
elif len(SframeDup.index) >0 and dec == '2':
print ("%d instances of repeated student IDs detected." % len(SframeDup.index))
print ("See StudentDuplicates.csv for duplicates.")
Sframe['dup_check_1'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = False)
Sframe['dup_check_2'] = Sframe.duplicated(cols = ['TermName', 'SchoolName', 'StudentID'], take_last = True)
Sframe = Sframe[(Sframe['dup_check_1'] == False) & (Sframe['dup_check_2'] == False)]
del Sframe['dup_check_1'], Sframe['dup_check_2']
else:
print ("No duplicates found. Oh yeah!")