I am trying to remove elements from a list of strings (read from a file). The elements are themselves a list (in the form of a string, separated by commas).
I want to remove those strings from the list that have the same elements. For eg:
1: GGSIPU,RANK,BTECH,9
2: GGSIPU,BTECH,RANK,9
3: GGSIPU,BTECH,RANK,9
hence lines 2 and 3 should be removed.
here is my code:
# to remove duplicates
with open('itemset3.txt', 'r') as f:
lines = f.readlines()
f.close()
i = 0
while (i<len(lines)):
j = i + 1
temp = []
temp1 = lines[i].split(',')
print 'outer %d %s' % (i,temp1)
temp.append(temp1[0])
temp.append(temp1[1])
temp.append(temp1[2])
while (j<len(lines)):
if all(t in lines[j] for t in temp):
print temp, ' found at ',j,': ',lines[j]
# lines.remove(lines[j])
del lines[j]
j = j + 1
i = i + 1
f = open('itemset3.txt', 'w')
i = 0
while (i<len(lines)):
f.write(lines[i])
i = i + 1
f.close()
and here is the text file
GGSIPU,RANK,BTECH,9
GGSIPU,BTECH,RANK,9
GGSIPU,BTECH,RANK,9
GGSIPU,SEMESTER,RANK,9
GGSIPU,CALCULATOR,RANK,9
GGSIPU,CHECK,RANK,7
GGSIPU,Certified,RANK,7
GGSIPU,Winner,RANK,7
GGSIPU,Application,RANK,7
GGSIPU,Techexpo2015,RANK,7
GGSIPU,Students,RANK,6
RANK,BTECH,GGSIPU,9
RANK,BTECH,GGSIPU,9
RANK,BTECH,GGSIPU,9
RANK,SEMESTER,GGSIPU,9
RANK,SEMESTER,GGSIPU,9
RANK,CALCULATOR,GGSIPU,9
RANK,CALCULATOR,GGSIPU,9
RANK,CHECK,GGSIPU,7
RANK,CHECK,GGSIPU,7
RANK,Certified,GGSIPU,7
RANK,Certified,GGSIPU,7
RANK,Winner,GGSIPU,7
RANK,Winner,GGSIPU,7
RANK,Application,GGSIPU,7
RANK,Application,GGSIPU,7
RANK,Techexpo2015,GGSIPU,7
RANK,Techexpo2015,GGSIPU,7
RANK,Students,GGSIPU,6
RANK,Students,GGSIPU,6
BTECH,SEMESTER,GGSIPU,9
BTECH,CALCULATOR,GGSIPU,9
SEMESTER,CALCULATOR,GGSIPU,9
CHECK,Certified,GGSIPU,7
CHECK,Winner,GGSIPU,7
CHECK,Application,GGSIPU,7
CHECK,Techexpo2015,GGSIPU,7
CHECK,Students,GGSIPU,6
Certified,Winner,GGSIPU,7
Certified,Application,GGSIPU,7
Certified,Techexpo2015,GGSIPU,7
Certified,Students,GGSIPU,6
Winner,Application,GGSIPU,7
Winner,Techexpo2015,GGSIPU,7
Winner,Students,GGSIPU,6
Application,Techexpo2015,GGSIPU,7
Application,Students,GGSIPU,6
Techexpo2015,Students,GGSIPU,6
The question is that after running the code, there are still some redundant (duplicate) lines in the output. How should I rectify it?
here's the ouput upon making tuples:
('Certified', 'Winner', 'GGSIPU', '7')
('RANK', 'Application', 'GGSIPU', '7')
('Techexpo2015', 'Students', 'GGSIPU', '6')
('CHECK', 'Certified', 'GGSIPU', '7')
('RANK', 'SEMESTER', 'GGSIPU', '9')
('Application', 'Techexpo2015', 'GGSIPU', '7')
('GGSIPU', 'SEMESTER', 'RANK', '9')
('CHECK', 'Techexpo2015', 'GGSIPU', '7')
('RANK', 'Winner', 'GGSIPU', '7')
('CHECK', 'Winner', 'GGSIPU', '7')
('Winner', 'Students', 'GGSIPU', '6')
('GGSIPU', 'Winner', 'RANK', '7')
('GGSIPU', 'BTECH', 'RANK', '9')
('RANK', 'Techexpo2015', 'GGSIPU', '7')
('Certified', 'Students', 'GGSIPU', '6')
('GGSIPU', 'CHECK', 'RANK', '7')
('RANK', 'BTECH', 'GGSIPU', '9')
('GGSIPU', 'Students', 'RANK', '6')
('RANK', 'CALCULATOR', 'GGSIPU', '9')
('Winner', 'Techexpo2015', 'GGSIPU', '7')
('GGSIPU', 'Certified', 'RANK', '7')
('RANK', 'CHECK', 'GGSIPU', '7')
('CHECK', 'Application', 'GGSIPU', '7')
('RANK', 'Certified', 'GGSIPU', '7')
('GGSIPU', 'RANK', 'BTECH', '9')
('GGSIPU', 'CALCULATOR', 'RANK', '9')
('CHECK', 'Students', 'GGSIPU', '6')
('GGSIPU', 'Application', 'RANK', '7')
('GGSIPU', 'Techexpo2015', 'RANK', '7')
('Winner', 'Application', 'GGSIPU', '7')
('BTECH', 'SEMESTER', 'GGSIPU', '9')
('Certified', 'Techexpo2015', 'GGSIPU', '7')
('RANK', 'Students', 'GGSIPU', '6')
('SEMESTER', 'CALCULATOR', 'GGSIPU', '9')
('Certified', 'Application', 'GGSIPU', '7')
('Application', 'Students', 'GGSIPU', '6')
('BTECH', 'CALCULATOR', 'GGSIPU', '9')
the lines such as the following are still present
1: ('GGSIPU', 'Application', 'RANK', '7')
2: ('RANK', 'Application', 'GGSIPU', '7')
答案 0 :(得分:0)
with open('C:\Users\DELL\Documents\itemset3.txt', 'r') as f:
lines = f.readlines()
f.close()
linesUp = []
for line in lines:
linesUp.append(tuple(line.replace("\n","").split(',')))
setOfLines = set(linesUp)
我已经从,
处的字符串拆分构造了元组并将它们放入列表中。然后结束创建了一个只消除重复的集合。
在字符串
line
上使用了替换,因为很少有字符串在您的数据中出乎意料地没有新行。
我使用了一小组数据。希望它对你有用
答案 1 :(得分:-1)
coverting lines into tuples a making sets.
allLines = set()
with open('data') as f:
for line in f:
line = line.strip()
line = tuple(line.split(','))
allLines.add(line)
pp(allLines)
{('Application', 'Students', 'GGSIPU', '6'),
('Application', 'Techexpo2015', 'GGSIPU', '7'),
('BTECH', 'CALCULATOR', 'GGSIPU', '9'),
('BTECH', 'SEMESTER', 'GGSIPU', '9'),
('CHECK', 'Application', 'GGSIPU', '7'),
('CHECK', 'Certified', 'GGSIPU', '7'),
('CHECK', 'Students', 'GGSIPU', '6'),
('CHECK', 'Techexpo2015', 'GGSIPU', '7'),
('CHECK', 'Winner', 'GGSIPU', '7'),
('Certified', 'Application', 'GGSIPU', '7'),
('Certified', 'Students', 'GGSIPU', '6'),
('Certified', 'Techexpo2015', 'GGSIPU', '7'),
('Certified', 'Winner', 'GGSIPU', '7'),
('GGSIPU', 'Application', 'RANK', '7'),
('GGSIPU', 'BTECH', 'RANK', '9'),
('GGSIPU', 'CALCULATOR', 'RANK', '9'),
('GGSIPU', 'CHECK', 'RANK', '7'),
('GGSIPU', 'Certified', 'RANK', '7'),
('GGSIPU', 'RANK', 'BTECH', '9'),
('GGSIPU', 'SEMESTER', 'RANK', '9'),
('GGSIPU', 'Students', 'RANK', '6'),
('GGSIPU', 'Techexpo2015', 'RANK', '7'),
('GGSIPU', 'Winner', 'RANK', '7'),
('RANK', 'Application', 'GGSIPU', '7'),
('RANK', 'BTECH', 'GGSIPU', '9'),
('RANK', 'CALCULATOR', 'GGSIPU', '9'),
('RANK', 'CHECK', 'GGSIPU', '7'),
('RANK', 'Certified', 'GGSIPU', '7'),
('RANK', 'SEMESTER', 'GGSIPU', '9'),
('RANK', 'Students', 'GGSIPU', '6'),
('RANK', 'Techexpo2015', 'GGSIPU', '7'),
('RANK', 'Winner', 'GGSIPU', '7'),
('SEMESTER', 'CALCULATOR', 'GGSIPU', '9'),
('Techexpo2015', 'Students', 'GGSIPU', '6'),
('Winner', 'Application', 'GGSIPU', '7'),
('Winner', 'Students', 'GGSIPU', '6'),
('Winner', 'Techexpo2015', 'GGSIPU', '7')}