#!/usr/bin/env python2.7
import vobject
abinfile='/foo/bar/dir/infile.vcf' #ab stands for address book
aboutfile='/foo/bar/dir/outfile.vcf'
def eliminate_vcard_duplicates(abinfile, aboutfile):
    """Copy the vCards of one address book to another, dropping exact duplicates.

    Two cards are duplicates when their ``serialize()`` output is identical.
    The first occurrence of every card is kept and the original order is
    preserved.  An input file without any card yields an empty output file.

    Args:
        abinfile: Path of the address book (.vcf) file to read.
        aboutfile: Path of the de-duplicated address book file to write.
    """
    seen = set()
    unique_cards = []
    # Read the Address Book IN FILE, keeping only the first copy of each card.
    with open(abinfile) as source_file:
        for vcard in vobject.readComponents(source_file):
            # Serialize each card exactly once: the text is both the
            # duplicate key (O(1) set lookup) and the data written below.
            serialized = vcard.serialize()
            if serialized not in seen:
                seen.add(serialized)
                unique_cards.append(serialized)
    # Finally write the de-duplicated cards to the Address Book OUT FILE.
    with open(aboutfile, 'w') as destination_file:
        for serialized in unique_cards:
            destination_file.write(serialized)
eliminate_vcard_duplicates(abinfile, aboutfile)
上面的代码可以工作,并会创建一个不含完全相同重复项(即序列化结果相同的卡片)的新文件。我知道这段代码存在一些效率问题:它的复杂度是 n 平方,而本可以做到 n * log n;每张 vcard 本可以只序列化一次;循环的写法低效,等等。在这里,我只想提供一段简短的代码,用来说明其中一个我不知道如何解决的问题。
我不确定如何优雅地解决的问题是:如果某些卡中的字段被扰乱,它将无法检测到它们是否相等。有没有办法用vobject,re或其他方法检测这样的重复?
测试所用文件的内容如下,其中包含四张相同的 vcard(电话号码顺序被打乱会使代码失效,而电子邮件位置被打乱则不会):
BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
EMAIL;TYPE=INTERNET:foobar1@foo.bar.com
TEL;TYPE=CELL:123456789
TEL;TYPE=CELL:987654321
END:VCARD
BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
EMAIL;TYPE=INTERNET:foobar1@foo.bar.com
TEL;TYPE=CELL:123456789
TEL;TYPE=CELL:987654321
END:VCARD
BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
TEL;TYPE=CELL:123456789
TEL;TYPE=CELL:987654321
EMAIL;TYPE=INTERNET:foobar1@foo.bar.com
END:VCARD
BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
TEL;TYPE=CELL:987654321
TEL;TYPE=CELL:123456789
EMAIL;TYPE=INTERNET:foobar1@foo.bar.com
END:VCARD
以上代码无法检测出这四张卡片完全相同,因为最后一张卡片的电话号码顺序被打乱了。
另外,如果有人有更快的算法并愿意分享,那就再好不过了。上面的代码处理一个包含 30,000 张 vcard 的文件需要好几天……
答案 0 :(得分:1)
您可能已经注意到,如果您调用 `.serialize()` 方法,`EMAIL` 会排在 `FN` 之前。但不幸的是,电话号码没有被排序。如果它们也被排序了,您就可以把序列化后的各个组件添加到一个集合(set)中,让唯一的哈希值自动处理重复出现的情况。
如果您查看从生成器 `vobject.readComponents()` 得到的对象(例如使用 `type()`),您会看到它是来自模块 `vobject.base` 的 `Component`;对一个实例使用 `dir()`,您会看到方法 `getSortedChildren()`。如果您在源代码中查找它,您将找到:
    def getSortedChildren(self):
        return [obj for k in self.sortChildKeys() for obj in self.contents[k]]
以及紧挨在它上方的 `sortChildKeys()`:
    def sortChildKeys(self):
        try:
            first = [s for s in self.behavior.sortFirst if s in self.contents]
        except Exception:
            first = []
        return first + sorted(k for k in self.contents.keys() if k not in first)
在示例实例上调用 `sortChildKeys()` 会得到 `['version', 'email', 'fn', 'n', 'tel']`,由此可以得出两个结论:
- `sortFirst` 使 `version` 位于最前面;
- `for obj in self.contents[k]` 未排序,因此您的 TEL 条目也未排序。
解决方案似乎是将 `getSortedChildren()` 重新定义为:
    return [obj for k in self.sortChildKeys() for obj in sorted(self.contents[k])]
但这会导致:
TypeError: '<' not supported between instances of 'ContentLine' and 'ContentLine'
因此您需要为 `ContentLine`(它同样定义在 `vobject.base` 中)提供一些基本的比较操作,完整代码如下:
import vobject
from vobject.base import Component, ContentLine
def gsc(self):
    """Return the children of this component in a fully sorted order.

    Keys are visited in the order given by ``self.sortChildKeys()``; the
    entries stored under each key in ``self.contents`` are additionally
    sorted by their ``str()`` text, so repeated fields (e.g. several TEL
    lines) come out in the same order regardless of input order.

    Sorting with ``key=str`` yields the same ordering as comparing the
    children through a ``str``-based ``__lt__``, but does not require the
    children to define any comparison operator themselves.

    Returns:
        A flat list of the child objects.
    """
    return [
        obj
        for k in self.sortChildKeys()
        for obj in sorted(self.contents[k], key=str)
    ]
# Monkey-patch vobject: every Component now uses gsc as its getSortedChildren.
Component.getSortedChildren = gsc
def ltContentLine(self, other):
    """Return True when the ``str()`` text of self sorts before that of other."""
    own_text = str(self)
    other_text = str(other)
    return own_text < other_text
def eqContentLine(self, other):
    """Return True when self and other have the same ``str()`` text."""
    own_text = str(self)
    other_text = str(other)
    return own_text == other_text
# Monkey-patch vobject: ContentLine instances now compare (< and ==) by their str() text.
ContentLine.__lt__ = ltContentLine
ContentLine.__eq__ = eqContentLine
# De-duplicate the cards of infile.vcf by their serialized text.
# `addresses` is the membership set; `unique_cards` remembers the first-seen
# order so that the output file is written in a stable, reproducible order
# (iterating the set directly gives an arbitrary order).
addresses = set()
unique_cards = []
with open('infile.vcf') as fp:
    for vcard in vobject.readComponents(fp):
        # Handy when exploring a card: type(vcard), dir(vcard),
        # vcard.sortChildKeys(), vcard.contents.keys()
        serialized = vcard.serialize()
        if serialized not in addresses:
            addresses.add(serialized)
            unique_cards.append(serialized)
with open('outfile.vcf', 'w') as fp:
    for a in unique_cards:
        fp.write(a)
# and check
with open('outfile.vcf') as fp:
    print(fp.read(), end="")
答案 1 :(得分:0)
以下是一段更快的代码(快约三个数量级),但仍然只能删除完全相同的重复项……
#!/usr/bin/env python2.7
import vobject
import datetime
abinfile='/foo/bar/dir/infile.vcf' #ab stands for address book
aboutfile='/foo/bar/dir/outfile.vcf'
def eliminate_vcard_duplicatesv2(abinfile, aboutfile):
    """Copy the vCards of one address book to another, dropping exact duplicates.

    Two cards are duplicates when their ``serialize()`` output is identical.
    Every card, including the first one in the file, is considered; the first
    occurrence of each card is kept and the original order is preserved.
    A progress line is printed every 1000 cards, followed by a short summary.

    Args:
        abinfile: Path of the address book (.vcf) file to read.
        aboutfile: Path of the de-duplicated address book file to write.
    """
    # Read the Address Book IN FILE and serialize every card exactly once.
    with open(abinfile) as source_file:
        ablist_serial = [
            vcard.serialize() for vcard in vobject.readComponents(source_file)
        ]
    # Keep the first copy of each serialized card; the set makes the
    # "already there?" check O(1) instead of a scan over all kept cards.
    seen = set()
    ablist_singletons = []
    duplicates = 0
    for i, serialized in enumerate(ablist_serial):
        # Each message is a single pre-formatted string so that the
        # parenthesized print emits the same text on Python 2 and Python 3.
        if i and i % 1000 == 0:
            print("COMPUTED CARD: %d Number of duplicates:  %d Current time: %s"
                  % (i, duplicates, datetime.datetime.now().time()))
        if serialized in seen:
            duplicates += 1
        else:
            seen.add(serialized)
            ablist_singletons.append(serialized)
    print("Length of Original Vcard File:  %d" % len(ablist_serial))
    print("Length of Singleton Vcard File:  %d" % len(ablist_singletons))
    print("Generating Singleton Vcard file and storing it in:  %s" % aboutfile)
    # Finally write the de-duplicated cards to the Address Book OUT FILE.
    with open(aboutfile, 'w') as destination_file:
        for serialized in ablist_singletons:
            destination_file.write(serialized)
eliminate_vcard_duplicatesv2(abinfile, aboutfile)
答案 2 :(得分:0)
Anthon's answer 的变体,使用类装饰器。
import vobject
from vobject.base import Component, ContentLine
def sortedContents(cls):
    """Class decorator installing a fully sorting ``getSortedChildren`` on cls.

    The installed method visits keys in the order given by
    ``self.sortChildKeys()`` and sorts the entries stored under each key in
    ``self.contents`` by their ``str()`` text.  Using ``key=str`` means the
    children do not need to define comparison operators themselves.

    Args:
        cls: The class to patch; it is modified in place.

    Returns:
        The same class object, so the function can be used as a decorator.
    """
    def getSortedChildren(self):
        return [
            obj
            for k in self.sortChildKeys()
            for obj in sorted(self.contents[k], key=str)
        ]
    cls.getSortedChildren = getSortedChildren
    return cls
def sortableContent(cls):
    """Class decorator giving cls ``<`` and ``==`` based on ``str()`` text.

    Args:
        cls: The class to patch; it is modified in place.

    Returns:
        The same class object, so the function can be used as a decorator.
    """
    def __lt__(self, other):
        own_text, other_text = str(self), str(other)
        return own_text < other_text

    def __eq__(self, other):
        own_text, other_text = str(self), str(other)
        return own_text == other_text

    # Attach each comparison under its own dunder name.
    for method in (__lt__, __eq__):
        setattr(cls, method.__name__, method)
    return cls
# Patch the vobject classes so that serialized cards are order-independent.
Component = sortedContents(Component)
ContentLine = sortableContent(ContentLine)
# De-duplicate the cards of infile.vcf by their serialized text.
# `addresses` is the membership set; `unique_cards` remembers the first-seen
# order so that the output file is written in a stable, reproducible order
# (iterating the set directly gives an arbitrary order).
addresses = set()
unique_cards = []
with open('infile.vcf') as infile:
    for vcard in vobject.readComponents(infile):
        serialized = vcard.serialize()
        if serialized not in addresses:
            addresses.add(serialized)
            unique_cards.append(serialized)
with open('outfile.vcf', 'wb') as outfile:
    for address in unique_cards:
        outfile.write(bytes(address, 'UTF-8'))