我有一个字典,以完整的类名作为键,每个键对应一个列表,列表中包含该类在不同 jar 包中出现的次数,以及它所在的各个 jar 包的名称。
例如:
# Example input: fully-qualified class name -> [jar count, jar name(s)...].
classToJars = {
    'com.sun.xml.ws.policy.PolicyMapKey.class': [1, 'policy-2.3.1.jar'],
    'com.sun.xml.ws.policy.PolicyMerger.class': [1, 'policy-2.3.1.jar'],
    'com.sun.xml.ws.policy.PolicyAssertion.class': [1, 'policy-2.3.1.jar'],
    'com.sun.xml.bind.AccessorFactory.class': [1, 'jaxb-impl-2.2.6.jar'],
    'com.sun.xml.bind.AccessorFactoryImpl.class': [1, 'jaxb-impl-2.2.6.jar'],
    'com.sun.xml.bind.AnyTypeAdapter.class': [1, 'jaxb-impl-2.2.6.jar'],
    'org.apache.mina.integration.jmx.IoSessionManager.class': [
        1, 'mina-integration-jmx-1.1.7.jar',
    ],
    'org.apache.mina.integration.jmx.IoServiceManager.class': [
        1, 'mina-integration-jmx-1.1.7.jar',
    ],
    'org.apache.log4j.Appender.class': [
        2, 'log4j-1.2.14.jar', 'log4j-1.2.15.jar',
    ],
    'org.apache.log4j.AppenderSkeleton.class': [
        2, 'log4j-1.2.14.jar', 'log4j-1.2.15.jar',
    ],
    'com.sun.activation.registries.LineTokenizer.class': [
        1, 'activation-1.1.jar',
    ],
    'com.sun.activation.registries.LogSupport.class': [
        1, 'activation-1.1.jar',
    ],
    'com.sun.istack.Builder.class': [
        2, 'jaxb-impl-2.2.6.jar istack-commons-runtime-2.4.jar',
    ],
    'com.sun.istack.ByteArrayDataSource.class': [
        2, 'jaxb-impl-2.2.6.jar istack-commons-runtime-2.4.jar',
    ],
    'com.reuters.rfa.ansipage.Page.class': [1, 'rfa-7.2.0.E2.jar'],
    'com.reuters.rfa.ansipage.PageUpdate.class': [1, 'rfa-7.2.0.E2.jar'],
    'org.apache.http.impl.io.AbstractMessageWriter.class': [
        1, 'rfa-7.2.0.E2.jar',
    ],
    'org.apache.http.impl.io.ChunkedOutputStream.class': [
        1, 'rfa-7.2.0.E2.jar',
    ],
}
这是一个很大的字典,包含数千个键值对,涉及大量的 jar 包。我们的想法是对这个字典进行折叠:如果多个键对应的值相同,就把这些键折叠为它们的最长公共子串(即共同的包名前缀)。
例如:当我运行折叠函数时,上面的字典应该缩减为 8 行,如下所示:
'com.sun.xml.ws.policy' : [ 1, 'policy-2.3.1.jar'],
'com.sun.xml.bind' : [1, 'jaxb-impl-2.2.6.jar'],
'org.apache.mina.integration.jmx' : [1, 'mina-integration-jmx-1.1.7.jar'],
'org.apache.log4j' : [2, 'log4j-1.2.14.jar', 'log4j-1.2.15.jar'],
'com.sun.activation.registries' : [1, 'activation-1.1.jar'],
'com.sun.istack' : [2, 'jaxb-impl-2.2.6.jar istack-commons-runtime-2.4.jar'],
'com.reuters.rfa.ansipage' : [1, 'rfa-7.2.0.E2.jar'],
'org.apache.http.impl.io' : [1, 'rfa-7.2.0.E2.jar'],
等等。
因为 com.reuters.rfa 和 org.apache.http 之间没有任何共同之处,如果直接取最长公共子串,就会得到一个空键。
在这种情况下,应该分别保留 com.reuters.rfa 和 org.apache.http 这两个键。
关于如何实现这一目标的任何想法?
答案 0 :(得分:1)
这是你想要的吗?
import os
# Sample data: fully-qualified class name -> [jar count, jar name(s)...].
classToJars = {
    'com.sun.xml.ws.policy.PolicyMapKey.class': [1, 'policy-2.3.1.jar'],
    'com.sun.xml.ws.policy.PolicyMerger.class': [1, 'policy-2.3.1.jar'],
    'com.sun.xml.ws.policy.PolicyAssertion.class': [1, 'policy-2.3.1.jar'],
    'com.sun.xml.bind.AccessorFactory.class': [1, 'jaxb-impl-2.2.6.jar'],
    'com.sun.xml.bind.AccessorFactoryImpl.class': [1, 'jaxb-impl-2.2.6.jar'],
    'com.sun.xml.bind.AnyTypeAdapter.class': [1, 'jaxb-impl-2.2.6.jar'],
    'org.apache.mina.integration.jmx.IoSessionManager.class': [
        1, 'mina-integration-jmx-1.1.7.jar',
    ],
    'org.apache.mina.integration.jmx.IoServiceManager.class': [
        1, 'mina-integration-jmx-1.1.7.jar',
    ],
    'org.apache.mina.integration.jmx.IoSessionManagerMBean.class': [
        1, 'mina-integration-jmx-1.1.7.jar',
    ],
    'org.apache.log4j.Appender.class': [
        2, 'log4j-1.2.14.jar', 'log4j-1.2.15.jar',
    ],
    'org.apache.log4j.AppenderSkeleton.class': [
        2, 'log4j-1.2.14.jar', 'log4j-1.2.15.jar',
    ],
    'org.apache.log4j.AsyncAppender.class': [
        2, 'log4j-1.2.14.jar log4j-1.2.15.jar',
    ],
    # ...
}
#
# from http://stackoverflow.com/a/21419164/866915
#
def common_prefix(names):
    """Return the longest dotted prefix shared by every name in *names*.

    Names are compared component by component (split on '.'), not
    character by character, so 'a.Appender' and 'a.AppenderSkeleton'
    share only 'a'.

    Args:
        names: iterable of dot-separated names.

    Returns:
        The shared leading components joined with '.', or '' when the
        names have no leading component in common or *names* is empty.
    """
    split_names = [n.split('.') for n in names]
    if not split_names:
        return ''
    shared = []
    # zip stops at the shortest name, so the prefix can never run past it.
    for parts in zip(*split_names):
        first = parts[0]
        if any(p != first for p in parts):
            break
        shared.append(first)
    return '.'.join(shared)
def min_prefix(name, depth=3):
    """Return the first *depth* dot-separated components of *name*.

    Args:
        name: dot-separated class name.
        depth: number of leading components to keep (default 3).

    Returns:
        The leading components joined with '.'; the whole of *name*
        when it has *depth* components or fewer.
    """
    return '.'.join(name.split('.')[:depth])
# Derive a grouping key for every class from its jar names (the leading
# count at index 0 is skipped) and remember the jar list behind each key.
jarsForKey = {}
keyForClass = {}
for c, jars in classToJars.items():
    s = '|'.join(jars[1:])
    jarsForKey[s] = jars
    keyForClass[c] = s

# group together classes based on their key
sameKey = {}
for c, s in keyForClass.items():
    sameKey.setdefault(s, []).append(c)

# For each set of classes with the same key, report the longest common
# prefix. Keys are visited in sorted order so the output is stable from
# run to run.
for s in sorted(sameKey):
    jars = jarsForKey[s]
    # Partition the classes by their min_prefix() so that unrelated
    # packages living in the same jar are not collapsed into an empty
    # prefix.
    group = {}
    for c in sameKey[s]:
        group.setdefault(min_prefix(c), []).append(c)
    # find the common prefix for each group
    # NOTE(review): a group holding a single class reports that class's
    # full name, since a lone name is its own common prefix.
    for m in sorted(group):
        prefix = common_prefix(group[m])
        # Single-argument print(...) gives the same text on Python 2 and 3.
        print("%s ==> %s" % (prefix, jars))