我正在使用嵌套字典构建一棵树。树中的根是句子的起始词,然后边是按顺序形成的。 Children
包含每个节点的子项列表。
当在某处检测到时,每个节点的权重(value
)增加1 /因子(因子可以假设为1)
def constructtree(lst,tree,factor):
if lst==[]:
return {}
else:
word =lst[0]
if not tree.has_key(word):
tree[word]={'name':word,'value':1/factor,"children":{}}
tree[word]["children"]=constructtree(lst[1:],tree[word]["children"],factor)
else:
#print 22
tree[word]["value"]+=1/factor
tree[word]["children"]=constructtree(lst[1:],tree[word]["children"],factor)
return tree
def doall(wl,tree):
for x in wl:
#print 11,x
tree2=constructtree(x,tree,1)
tree=tree2
#print 1,tree
return tree
mnn= doall(wordList2,{})
所以真正的问题是嵌套字典完全依赖于树(大小???),这让我完全糊涂了。就像它可能适用于列表中的10个字符串但不适用于20个字符串。
可能是什么原因?是否总是依赖于字符串的内容? (我知道声音很蹩脚,但我尝试使用重复的模式并获得了正确的结果,但有点自然和独特的模式,它搞砸了!)
编辑:
示例输入
[['obama', 'seems', 'like', 'a', 'good', 'person', 'with', 'good', 'intentions', "that's", 'enough', 'for', ''], ['obama', 'listens', 'to', 'nickelback'], ['obama', 'vote', 'to', 'romney', 'vote', 'http://t.co/9pllsq97\xe2\x80\xa6'], ['obama', 'have', 'been', 'a', 'man-made', 'disaster'], ['obama', 'killed', 'osama', 'he', 'deserves', 'another', '4', 'years', '#gtawinohio'], ['obama', 'even', 'though', "i'm", 'a', 'rep', 'he', 'has', 'a', 'better', 'vision', 'and', "he's", 'proven', 'himself', '#why', 'i', 'voted', 'for', 'obama'], ['obama', 'likes', 'blink-182', 'and', 'romney', 'likes', 'nicki', 'minaj', 'your', 'choice', 'america'], ['obama', 'votes', 'into', 'romney', 'votes', 'http://t.co/n7h', ''], ['obama', 'pack', 'his', 'shit', 'and', 'get', 'out', 'lol'], ['obama', '\xe2\x9d\x92', 'romney', '\xe2\x9c\x94', 'vader'], ['obama', 'for', 'equal', 'marriage', 'rights'], ['obama', 'just', 'acts', 'cool', 'and', 'he', 'has', 'it', 'unfortunately'], ['obama', 'wears', 'crocs'], ['obama'], ['obama', 'talks', 'about', 'the', 'nation', 'romney', 'says', '"i', 'obama', 'says', '"we', 'pay', 'attention', 'to', 'the', 'small', 't', ''], ['obama', 'as', 'president', '#rns'], ['obama', 'felicit\xc3\xb3', 'a', 'romney', 'por', '"su', 'en\xc3\xa9rgico', 'trabajo', 'duranta', 'la', 'campa\xc3\xb1a', 'electoral', 'http://t.co/joj5bth3'], ['obama', 'button', '\xf0\x9f\x87\xba\xf0\x9f\x87\xb8', 'http://t.co/jpragtuk'], ['obama', 'or', 'no', 'health', 'care', 'obama', 'or', 'no', 'jobs', 'obama', 'or', 'no', "women's", 'rights', 'obama', 'or', 'no', 'student', 'loans', 'the', 'choice', 'is', 'yours', ''], ['obama', 'lol', 'quit', 'frontin', 'and', 'that', 'counts', 'as', '2', 'votes', 'since', 'that', 'rat', 'on', 'his', 'head', 'got', 'its', 'own', ''], ['obama', 'to', 'romney', 'in', 'pennsylvania', 'http://t.co/nkau67dd'], ['obama', 'wins', 'virginia'], ['obama', 'talks', 'about', 'the', 'nation', 'romney', 'says', '"i', 'obama', 'says', '"we', 'pay', 'attention', 'to', 'the', 'small', 't', ''], ['obama', 'helps', 'out', 'a', 'lot', 'of', 'people', 'but', 'that', 'help', 'is', 'the', 'reason', 'the', 'economy', 'is', 'crashing', 'people', 'care', 'about', 'themsleves', 'not', 'our', 'country'], ['obama', 'to', 'romney', 'in', 'pennsylvania', 'http://t.co/nkau67dd'], ['obama', "doesn't", 'win', 'then', 'the', "world's", 'gone', 'mad'], ['obama', 'did', 'a', 'wonderful', 'job', 'at', 'reaching', 'out', 'to', 'our', 'generation'], ['obama', "doesn't", 'win', "i'll", 'lose', 'faith', 'in', 'humanity', 'http://t.co/jhhe1qlo'], ['obama', 'was', 'president', 'wen', 'i', 'got', 'my', 'first', 'job', '', '#rt'], ['obama', 'win'], ['obama', 'from', 'britain', '(where', 'it', "doesn't", 'count)', '@johnfugelsang'], ['obama', 'will', 'grab', 'the', 'early', 'lead', 'today......until', 'all', 'the', 'republicans', 'get', 'off', 'work'], ['obama', 'says'], ['obama', 'votes', 'to', 'romney', 'votes', 'http://t.co/ohcmiccl', '#tech', '#technology'], ['obama', 'supports', 'coat', 'hanger', 'abortion'], ['obama', 'votes', 'to', 'romney', 'votes', 'http://t.co/r4wbqcs6', '@suryaray', '@suryaray', '@suryaray'], ['obama', 'bitches', '', 'http://t.co/3nvdpb2n'], ['obama', 'to', 'romney', 'in', 'pennsylvania', 'http://t.co/nkau67dd'], ['obama', 'per', 'far', 'felice', 'gerardah', 'noi', 'americani', 'non', 'siamo', 'tanto', 'imbecilli,fidatevi', 'e', 'calmatevi'], ['obama', 'to', 'win', '#govote'], ['obama', 'to', 'romney', 'in', 'pennsylvania', 'http://t.co/osmpa9u8\xe2\x80\x9d'], ['obama', 'en', '2008', 'quatre', 'ans', 'plus', 'tard', 'le', 'candidat', 'http://t.co/amhaon2s'], ['obama', 'is', 'down', '3%', 'because', 'of', 'how', 'many', 'idiots', 'are', 'posting', 'pics', 'on', 'instagram', 'and', 'etc', '', '', 'please', 'be', 'smart', 'and', 'rt', 'this', 's', ''], ['obama', 'pollster', '&', 'rs', 'who', 'have', 'spoken', 'to', "mitt's", 'chicago', 'sure', 'of', 'win', 'b', ''], ['obama'], ['obama', 'is', 'a', 'loser', 'roflmfao', ';-d'], ['obama', 'losin', 'votes', 'because', 'ppl', 'postin', 'ballots', 'and', 'shit', 'smh'], ['obama', 'is', 'humble', '', "that's", 'why', 'everyone', 'loves', 'him', 'its', 'not', 'because', "he's", 'black', 'its', 'because', 'he', 'understands', "us.can't", 'say', 'th', ''], ['obama', 'supporters'], ['obama', 'currently', 'leads', 'romney', 'by', '15', 'with', '259', 'current', 'electoral', 'votes', 'romney', 'has', '244']]
因此,对于此输入,树具有缺少的节点
结果
mnn={'obama': {'name': 'obama', 'value': 50, 'children': {'is': {'name': 'is', 'value': 2, 'children': {'a': {'name': 'a', 'value': 1, 'children': {'loser': {'name': 'loser', 'value': 1, 'children': {'roflmfao': {'name': 'roflmfao', 'value': 1, 'children': {';-d': {'name': ';-d', 'value': 1, 'children': {}}}}}}}}, 'humble': {'name': 'humble', 'value': 1, 'children': {'': {'name': '', 'value': 1, 'children': {"that's": {'name': "that's", 'value': 1, 'children': {'why': {'name': 'why', 'value': 1, 'children': {'everyone': {'name': 'everyone', 'value': 1, 'children': {'loves': {'name': 'loves', 'value': 1, 'children': {'him': {'name': 'him', 'value': 1, 'children': {'its': {'name': 'its', 'value': 1, 'children': {'not': {'name': 'not', 'value': 1, 'children': {'because': {'name': 'because', 'value': 1, 'children': {"he's": {'name': "he's", 'value': 1, 'children': {'black': {'name': 'black', 'value': 1, 'children': {'its': {'name': 'its', 'value': 1, 'children': {'because': {'name': 'because', 'value': 1, 'children': {'he': {'name': 'he', 'value': 1, 'children': {'understands': {'name': 'understands', 'value': 1, 'children': {"us.can't": {'name': "us.can't", 'value': 1, 'children': {'say': {'name': 'say', 'value': 1, 'children': {'th': {'name': 'th', 'value': 1, 'children': {'': {'name': '', 'value': 1, 'children': {}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}, 'currently': {'name': 'currently', 'value': 1, 'children': {'leads': {'name': 'leads', 'value': 1, 'children': {'romney': {'name': 'romney', 'value': 1, 'children': {'by': {'name': 'by', 'value': 1, 'children': {'15': {'name': '15', 'value': 1, 'children': {'with': {'name': 'with', 'value': 1, 'children': {'259': {'name': '259', 'value': 1, 'children': {'current': {'name': 'current', 'value': 1, 'children': {'electoral': {'name': 'electoral', 'value': 1, 'children': {'votes': {'name': 'votes', 'value': 1, 'children': {'romney': {'name': 'romney', 'value': 1, 'children': {'has': {'name': 'has', 'value': 1, 'children': {'244': {'name': '244', 'value': 1, 'children': {}}}}}}}}}}}}}}}}}}}}}}}}}}, 'supporters': {'name': 'supporters', 'value': 1, 'children': {}}, 'losin': {'name': 'losin', 'value': 1, 'children': {'votes': {'name': 'votes', 'value': 1, 'children': {'because': {'name': 'because', 'value': 1, 'children': {'ppl': {'name': 'ppl', 'value': 1, 'children': {'postin': {'name': 'postin', 'value': 1, 'children': {'ballots': {'name': 'ballots', 'value': 1, 'children': {'and': {'name': 'and', 'value': 1, 'children': {'shit': {'name': 'shit', 'value': 1, 'children': {'smh': {'name': 'smh', 'value': 1, 'children': {}}}}}}}}}}}}}}}}}}}}}