Question

我有一个矩阵格式的文本文件，我想读取该文件并创建树状图，但是我遇到错误。

这是我的文本文件的内容：

Kaynak：YSK

Adalar

伊斯坦布尔的位置Adalar.svg

d•t

Toplam secmen sayisi Toplam sandik sayisi

12,369 45

Adaylik Sonuclar

吻Parti Aday Oy sayisi Oy orani

CHP Cumhuriyet Halk Partisi Atilla Aytac 5.207％50.1

AK Parti Adalet ve Kalkinma Partisi Coskun Ozden 4.381％42.1

import re
import feedparser
import clusters

def getwordcounts(url):

    print url
    d = feedparser.parse(url)
    wc = {}

    for e in d.entries:
        if 'summary' in e:
            summary = e.summary
        else:
            summary = e.description

        words = getwords(e.title + ' ' + summary)
        for word in words:
            wc.setdefault(word, 0)
            wc[word] += 1
    try:
        return d.feed.title, wc
    except AttributeError:
        return None,None
def getwords(html):

    txt = re.compile(r'<[^>]+>').sub('',html)

    words = re.compile(r'[^A-Z^a-z]+').split(txt)

    return [word.lower() for word in words if word != '']

apcount = {}
wordcounts = {}
feedlist = []

for feedurl in file('cc.txt'):
    title, wc = getwordcounts(feedurl)
    if title != None:
        feedlist.append(feedurl)
        wordcounts[title] = wc
        for word,count in wc.items():
            apcount.setdefault(word,0)
            apcount[word] += 1


wordlist = []
for w,bc in apcount.items():
    frac = float(bc) / len(feedlist)
    if frac > 1 and frac < 2:
        wordlist.append(w)

out = file('sa.txt','w')
out.write('Blog')
for word in wordlist:
    out.write('\t%s' % word)
out.write('\n')
for blog_title, wc in wordcounts.items():
    blog_title = blog_title.encode('ascii'
,'ignore')
    out.write(blog_title)
    for word in wordlist:
        if word in wc:
            out.write('\t%d' % wc[word])
        else:
            out.write('\t0')
    out.write('\n')


blognames, words, data = clusters.readfile('sa.txt')
clust = clusters.hcluster(data)
clusters.drawdendrogram (clust, blognames, jpeg = 'cl.jpg')

reload(clusters)
clusters.printclust(clust,labels = blognames)

这是我的集群功能；

def readfile(filename):
  lines=[line for line in file(filename)]

  # First line is the column titles
  colnames=lines[0].strip().split("\t")[1:]
  rownames=[]
  data=[]
  for line in lines[1:]:
    p=line.split('\t')
    # First column in each row is the rowname
    rownames.append(p[0])
    # The data for this row is the remainder of the row
    data.append([float(x) for x in p[1:]])
  return rownames,colnames,data

错误代码： colnames = lines [0] .strip（）。split（“ \ t”）[1：] IndexError：列表索引超出范围

Answer 1

错误是因为第一行中没有制表符，并且该行根本没有被分割。或者，如果第一个字符是制表符，则可以使用.strip（）命令将其删除。

您可以在显示特殊字符的同时打印该行，以检查选项卡字符是否存在：

print repr(lines[0])

读取作为矩阵的文本文件

1 个答案: