您好我是python中的初学者,我正在尝试执行此程序来为集合文件创建反向索引:
import sys
import re
from porterStemmer import PorterStemmer
from collections import defaultdict
from array import array
import gc
porter=PorterStemmer()
class CreateIndex:
def __init__(self):
self.index=defaultdict(list) #the inverted index
def getStopwords(self):
'''get stopwords from the stopwords file'''
f=open(self.stopwordsFile, 'r')
stopwords=[line.rstrip() for line in f]
self.sw=dict.fromkeys(stopwords)
f.close()
def getTerms(self, line):
'''given a stream of text, get the terms from the text'''
line=line.lower()
line=re.sub(r'[^a-z0-9 ]',' ',line) #put spaces instead of non-alphanumeric characters
line=line.split()
line=[x for x in line if x not in self.sw] #eliminate the stopwords
line=[ porter.stem(word, 0, len(word)-1) for word in line]
return line
def parseCollection(self):
''' returns the id, title and text of the next page in the collection '''
doc=[]
for line in self.collFile:
if line=='</page>\n':
break
doc.append(line)
curPage=''.join(doc)
pageid=re.search('<id>(.*?)</id>', curPage, re.DOTALL)
pagetitle=re.search('<title>(.*?)</title>', curPage, re.DOTALL)
pagetext=re.search('<text>(.*?)</text>', curPage, re.DOTALL)
if pageid==None or pagetitle==None or pagetext==None:
return {}
d={}
d['id']=pageid.group(1)
d['title']=pagetitle.group(1)
d['text']=pagetext.group(1)
return d
def writeIndexToFile(self):
'''write the inverted index to the file'''
f=open(self.indexFile, 'w')
for term in self.index.iterkeys():
postinglist=[]
for p in self.index[term]:
docID=p[0]
positions=p[1]
postinglist.append(':'.join([str(docID) ,','.join(map(str,positions))]))
print >> f, ''.join((term,'|',';'.join(postinglist)))
f.close()
def getParams(self):
'''get the parameters stopwords file, collection file, and the output index file'''
param=sys.argv
self.stopwordsFile=param[0]
self.collectionFile=param[1]
self.indexFile=param[2]
def createIndex(self):
'''main of the program, creates the index'''
self.getParams()
self.collFile=open(self.collectionFile,'r')
self.getStopwords()
#bug in python garbage collector!
#appending to list becomes O(N) instead of O(1) as the size grows if gc is enabled.
gc.disable()
pagedict={}
pagedict=self.parseCollection()
#main loop creating the index
while pagedict != {}:
lines='\n'.join((pagedict['title'],pagedict['text']))
pageid=int(pagedict['id'])
terms=self.getTerms(lines)
#build the index for the current page
termdictPage={}
for position, term in enumerate(terms):
try:
termdictPage[term][1].append(position)
except:
termdictPage[term]=[pageid, array('I',[position])]
#merge the current page index with the main index
for termpage, postingpage in termdictPage.iteritems():
self.index[termpage].append(postingpage)
pagedict=self.parseCollection()
gc.enable()
self.writeIndexToFile()
if __name__=="__main__":
c=CreateIndex()
c.createIndex()
它说sys.argv中只有一个参数...
其他参数应该怎么出现?
答案 0 :(得分:0)
在getParams
函数中,您可以看到您的代码请求3个参数。
当你打电话给你的节目时:
python your_program.py
# sys.argv[0] = 'your_program.py'
它有一个参数。所以你还需要两个:
python your_program.py arg_1 arg_2
# sys.argv[0] = 'your_program.py'
# sys.argv[1] = 'arg_1'
# sys.argv[2] = 'arg_2