I am trying to find resumes that match a skill or location keyword. I have written a Python script that searches the resumes by keyword. All resumes are placed in the project's /dependency/documents/ directory.
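For context, the project layout is assumed to look roughly like this (inferred from the import statements, the traceback path and the relative paths used in the code below):

src/
    main.py
    indexer.py
    ranker.py
    retrieval.py
    tokenizer.py
dependency/
    documents/
        resumes/            <- input resumes (*.pdf)
    indexes/
        inverted_index.json
        length_index.json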
1. Indexer.py
from collections import defaultdict
import glob
import tokenizer
import json

def get_file_names():
    files = []
    for file in glob.glob("../dependency/documents/resumes/*.pdf"):
        files.append(file)
    return files

def make_index(tokens, document_name, index, length):
    for term in set(tokens):
        index[term].append([document_name, tokens.count(term)])
    length[document_name] = len(set(tokens))

def generator():
    resume_files = get_file_names()
    inverted_index = defaultdict(list)
    length_index = defaultdict(list)
    for file in resume_files:
        make_index(tokenizer.tokenize(file), file, inverted_index, length_index)
    write(inverted_index, length_index)
    print "Indexes generated"

def write(inverted_index, length_index):
    inv_index_file = open("../dependency/indexes/inverted_index.json", "w")
    json.dump(inverted_index, inv_index_file)
    length_index_file = open("../dependency/indexes/length_index.json", "w")
    json.dump(length_index, length_index_file)
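To illustrate what the indexer writes out (file names and counts below are made up): the inverted index maps each lower-cased term to a list of [document, term frequency] pairs, and the length index maps each document to its number of unique tokens.

inverted_index.json (hypothetical contents):
{"python": [["../dependency/documents/resumes/alice.pdf", 3]],
 "excel": [["../dependency/documents/resumes/alice.pdf", 1], ["../dependency/documents/resumes/bob.pdf", 2]]}

length_index.json (hypothetical contents):
{"../dependency/documents/resumes/alice.pdf": 412, "../dependency/documents/resumes/bob.pdf": 378}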
2. Ranker.py
import json
import operator
from collections import defaultdict
from retrieval import BM25

# get average document length
def get_avdl(length_index):
    corpus_length = 0
    for document in length_index:
        corpus_length += length_index[document]
    return float(corpus_length) / float(len(length_index))

def search(query):
    inv_index_file = open("../dependency/indexes/inverted_index.json", "r")
    inverted_index = json.load(inv_index_file)
    length_index_file = open("../dependency/indexes/length_index.json", "r")
    length_index = json.load(length_index_file)
    scores = defaultdict(list)
    query_tokens = query.split()
    for token in query_tokens:
        for entry in inverted_index[token]:
            scores[entry[0]] = BM25(length_index[entry[0]], get_avdl(length_index), len(inverted_index[token]), len(length_index), entry[1], 1, 0)
    return sorted(scores.items(), key=operator.itemgetter(1))
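As a quick sanity check of get_avdl, with made-up numbers:

# hypothetical length index, just to illustrate get_avdl
length_index = {"a.pdf": 120, "b.pdf": 80}
print get_avdl(length_index)   # (120 + 80) / 2.0 = 100.0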
3. Retriever.py
# coding: utf-8
from math import log

'''Using the following formula to calculate BM25
((k3 + 1)q)/((k3 + q)) * ((k1 + 1)f)/((K + f)) * log((r + 0.5)(N - n - R + r + 0.5))/((n - r + 0.5)(R - r + 0.5))
REFERENCE: https://xapian.org/docs/bm25.html
'''

# DEFINING CONSTANTS
k1 = 1.2
b = 0.75
k2 = 100
R = 0  # Since no relevance info is available

# MAIN METHOD
def BM25(docLen, avDocLen, n, N, f, q, r):
    p1 = ((k2 + 1) * q) / (k2 + q)
    p2 = ((k1 + 1) * f) / (getK(docLen, avDocLen) + f)
    p3 = log((r + 0.5) * (N - n - R + r + 0.5)) / ((n - r + 0.5) * (R - r + 0.5))
    return p1 * p2 * p3

def getK(docLen, avDocLen):
    return k1 * ((1 - b) + b * (float(docLen) / float(avDocLen)))
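For readability, the weight formula quoted in the docstring above (as given in the linked Xapian documentation) can be written with explicit grouping as

\[
w \;=\; \frac{(k_3 + 1)\,q}{k_3 + q}\;\cdot\;\frac{(k_1 + 1)\,f}{K + f}\;\cdot\;\log\frac{(r + 0.5)\,(N - n - R + r + 0.5)}{(n - r + 0.5)\,(R - r + 0.5)},
\qquad
K = k_1\Bigl((1 - b) + b\cdot\tfrac{docLen}{avDocLen}\Bigr)
\]

where f is the within-document frequency of the term, q its within-query frequency, n the number of documents containing the term, N the total number of documents, and r, R the relevance counts (both 0 here). In the code above, the constant k2 plays the role of k3 and K is computed by getK().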
4. Tokenizer.py
# coding: utf-8
import pyPdf
from nltk.corpus import stopwords

def tokenize(path):
    # open PDF
    pdf = pyPdf.PdfFileReader(open(str(path), "rb"))
    stopword_list = list(stopwords.words("english"))
    # read PDF file in a list
    pdf_content = []
    for page in pdf.pages:
        pdf_content.append(page.extractText())
    # tokenize all the words in the resume
    tokenize = []
    for line in pdf_content:
        tokenize = filter(None, (line.split(" ")))
    # remove punctuations and case-fold
    no_punctuations = []
    for token in tokenize:
        no_punctuations.append(token.rstrip(",:|.-").lower())
    # remove stop words
    without_stop_words = []
    for word in filter(None, no_punctuations):
        if word not in stopword_list:
            without_stop_words.append(word)
    return without_stop_words
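As a small, hypothetical illustration of the punctuation stripping and case-folding step above:

>>> "Python,".rstrip(",:|.-").lower()
'python'
>>> "experience:".rstrip(",:|.-").lower()
'experience'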
5. main.py
from indexer import generator
from ranker import search

generator()
option = ""
while option != "q":
    print "\n"
    print "Enter search query"
    keywords = raw_input(":: ")
    results = search(keywords)
    print "\nThe Matching Resumes Are:"
    for result in results:
        print result[0]
After running the main.py file, I get the error shown below:

python main.py
Indexes generated
Enter search query
:: Excel
Traceback (most recent call last):
  File "main.py", line 10, in <module>
    results = search(keywords)
  File "/home/new/Resume-Matcher-master/src/ranker.py", line 23, in search
    for entry in inverted_index[token]:
KeyError: 'Excel'
Any help is appreciated.