Searching resumes based on a search keyword in Python

Time: 2018-08-28 09:34:56

Tags: python python-3.x nltk

I am trying to find resumes that match a search keyword, such as a skill or a location. I wrote a Python script to search the resumes by keyword. All resumes are imported into the project's /dependency/documents/ directory.

1. Indexer.py

from collections import defaultdict
import glob
import tokenizer
import json


def get_file_names():
    files = []
    for file in glob.glob("../dependency/documents/resumes/*.pdf"):
        files.append(file)
    return files


def make_index(tokens, document_name, index, length):
    # for each unique term, record the document name and the term's frequency;
    # the document's "length" is stored as its number of unique terms
    for term in set(tokens):
        index[term].append([document_name, tokens.count(term)])
        length[document_name] = len(set(tokens))


def generator():
    resume_files = get_file_names()
    inverted_index = defaultdict(list)
    length_index = defaultdict(list)
    for file in resume_files:
        make_index(tokenizer.tokenize(file), file, inverted_index, length_index)
    write(inverted_index,length_index)
    print "Indexes generated"


def write(inverted_index,length_index):
    inv_index_file = open("../dependency/indexes/inverted_index.json","w")
    json.dump(inverted_index,inv_index_file)

    length_index_file = open("../dependency/indexes/length_index.json","w")
    json.dump(length_index,length_index_file)
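
For readers unfamiliar with the data layout, the following is a minimal, self-contained sketch of the two structures that make_index fills; the file names and tokens are made up and are not part of the project:

    from collections import defaultdict

    # Hypothetical documents represented as token lists (instead of parsed PDFs).
    docs = {
        "resume_a.pdf": ["python", "excel", "python"],
        "resume_b.pdf": ["java", "excel"],
    }

    inverted_index = defaultdict(list)
    length_index = {}
    for name, tokens in docs.items():
        for term in set(tokens):
            # one [document, term_frequency] pair per document containing the term
            inverted_index[term].append([name, tokens.count(term)])
        # a document's "length" here means its number of unique terms
        length_index[name] = len(set(tokens))

    # inverted_index["excel"] now holds one ["resume_?.pdf", 1] entry per resume,
    # inverted_index["python"] holds [["resume_a.pdf", 2]], and
    # length_index == {"resume_a.pdf": 2, "resume_b.pdf": 2}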

2. Ranker.py

import json
import operator
from collections import defaultdict
from retrieval import BM25

# get average document length
def get_avdl(length_index):
    corpus_length = 0
    for document in length_index:
        corpus_length += length_index[document]
    return float(corpus_length) / float(len(length_index))

def search(query):
    inv_index_file = open("../dependency/indexes/inverted_index.json","r")
    inverted_index = json.load(inv_index_file)

    length_index_file = open("../dependency/indexes/length_index.json","r")
    length_index = json.load(length_index_file)

    scores = defaultdict(list)
    query_tokens = query.split()
    for token in query_tokens:
        for entry in inverted_index[token]:
            scores[entry[0]] = BM25(length_index[entry[0]],get_avdl(length_index),len(inverted_index[token]),len(length_index),entry[1],1,0)
    return sorted(scores.items(),key=operator.itemgetter(1))
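
One behavioral detail worth noting: sorted(...) with operator.itemgetter(1) returns the (document, score) pairs in ascending score order, so the best match ends up last. A small standalone sketch with invented scores (not output produced by the project):

    import operator

    # Invented scores keyed by invented file names.
    scores = {"resume_a.pdf": 0.8, "resume_b.pdf": 2.3, "resume_c.pdf": 1.1}

    # Same call pattern as in search(): ascending by score value.
    print(sorted(scores.items(), key=operator.itemgetter(1)))
    # [('resume_a.pdf', 0.8), ('resume_c.pdf', 1.1), ('resume_b.pdf', 2.3)]

    # reverse=True would put the highest-scoring resume first.
    print(sorted(scores.items(), key=operator.itemgetter(1), reverse=True))
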
3. Retriever.py

    # coding: utf-8

    from math import log
    
    '''Using the following formula to calculate BM25:
    ((k3 + 1)q / (k3 + q)) * ((k1 + 1)f / (K + f)) * log(((r + 0.5)(N - n - R + r + 0.5)) / ((n - r + 0.5)(R - r + 0.5)))
    REFERENCE: https://xapian.org/docs/bm25.html
    '''
    
    # DEFINING CONSTANTS
    
    k1 = 1.2
    b = 0.75
    k2 = 100  # plays the role of k3 (query-frequency weight) in the reference formula
    R = 0 #Since no relevance info is available
    
    # MAIN METHOD
    
    def BM25(docLen, avDocLen, n, N, f, q, r):
        p1 = ((k2 + 1) * q) / (k2 + q)
        p2 = ((k1 + 1) * f) / (getK(docLen, avDocLen) + f)
        p3 = log(((r + 0.5) * (N - n - R + r + 0.5)) / ((n - r + 0.5) * (R - r + 0.5)))
        return p1 * p2 * p3
    
    def getK(docLen, avDocLen):
        return k1 * ((1 - b) + b * (float(docLen) / float(avDocLen)))
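
For reference, a hypothetical call showing what each BM25 argument stands for (all numbers are invented, and this assumes the file is importable as the retrieval module used by ranker.py):

    from retrieval import BM25

    score = BM25(docLen=120,    # length of the document being scored
                 avDocLen=100,  # average document length over the corpus
                 n=3,           # number of documents containing the term
                 N=10,          # total number of documents in the corpus
                 f=2,           # term frequency inside the document
                 q=1,           # term frequency inside the query
                 r=0)           # relevant documents containing the term (none known)
    print(score)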
    
4. Tokenizer.py

    # coding: utf-8

    import pyPdf
    from nltk.corpus import stopwords
    
    def tokenize(path):
    # open PDF
        pdf = pyPdf.PdfFileReader(open(str(path),"rb"))
        stopword_list = list(stopwords.words("english"))
    
    # read PDF file in a list
        pdf_content = []
        for page in pdf.pages:
            pdf_content.append(page.extractText())
    
    # tokenize all the words in the resume
        tokenize = []
        for line in pdf_content:
            tokenize.extend(filter(None, (line.split(" "))))
    
    # remove punctuations and case-fold
        no_punctuations = []
        for token in tokenize:
            no_punctuations.append(token.rstrip(",:|.-").lower())
    
    # remove stop words
        without_stop_words = []
    
        for word in filter(None, no_punctuations):
            if word not in stopword_list:
                without_stop_words.append(word)
    
        return without_stop_words
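
Because the PDF reading step makes tokenize hard to try in isolation, here is a rough sketch of what the cleanup steps do to one invented line of text (the sample sentence is not from any real resume):

    from nltk.corpus import stopwords

    line = "Expert in Python, Excel and the Microsoft stack."
    stopword_list = list(stopwords.words("english"))

    tokens = filter(None, line.split(" "))
    cleaned = [token.rstrip(",:|.-").lower() for token in tokens]
    result = [word for word in cleaned if word and word not in stopword_list]

    print(result)
    # ['expert', 'python', 'excel', 'microsoft', 'stack']
    # note that every stored token is lower-cased, e.g. "Excel" becomes "excel"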
    
5. main.py

    from indexer import generator
    from ranker import search
    
    generator()
    option = ""
    while option != "q":
        print "\n"
        print "Enter search query"
        keywords = raw_input(":: ")
        results = search(keywords)
        print "\nThe Matching Resumes Are:"
        for result in results:
            print result[0]
    

After running main.py, I get the following error:

python main.py

Indexes generated

Enter search query
:: Excel
Traceback (most recent call last):
  File "main.py", line 10, in <module>
    results = search(keywords)
  File "/home/new/Resume-Matcher-master/src/ranker.py", line 23, in search
    for entry in inverted_index[token]:
KeyError: 'Excel'
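
To pin down where the exception comes from: json.load/json.loads return a plain dict rather than the defaultdict used while indexing, so looking up a query token that is not present as a key raises KeyError. A standalone sketch with made-up index content:

    import json

    # Made-up index content; the real file is produced by indexer.py.
    inverted_index = json.loads('{"excel": [["resume_a.pdf", 1]]}')

    print(inverted_index["excel"])   # works: the stored key is lower-case
    print(inverted_index["Excel"])   # raises KeyError: 'Excel'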

Any help is appreciated.

0 Answers:

No answers yet.