I'm working on a project that takes a document, walks through a specified directory, and reports each file's similarity to it as a percentage... I have that part working, but the problem is converting it into a GUI: the original script runs fine, yet when the same code is implemented inside the GUI (PyQt5) class it raises a

TypeError: normalize() missing 1 required positional argument: 'text'

error...
Here is the original code:
import docx
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
from TV1 import App
def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    print('\n'.join(fullText))
    return '\n'.join(fullText)


# nltk.download('punkt')  # if necessary...
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)


def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    text11 = text1
    text22 = open(text2, 'r', encoding='utf-8', errors='ignore').read()
    tfidf = vectorizer.fit_transform([text11, text22])
    n = (((tfidf * tfidf.T) * 100).A)[0, 1]
    return '%.3f%% similarity' % n


file = 'BB.docx'
spath = r'C:\Users\Black Laptop\Desktop\Work'

print('---------------------------------')
text = getText(file)

if os.path.exists(spath):
    for path in Path(spath).iterdir():
        print(path)
        print(os.path.basename(path))
        print(cosine_sim(text, path))
        print('')
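For context, the similarity figure is just the cosine similarity of the two TF-IDF vectors: fit_transform returns L2-normalised rows, so the [0, 1] entry of tfidf * tfidf.T is the cosine of the angle between the two documents. A tiny standalone check of that formula (plain sklearn, nothing project-specific):

from sklearn.feature_extraction.text import TfidfVectorizer

# two short example documents; with the default L2 norm, the off-diagonal
# entry of tfidf * tfidf.T is their cosine similarity
docs = ["the quick brown fox", "the quick brown fox jumps over the lazy dog"]
tfidf = TfidfVectorizer().fit_transform(docs)
print('%.3f%% similarity' % (((tfidf * tfidf.T) * 100).A[0, 1]))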
And here is the GUI version ...
import sys
from PyQt5.QtCore import (QDate, QDateTime, QRegExp, QSortFilterProxyModel, Qt,
QTime)
from PyQt5.QtGui import QStandardItemModel
from PyQt5.QtWidgets import (QApplication, QCheckBox, QComboBox, QGridLayout,
QGroupBox, QHBoxLayout, QLabel, QLineEdit, QTreeView, QVBoxLayout,
QWidget, QTableView, QTableWidget)
import docx
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
class App(QWidget):
    FILES, SIMILAR = range(2)

    def __init__(self):
        super().__init__()
        self.title = 'Plagiarism Checker'
        self.left = 50
        self.top = 50
        self.width = 640
        self.height = 240
        # self.initUI()
        self.one()

    def initUI(self): [...]

    def getText(self, filename):
        doc = docx.Document(filename)
        fullText = []
        for para in doc.paragraphs:
            fullText.append(para.text)
        print('\n'.join(fullText))
        return '\n'.join(fullText)

    # nltk.download('punkt')  # if necessary...
    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(self, tokens):
        return [self.stemmer.stem(item) for item in tokens]

    '''remove punctuation, lowercase, stem'''
    def normalize(self, text):
        return self.stem_tokens(nltk.word_tokenize(text.lower().translate(self.remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

    def cosine_sim(self, text1, text2):
        text11 = text1
        text22 = open(text2, 'r', encoding='utf-8', errors='ignore').read()
        tfidf = self.vectorizer.fit_transform([text11, text22])
        n = (((tfidf * tfidf.T) * 100).A)[0, 1]
        return '%.3f%% similarity' % n

    def one(self):
        file = 'BB.docx'
        spath = r'C:\Users\Black Laptop\Desktop\Work'
        print('---------------------------------')
        text = self.getText(file)
        if os.path.exists(spath):
            for path in Path(spath).iterdir():
                print(path)
                print(os.path.basename(path))
                print(self.cosine_sim(text, path))
                print('')


if __name__ == '__main__':
    app = QApplication(sys.argv)
    ex = App()
    sys.exit(app.exec_())
The error ...
Traceback (most recent call last):
File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 117, in <module>
ex = App()
File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 29, in __init__
self.one()
File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 111, in one
print(self.cosine_sim(text, path))
File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 93, in cosine_sim
tfidf = self.vectorizer.fit_transform([text11, text22])
File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 1652, in fit_transform
X = super().fit_transform(raw_documents)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 1058, in fit_transform
self.fixed_vocabulary_)
File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 970, in _count_vocab
for feature in analyze(doc):
File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 352, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
TypeError: normalize() missing 1 required positional argument: 'text'
Process finished with exit code 1
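In case it helps, I tried to reproduce the error outside of PyQt5 and sklearn. The minimal sketch below (FakeVectorizer and Demo are made-up names, not from my project) raises the exact same TypeError, so I suspect it has to do with how tokenizer=normalize is captured inside the class body, but I don't know what the correct fix is for the GUI version:

class FakeVectorizer:
    # stands in for TfidfVectorizer: it stores whatever callable it is given
    # and later calls it with a single argument (the document text)
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def fit_transform(self, docs):
        return [self.tokenizer(doc) for doc in docs]


class Demo:
    def normalize(self, text):
        return text.lower().split()

    # at this point `normalize` is still a plain function, not a bound method,
    # so FakeVectorizer ends up calling it with only the document string and
    # the `text` parameter is never filled in
    vectorizer = FakeVectorizer(tokenizer=normalize)


Demo().vectorizer.fit_transform(['hello world'])
# TypeError: normalize() missing 1 required positional argument: 'text'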
Any help would be appreciated... Thank you.