TypeError:normalize()缺少1个必需的位置参数:“ text”

时间:2019-06-22 00:06:10

标签: python python-3.x pyqt5

我正在一个需要文件的项目中,经过指定的目录并以%返回它们的相似性...我已经成功了,但是问题是将其转换为GUI,原始代码有效,但是当在GUI(PyQt5)中实现时,会引发TypeError: normalize() missing 1 required positional argument: 'text'错误...

这是原始代码

import docx
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path
from TV1 import App


def getText(filename):
    doc = docx.Document(filename)
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
    print('\n'.join(fullText))
    return '\n'.join(fullText)


# nltk.download('punkt')  # if necessary...

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)


def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


'''remove punctuation, lowercase, stem'''


def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))


vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    text11 = text1
    text22 = open(text2, 'r', encoding='utf-8', errors='ignore').read()
    tfidf = vectorizer.fit_transform([text11, text22])

    n = (((tfidf * tfidf.T) * 100).A)[0, 1]
    return '%.3f%% similarity' % n

file = 'BB.docx'
spath = r'C:\Users\Black Laptop\Desktop\Work'

print('---------------------------------')

text = getText(file)
if os.path.exists(spath):
    for path in Path(spath).iterdir():
        print(path)
        print(os.path.basename(path))
        print(cosine_sim(text, path))
        print('')

GUI ...

import sys

from PyQt5.QtCore import (QDate, QDateTime, QRegExp, QSortFilterProxyModel, Qt,
                          QTime)
from PyQt5.QtGui import QStandardItemModel
from PyQt5.QtWidgets import (QApplication, QCheckBox, QComboBox, QGridLayout,
                             QGroupBox, QHBoxLayout, QLabel, QLineEdit, QTreeView, QVBoxLayout,
                             QWidget, QTableView, QTableWidget)

import docx
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from pathlib import Path


class App(QWidget):
    FILES, SIMILAR = range(2)

    def __init__(self):
        super().__init__()
        self.title = 'Plagiarism Checker'
        self.left = 50
        self.top = 50
        self.width = 640
        self.height = 240
        # self.initUI()
        self.one()

    def initUI(self):[...]

    def getText(self, filename):
        doc = docx.Document(filename)
        fullText = []
        for para in doc.paragraphs:
            fullText.append(para.text)
        print('\n'.join(fullText))
        return '\n'.join(fullText)

    # nltk.download('punkt')  # if necessary...

    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(self, tokens):
        return [self.stemmer.stem(item) for item in tokens]

    '''remove punctuation, lowercase, stem'''

    def normalize(self, text):
        return self.stem_tokens(nltk.word_tokenize(text.lower().translate(self.remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

    def cosine_sim(self, text1, text2):
        text11 = text1
        text22 = open(text2, 'r', encoding='utf-8', errors='ignore').read()
        tfidf = self.vectorizer.fit_transform([text11, text22])

        n = (((tfidf * tfidf.T) * 100).A)[0, 1]
        return '%.3f%% similarity' % n

    def one(self):
        file = 'BB.docx'
        spath = r'C:\Users\Black Laptop\Desktop\Work'

        print('---------------------------------')

        text = self.getText(file)
        if os.path.exists(spath):
            for path in Path(spath).iterdir():
                print(path)
                print(os.path.basename(path))
                print(self.cosine_sim(text, path))
                print('')


if __name__ == '__main__':
    app = QApplication(sys.argv)
    ex = App()
    sys.exit(app.exec_())

错误..

Traceback (most recent call last):
  File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 117, in <module>
    ex = App()
  File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 29, in __init__
    self.one()
  File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 111, in one
    print(self.cosine_sim(text, path))
  File "C:/Users/Black Laptop/PycharmProjects/StringPatternMatcher/CT.py", line 93, in cosine_sim
    tfidf = self.vectorizer.fit_transform([text11, text22])
  File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 1652, in fit_transform
    X = super().fit_transform(raw_documents)
  File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 1058, in fit_transform
    self.fixed_vocabulary_)
  File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 970, in _count_vocab
    for feature in analyze(doc):
  File "C:\Program Files (x86)\Python36-32\lib\site-packages\sklearn\feature_extraction\text.py", line 352, in <lambda>
    tokenize(preprocess(self.decode(doc))), stop_words)
TypeError: normalize() missing 1 required positional argument: 'text'

Process finished with exit code 1

任何帮助将不胜感激...谢谢

0 个答案:

没有答案