How to find the database IDs for a 2D matrix in Python

Asked: 2019-06-20 10:25:14

Tags: python mysql

I have a Python + Flask assignment that computes document similarity values and displays the similarity data in an interface. My data consists of article documents stored in MySQL.

I have finished the algorithm, and the result is a 2D matrix like this: [(100,0,0),(50,0,1),(70,0,2)... N]

import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string
import itertools
import mysql.connector

mydb = mysql.connector.connect(
    host="localhost",
    user= "root",
    passwd="",
    database="news"
)

mycursor = mydb.cursor()
mycursor.execute("SELECT content FROM news_tb")
x = mycursor.fetchall()
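# Each row from fetchall() is a 1-tuple, so chain(*x) below flattens the rows into a plain list of article strings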

documents = list(itertools.chain(*x))

#1. Tokenization, stopword removal, and stemming
dictOfWords = {}

# Build the stopword set and the stemmer once instead of once per document
listStopword = set(stopwords.words('indonesian'))
factory = StemmerFactory()
stemmer = factory.create_stemmer()

for index, sentence in enumerate(documents):
    # Strip punctuation, tokenize, drop stopwords, then stem what remains
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    tokenizedWords = word_tokenize(sentence)
    wordsFiltered = [t for t in tokenizedWords if t not in listStopword]
    dictOfWords[index] = [stemmer.stem(word) for word in wordsFiltered]

#2. Remove duplicate words from each document
termFrequency = []
for i in range(0, len(documents)):
    termFrequency.append([])
    for wordFreq in dictOfWords[i]:
        if wordFreq not in termFrequency[i]:
            termFrequency[i].append(wordFreq)

# TF: raw count of a term's occurrences in a document
def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

# Logarithmic (sublinear) TF
def sublinear_term_frequency(term, tokenized_document):
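    # e.g. a term that occurs 10 times scores 1 + log10(10) = 2.0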
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return 1 + math.log10(count)

# Augmented (normalized) TF formula
def augmented_term_frequency(term, tokenized_document):
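    # e.g. if the most frequent token appears 4 times and this term appears 2 times: 0.5 + (0.5 * 2)/4 = 0.75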
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))

# IDF formula: log(n/df), where n = number of documents and df = number of documents in which the term appears
def inverse_document_frequencies(tokenized_documents):
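    # e.g. with 3 documents where a term appears in 2 of them: log10(3/2) ≈ 0.18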
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = math.log10(len(tokenized_documents)/(sum(contains_token)))
    return idf_values

def tfidf(documents):
    idf = inverse_document_frequencies(documents)
    tfidf_documents = []
    # Build one TF-IDF vector per document, with one weight per vocabulary term
    for document in documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

def cosine_similarity(vector1, vector2):
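    # e.g. cosine_similarity([1, 0], [1, 1]) = 1 / sqrt(2) ≈ 0.707; identical non-zero vectors give 1.0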
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude

def hasil():
    tfidf_representation = tfidf(termFrequency)
    our_tfidf_comparisons = []
    # Compare every document with every document (including itself and both
    # orders), keeping (similarity, index_0, index_1) tuples
    for count_0, doc_0 in enumerate(tfidf_representation):
        for count_1, doc_1 in enumerate(tfidf_representation):
            our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))
    return our_tfidf_comparisons

print(hasil())

This means:

- article 0 compared to article 0: the value is 100
- article 0 compared to article 1: the value is 50
- article 0 compared to article 2: the value is 70

My question: how do I find the IDs of articles 0-N? For example, if my data has 5 records with IDs 1-5, how can I tell that article 0 is ID 1, article 1 is ID 2, and so on?
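
One way to recover the IDs (a minimal sketch, assuming news_tb has an auto-increment id primary-key column; adjust the column name to your actual schema): select the IDs alongside the content so the row order matches the matrix indices, then translate each index through that list.

# Fetch the primary key together with the content (assumes an `id` column exists)
mycursor.execute("SELECT id, content FROM news_tb")
rows = mycursor.fetchall()

ids = [row[0] for row in rows]        # ids[i] is the database ID of article i
documents = [row[1] for row in rows]  # documents[i] is the text of article i

# Translate each (similarity, i, j) tuple from hasil() into database IDs
for similarity, i, j in hasil():
    print(similarity, ids[i], ids[j])

Because ids and documents come from the same query, article 0 maps to ids[0] (ID 1 in your example), article 1 to ids[1], and so on, even if the IDs are not consecutive.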

Thank you very much.

0 Answers:

No answers yet