So, I'm having a bit of a problem. I have a program that computes the vocabulary size of a given list of questions and answers.
Here is the code that computes that length:
import keras

def lenVocabulary(questions,answers):
    # This section returns each question and answer as a list of words.
    questions_as_word_sequences = list(map(keras.preprocessing.text.text_to_word_sequence,questions))
    answers_as_word_sequences = list(map(keras.preprocessing.text.text_to_word_sequence,answers))
    #print(questions_as_word_sequences[0])
    #print(answers_as_word_sequences[0])
    from numpy import array,column_stack
    #print(questions[0:3])
    print("Question as word sequence: ", questions_as_word_sequences[0])
    questions_as_word_sequences = array(questions_as_word_sequences)
    answers_as_word_sequences = array(answers_as_word_sequences)
    # This section should return a numpy array with only two axes, but it returns one with three instead. Why?
    vocabulary = column_stack((questions_as_word_sequences,answers_as_word_sequences))
    vocabulary = set(vocabulary.flatten())
    #print(vocabulary)
    lenVocabulary = len(vocabulary)
    return lenVocabulary
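For context, this is the two-axis shape I expect column_stack to give me when every tokenized question and answer has the same number of words (toy strings below, not my real data):

import numpy as np

# Toy data, not my real input: every inner list has the same length,
# so np.array builds regular 2-D string arrays.
q = np.array([["do", "you", "sell", "seashells"],
              ["is", "the", "shop", "open"]])   # shape (2, 4)
a = np.array([["yes", "i", "sell", "them"],
              ["no", "it", "is", "closed"]])    # shape (2, 4)

stacked = np.column_stack((q, a))               # shape (2, 8): two axes
print(stacked.shape)
print(len(set(stacked.flatten())))              # number of unique words

That (rows, question_words + answer_words) shape is what I expect to be feeding into set(vocabulary.flatten()).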
Now, when I hard-code the questions and answers, the code above works fine.
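The hard-coded test essentially just calls the function directly, along these lines (the answer string here is a placeholder rather than my exact test data, so the count this sketch prints won't match the 12 in the output below):

# Rough sketch of the hard-coded test; the answer string is a placeholder,
# not the exact one I used.
questions = ["Do you sell seashells by the seashore?"]
answers = ["Yes, I sell seashells by the seashore."]
print(lenVocabulary(questions,answers))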
Output:
Montanas-MacBook-Pro:~ montana$ /var/folders/qp/1n_8zftx3c799d_h5wqj1_4c0000gn/T/com.barebones.bbedit-558742823.535-python3.sh ; exit;
Using TensorFlow backend.
Question as word sequence: ['do', 'you', 'sell', 'seashells', 'by', 'the', 'seashore']
12
However, when I test it in my main program with real data, this is what I get:
Main program output:
Parsing data.
Saint Bernadette Soubirous
Using TensorFlow backend.
Question as word sequence: ['to', 'whom', 'did', 'the', 'virgin', 'mary', 'allegedly', 'appear', 'in', '1858', 'in', 'lourdes', 'france']
Main program code:
import json
import sys
import time
#print("Question n-grams: " + str(questionNgrams))
from collections import Counter
# Get N-gram vocabulary.
# Iterate through all questions and answers, pulling out our inputs
from string import punctuation

file = open("train-v1.1.json")
json = json.loads(file.read().replace('\n', ''))
data = json["data"]
questions = []
answers = []
print("Parsing data.")
import string

def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

for article in data:
    # Articles
    for paragraph in article["paragraphs"]:
        # Paragraphs
        for qas in paragraph["qas"]:
            # Questions/Answers
            # Remove all punctuation and non-English characters
            question = qas["question"]
            #question = "".join([string for string in question if isEnglish(string)])
            # Remove common words that probably won't affect the accuracy of the end result.
            #stop = ["The ","the ","A ","a "]
            #for word in stop:
            #    question = question.replace(word,"")
            answer = qas["answers"][0]["text"]
            #answer = " ".join([string for string in answer if string not in punctuation])
            if (answer != ""):
                questions.append(question)
                answers.append(answer)

# By now, we have a list of questions and answers, each of which is a string.
print(answers[0])
#print(questions_as_word_sequences[0])
#print(answers_as_word_sequences[0])
import skipgramModule
lengthVocabulary = skipgramModule.lenVocabulary(questions,answers)
The problem is that the vocabulary array ends up with 3 axes instead of 2. The questions/answers-as-word-sequences arrays are the same size and shape as the questions/answers arrays, so I don't know what could be causing this error.
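If it helps to narrow this down, a hypothetical helper like the one below mirrors what lenVocabulary does and prints the shape and dtype of each intermediate array, so I can see exactly where the extra axis appears (none of this is in my program yet):

from keras.preprocessing.text import text_to_word_sequence
from numpy import array,column_stack

def debug_shapes(questions,answers):
    # Tokenize and convert exactly as lenVocabulary does, printing shapes/dtypes
    # of the two operands and of the column_stack result.
    q = array(list(map(text_to_word_sequence,questions)))
    a = array(list(map(text_to_word_sequence,answers)))
    print("questions:", q.shape, q.dtype)
    print("answers:  ", a.shape, a.dtype)
    stacked = column_stack((q,a))
    print("stacked:  ", stacked.shape)   # with the real data, this is the array that has 3 axes
    return stacked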