使用马尔可夫链的Chatbot

时间:2016-08-17 15:31:51

标签: python machine-learning artificial-intelligence markov-chains chatbot

各位开发人员好,

我正在尝试使用马尔可夫链建立一个聊天机器人,但遇到了问题。在下面的代码中,我制作了一个从电影脚本中学习的随机句子生成器。问题是:如何让这个句子生成器不再随机生成句子,而是对用户的输入作出响应?我该怎么做呢?这是否与输入/输出训练有关,例如:

In: how are you today
Out: I'm good thanks how are you

这是我的代码。其中大多数函数只是用来把数据写入csv文件,所以不必在意它们。

from collections import defaultdict
import random, itertools, nltk, pandas, csv, string, re, os, time

class Chatbot:
    """Second-order Markov-chain sentence generator trained on a transcript.

    The transcript is a text file whose dialogue lines look like
    ``SPEAKER: text``.  ``learn()`` converts it to a CSV file, splits the
    text into sentences and builds a table that maps every pair of
    consecutive words to the words seen right after that pair.
    ``generate_text()`` then walks that table at random.
    """

    # Marker stored as the successor of the last word pair of a sentence.
    _END = '<end>'

    def __init__(self, name, txt_transcript_filedir, character=None):
        """Store the configuration and print a greeting.

        Args:
            name: Display name used in the greeting.
            txt_transcript_filedir: Path of the transcript text file.
            character: If given, only lines whose speaker equals this value
                are used for training; otherwise every line is used.
        """
        self.name = name
        self.txt_transcript_filedir = txt_transcript_filedir
        self.character = character
        print("Hello my name is " + name + ".")

    def parse_transcript(self):
        """Convert the transcript into a two-column CSV (``person``, ``text``).

        The CSV path is stored in ``self.csv_transcript_filedir``.  Bracketed
        spans such as ``[stage direction]`` are removed, and each line is
        split at the first ``': '`` only, so dialogue that itself contains
        ``': '`` or commas stays in a single ``text`` field.  Lines without
        ``': '`` are written as a single-field row; blank lines are skipped.
        """
        root, ext = os.path.splitext(self.txt_transcript_filedir)
        if ext.lower() != '.txt':
            # Append instead of swapping the extension, so the CSV can never
            # end up with the same path as the transcript and overwrite it.
            root = self.txt_transcript_filedir
        self.csv_transcript_filedir = root + '.csv'

        rows = []
        with open(self.txt_transcript_filedir, encoding='utf-8') as txt_file:
            for line in txt_file:
                # Kept from the original format; punctuation is stripped
                # again during tokenization anyway.
                line = line.replace(', ', ' ')
                line = re.sub(r'\[.*?\]', '', line).rstrip('\r\n')
                if not line.strip():
                    continue
                person, separator, text = line.partition(': ')
                if separator:
                    rows.append([person.strip(), text.strip()])
                else:
                    rows.append([line.strip()])

        # newline='' lets the csv module control line endings itself.
        with open(self.csv_transcript_filedir, 'w', encoding='utf-8',
                  newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['person', 'text'])
            # Going through the writer quotes commas/quotes in the text.
            writer.writerows(rows)

    def tokenize_transcript(self):
        """Read the CSV and store punctuation-free sentences.

        The result is stored in ``self.training_data`` as a list of strings.
        When ``self.character`` is set, only that person's rows are used.
        Rows without any text are ignored.
        """
        # dtype=str keeps purely numeric lines as text for the tokenizer.
        table = pandas.read_csv(self.csv_transcript_filedir, dtype=str)
        if self.character is not None:
            table = table[table['person'] == self.character]
        texts = table['text'].dropna()

        strip_punctuation = str.maketrans('', '', string.punctuation)
        sentences = []
        for text in texts:
            for sent in nltk.sent_tokenize(text):
                if sent.startswith(' '):
                    sent = sent[1:]
                sentences.append(sent.translate(strip_punctuation))

        self.training_data = sentences

    def learn(self):
        """Run the whole pipeline: parse, tokenize, build the word table."""
        self.parse_transcript()
        self.tokenize_transcript()
        self.make_word_dict(self.training_data)

    def make_word_dict(self, text):
        """Build the transition table from an iterable of sentences.

        Args:
            text: Iterable of sentence strings.

        The table is stored in ``self.vocabulary``.  It maps each pair of
        consecutive words to the list of words that followed the pair
        (duplicates kept, so frequent successors are picked more often);
        the final pair of a sentence is followed by the end marker.
        Sentences with fewer than two words contribute nothing.
        """
        word_dict = defaultdict(list)

        for sent in text:
            words = nltk.word_tokenize(sent)
            successors = words[2:] + [self._END]
            for first, second, following in zip(words, words[1:], successors):
                word_dict[(first, second)].append(following)

        self.vocabulary = word_dict

    def _generate_sentence(self):
        """Return one random sentence, terminated by a full stop."""
        first, second = random.choice(list(self.vocabulary))
        words = [first, second]

        while True:
            # .get() avoids inserting empty entries into a defaultdict.
            candidates = self.vocabulary.get((words[-2], words[-1]))
            if not candidates:
                break
            following = random.choice(candidates)
            if following == self._END:
                break
            words.append(following)

        return ' '.join(words) + '.'

    def generate_text(self, num):
        """Generate ``num`` random sentences.

        Args:
            num: Number of sentences to generate.

        Returns:
            The sentences joined by single spaces; an empty string when
            ``num`` is zero or negative.

        Raises:
            IndexError: If the vocabulary is empty.
        """
        return ' '.join(self._generate_sentence() for _ in range(num))

    def say(self, text):
        """Speak ``text`` aloud through the ``say`` command-line tool.

        The text is passed as a single argument without going through a
        shell, so quotes and shell metacharacters in it are harmless.
        """
        import subprocess

        try:
            subprocess.run(['say', '-v', 'Oliver', text], check=False)
        except FileNotFoundError:
            # Best effort, like the original shell call: a missing tool
            # must not crash the bot.
            print("The 'say' command is not available on this system.")


def main():
    """Train a bot on avengers_age_of_ultron.txt and print 100 sentences.

    Only the lines of the character "JARVIS" are used for training, and the
    sentences are requested from the bot one at a time.
    """
    sentence_count = 100

    jarvis = Chatbot("J.A.R.V.I.S", "avengers_age_of_ultron.txt", "JARVIS")
    jarvis.learn()

    for _ in range(sentence_count):
        print(jarvis.generate_text(1))


if __name__ == '__main__':
    main()

0 个答案:

没有答案