NLTK Ngrams给出相同的概率

时间:2019-03-02 19:22:32

标签: python machine-learning nltk

我在下面编写了用于计算不同单词和句子的概率的代码,但是即使我更改了要评估的文本,它给出的仍然是相同的数字,我不知道为什么!(我复制了完整的代码,因此您可以直接复制并运行它。)

using System;
using System.Collections.Generic;
 using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace T_2060_ParserEstadoDeCuenta
{
 class Program
 {
    /// <summary>
    /// Reads the account-statement CSV file, converts each data row into a
    /// <c>clsEstadoCuenta</c> record and prints a tab-separated listing of the
    /// parsed rows to the console.
    /// </summary>
    /// <remarks>
    /// The first line of the file (index 0) is skipped. A row that cannot be
    /// parsed is reported on the console and skipped; processing continues
    /// with the next row. The method waits for a line of console input before
    /// returning.
    /// </remarks>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        Console.WriteLine("Parsing the csv file");

        List<clsEstadoCuenta> resp = new List<clsEstadoCuenta>();
        var lines = File.ReadAllLines("d:\\ztemp\\parseEstcta.csv");

        // Start at 1: line 0 is skipped (presumably a header row - confirm against the file).
        for (int i = 1; i < lines.Length; i++)
        {
            try
            {
                // Plain comma split: quoted fields that contain commas are not supported.
                var campos = lines[i].Split(',');
                clsEstadoCuenta nR = new clsEstadoCuenta();

                nR.NumeroCuenta = (String.IsNullOrEmpty(campos[1])) ? "" : campos[1];
                nR.CodigoPais = 504;
                nR.Banco = "Fichosa";
                nR.Moneda = (String.IsNullOrEmpty(campos[2])) ? "" : campos[2];
                nR.TasaCambio = 24.6;

                // Column 0 is split on '/' and read as day/month/year.
                var tmpFecha = campos[0].Split('/');
                nR.FechaTransaccion = new DateTime(Convert.ToInt32(tmpFecha[2]), Convert.ToInt32(tmpFecha[1]), Convert.ToInt32(tmpFecha[0]));

                // NOTE(review): Descripcion and Referencia reuse columns 1 and 2 (the same
                // columns as NumeroCuenta and Moneda) and column 3 is never read. This looks
                // like a copy/paste slip, but the CSV layout is not visible here - confirm.
                nR.Descripcion = (String.IsNullOrEmpty(campos[1])) ? "" : campos[1];
                nR.Referencia = (String.IsNullOrEmpty(campos[2])) ? "" : campos[2];

                // Empty amount columns count as 0. Convert.ToDouble uses the current culture.
                nR.Debito = (String.IsNullOrEmpty(campos[4])) ? 0 : Convert.ToDouble(campos[4]);
                nR.Credito = (String.IsNullOrEmpty(campos[5])) ? 0 : Convert.ToDouble(campos[5]);
                nR.Payee = "A";

                // Fix: the parsed record was never stored, so the result list stayed empty.
                resp.Add(nR);
            }
            catch (Exception ex)
            {
                // Deliberate best effort: report the bad row (i is the zero-based index
                // into the file's lines) and carry on with the next one.
                Console.WriteLine("error on line {0} : {1}", i, ex.Message);
            }
        }
        Console.WriteLine("Parsing has ended, we have {0} rows \n", resp.Count);

        foreach (var item in resp)
        {
            // Fix: the separator between Banco and Moneda was the literal letter "t".
            Console.WriteLine(item.NumeroCuenta + "\t" + item.CodigoPais + "\t" + item.Banco + "\t" + item.Moneda + "\t" + item.Debito);
        }

        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }

    /// <summary>
    /// One account-statement entry: plain read/write data properties plus two
    /// read-only values derived from them.
    /// </summary>
    class clsEstadoCuenta
    {
        // --- Identification of the account and the institution ---

        /// <summary>Account number.</summary>
        public string NumeroCuenta { get; set; }

        /// <summary>Numeric country code.</summary>
        public int CodigoPais { get; set; }

        /// <summary>Bank name.</summary>
        public string Banco { get; set; }

        /// <summary>Currency of the entry.</summary>
        public string Moneda { get; set; }

        /// <summary>Exchange rate; divisor used by <see cref="ValorDolares"/>.</summary>
        public double TasaCambio { get; set; }

        // --- Amounts and date ---

        /// <summary>Debit amount; subtracted in <see cref="ValorLocal"/>.</summary>
        public double Debito { get; set; }

        /// <summary>Credit amount; added in <see cref="ValorLocal"/>.</summary>
        public double Credito { get; set; }

        /// <summary>Transaction date.</summary>
        public DateTime FechaTransaccion { get; set; }

        // --- Descriptive and classification fields ---

        /// <summary>Payee.</summary>
        public string Payee { get; set; }

        /// <summary>Description text.</summary>
        public string Descripcion { get; set; }

        /// <summary>Reference text.</summary>
        public string Referencia { get; set; }

        /// <summary>Bank code.</summary>
        public string CodigoBancario { get; set; }

        /// <summary>Category.</summary>
        public string Categoria { get; set; }

        /// <summary>Sector.</summary>
        public string Sector { get; set; }

        /// <summary>Company name.</summary>
        public string NombreEmpresa { get; set; }

        // --- Derived, read-only values ---

        /// <summary>
        /// Net amount, computed on every read as <c>Credito - Debito</c>.
        /// </summary>
        public double ValorLocal
        {
            get { return Credito - Debito; }
        }

        /// <summary>
        /// <see cref="ValorLocal"/> divided by <see cref="TasaCambio"/>, computed
        /// on every read. No guard is applied for a zero exchange rate.
        /// </summary>
        public double ValorDolares
        {
            get { return ValorLocal / TasaCambio; }
        }
    }
  }
}

from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm import Vocabulary
from nltk.lm import MLE
from nltk.lm import Laplace
from nltk.lm.preprocessing import flatten
import codecs
import re
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm.preprocessing import padded_everygrams
from nltk.tokenize import word_tokenize
from nltk import ngrams

s = "The Internet may be overflowing with new technology but crime in cyberspace is still of the old-fashioned variety."\
    "The National Consumers League said Wednesday that the most popular scam on the Internet was the pyramid scheme, in which early investors in a bogus fund are paid off with deposits of later investors."
max_len_ngram = 2;
paddedLine =list(pad_both_ends(word_tokenize(s),n=2));
vocab = Vocabulary(word_tokenize(s),1);
train = [everygrams(paddedLine,max_len = max_len_ngram)];
lm = MLE(max_len_ngram);
lm.fit(train,vocab);
print(lm.score("WHY?"));

无论传入 lm.score 的文本是什么,它总是返回 0.03636363636363636,这绝对是错误的!我该如何解决?另外,我怎样才能对完整的句子(而不是单个单词)进行评分?

0 个答案:

没有答案