尝试使用Term-Document-Incidence-Matrix回答一些布尔查询

时间:2013-05-09 18:00:20

标签: c# winforms information-retrieval

我尝试以这些方式回复一些简单的布尔查询NOT x NOT y NOT zx AND y AND z,并且这个x OR y OR z x,y,z也是一些单词,其中任何一个都属于不同的{ {1}}或者所有这些都属于同一个file.txt,无论如何。

我写过file.txt

它必须能够回答一个布尔查询,我为class TermDocMatrix准备了一些方法用于这些目的,但它不起作用。我甚至一步一步调试代码,我意识到循环没有转向。我不知道为什么,代码似乎很好。

你可以在这里看到代码:

class TermDocMatrix{ }

您现在需要了解另一个名为class TermDocMatrix { //stores distinct terms public HashSet<string> distinctTerm = new HashSet<string>(); //stores document id and its contents without splitting public Dictionary<int, string> documentContentList = new Dictionary<int, string>(); //stores document and its terms collection public Dictionary<string, List<string>> documentCollection = new Dictionary<string, List<string>>(); public Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>(); //stop words collection public List<string> stopWords = new List<string> { "on", "of", "The", "an", "a", "in" }; //boolean operators list public string[] booleanOperator = new string[] { "AND", "OR", "NOT" }; private string _FileName = "words"; public string _Path = ""; int _lastDocNum = 0; public TermDocMatrix(string IndexPath,string FileName) { if (_Path.EndsWith("\\") == false) _Path += "\\"; if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath); LogManager.Configure(_Path + _FileName + ".txt", false); // read all files LoadFiles(); } private void LoadFiles() { int count = 0; if (File.Exists(_Path + _FileName + ".txt") == false) return; // load words string b = File.ReadAllText(_Path + _FileName + ".txt"); String[] TermsCollection = RemoveStopsWords(b.ToUpper().Split(' ')); foreach (string term in TermsCollection) { //prepeare distinct terms collection //remove stop words if (!stopWords.Contains(term)) { distinctTerm.Add(term); } } //add document and their terms collection documentCollection.Add(_FileName, TermsCollection.ToList()); //add document and its content for displaying the search result documentContentList.Add(count, b); count++; } public string ProcessFiles(string query) { termDocumentIncidenceMatrix = GetTermDocumentIncidenceMatrix(distinctTerm, documentCollection); do { List<int> lst = ProcessQuery(query); int count = 0; if (lst != null) { foreach (int a in lst) { if (a == 1) { return documentContentList[count]; } count++; } } else { return "No search result found"; } } while (1 == 1); } public int WordCount() { return documentCollection.Count; } public int DocumentCount { get { return _lastDocNum; } } private void FilterQueryTerm(ref string[] str) { List<string> _queryTerm = new List<string>(); foreach (string queryTerm in str) { if (queryTerm.ToUpper().Equals("BUT") || termDocumentIncidenceMatrix.ContainsKey(queryTerm.ToUpper()) || booleanOperator.Contains(queryTerm)) { _queryTerm.Add(queryTerm); } } str = _queryTerm.ToArray(); } //prepares Term Document Incidence Matrix public Dictionary<string, List<int>> GetTermDocumentIncidenceMatrix(HashSet<string> distinctTerms, Dictionary<string, List<string>> documentCollection) { Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>(); List<int> incidenceVector = new List<int>(); foreach (string term in distinctTerms) { //incidence vector for each terms incidenceVector = new List<int>(); foreach (KeyValuePair<string, List<string>> p in documentCollection) { if (p.Value.Contains(term)) { //document contains the term incidenceVector.Add(1); } else { //document do not contains the term incidenceVector.Add(0); } } termDocumentIncidenceMatrix.Add(term, incidenceVector); } return termDocumentIncidenceMatrix; } //removes all stop words public string[] RemoveStopsWords(string[] str) { List<string> terms = new List<string>(); foreach (string term in str) { if (!stopWords.Contains(term)) { terms.Add(term); } } return terms.ToArray(); } //process the boolean query public List<int> ProcessQuery(string query) { //query boolean operator string bitWiseOp = string.Empty; string[] queryTerm = RemoveStopsWords(query.ToUpper().Split(' ')); //remove query term that doesnot appears on document collection FilterQueryTerm(ref queryTerm); List<int> previousTermIncidenceV = null; List<int> nextTermsIncidenceV = null; //holds the bitwise operation result List<int> resultSet = null; //suppose on query X AND Y, X is previousTerm term and Y is nextTerm Boolean hasPreviousTerm = false; Boolean hasNotOperation = false; foreach (string term in queryTerm) { //is a term if (!booleanOperator.Contains(term) && !term.Equals("BUT")) { //query case: structure AND NOT analysis if (hasNotOperation) { if (hasPreviousTerm) { nextTermsIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV); } //query case: eg.NOT analysis else { previousTermIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV); resultSet = previousTermIncidenceV; } hasNotOperation = false; } else if (!hasPreviousTerm) { previousTermIncidenceV = GetTermIncidenceVector(term); resultSet = previousTermIncidenceV; hasPreviousTerm = true; } else { nextTermsIncidenceV = GetTermIncidenceVector(term); } } else if (term.Equals("NOT")) { //indicates that the term in the next iteration should be complemented. hasNotOperation = true; } else { //'BUT' also should be evaluated as AND eg. structure BUT NOT semantic should be evaluated as structure AND NOT semantic if (term.Equals("BUT")) { bitWiseOp = "AND"; } else bitWiseOp = term; } if (nextTermsIncidenceV != null && !hasNotOperation) { resultSet = ProcessBooleanOperator(bitWiseOp, previousTermIncidenceV, nextTermsIncidenceV); previousTermIncidenceV = resultSet; hasPreviousTerm = true; nextTermsIncidenceV = null; } } return resultSet; } //Process Boolean operators public List<int> ProcessBooleanOperator(string op, List<int> previousTermV, List<int> nextTermV) { List<int> resultSet = new List<int>(); if (op.Equals("NOT")) { foreach (int a in previousTermV) { if (a == 1) { resultSet.Add(0); } else { resultSet.Add(1); } } } else if (op.ToUpper().Equals("AND")) //bitwise AND operation { for (int a = 0; a < previousTermV.Count; a++) { if (previousTermV[a] == 1 && nextTermV[a] == 1) { resultSet.Add(1); } else { resultSet.Add(0); } } } else if (op.ToUpper().Equals("OR")) //bitwise OR operation { for (int a = 0; a < previousTermV.Count; a++) { if (previousTermV[a] == 0 && nextTermV[a] == 0) { resultSet.Add(0); } else { resultSet.Add(1); } } } return resultSet; } //returns term incidence vector public List<int> GetTermIncidenceVector(string term) { return termDocumentIncidenceMatrix[term.ToUpper()]; } } 的类,我在Class LogManager上使用了它。在这里:

Class TermDocMatrix

}

它必须有效,但事实并非如此。请告诉我为什么它不起作用。当我要求回答时,我只是看到“找不到搜索结果”,无论我输入什么样的布尔查询。

1 个答案:

答案 0 :(得分:1)

您的问题在这一行:(ProcessFiles函数)

String[] termsCollection = RemoveStopsWords(file.ToUpper().Split(' '));

您要拆分文件的名称而不是内容 这就是你没有搜索结果的原因

你应该做这样的事情:

String[] termsCollection = RemoveStopsWords(File.ReadAllText(file).ToUpper().Split(' '));

现在更改您的TermDocMatrix构造函数:

public TermDocMatrix(string IndexPath,string FileName)
{
    if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
    LogManager.Configure(System.IO.Path.Combine(_Path, _FileName + ".txt"), false);
    // read all files
    LoadFiles();
}

您的LoadFiles功能:

private void LoadFiles()
{
    int count = 0;

    if (File.Exists(System.IO.Path.Combine(_Path, _FileName + ".txt")) == false)
        return;
    // load words
    string b = File.ReadAllText(System.IO.Path.Combine(_Path, _FileName + ".txt"));

    .....
}