我尝试以这些方式回复一些简单的布尔查询NOT x NOT y NOT z
也x AND y AND z
,并且这个x OR y OR z
x,y,z
也是一些单词,其中任何一个都属于不同的{ {1}}或者所有这些都属于同一个file.txt
,无论如何。
我写过file.txt
:
它必须能够回答一个布尔查询,我为class TermDocMatrix
准备了一些方法用于这些目的,但它不起作用。我甚至一步一步调试代码,我意识到循环没有转向。我不知道为什么,代码似乎很好。
你可以在这里看到代码:
class TermDocMatrix{ }
您现在需要了解另一个名为class TermDocMatrix
{
//stores distinct terms
public HashSet<string> distinctTerm = new HashSet<string>();
//stores document id and its contents without splitting
public Dictionary<int, string> documentContentList = new Dictionary<int, string>();
//stores document and its terms collection
public Dictionary<string, List<string>> documentCollection = new Dictionary<string, List<string>>();
public Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
//stop words collection
public List<string> stopWords = new List<string> { "on", "of", "The", "an", "a", "in" };
//boolean operators list
public string[] booleanOperator = new string[] { "AND", "OR", "NOT" };
private string _FileName = "words";
public string _Path = "";
int _lastDocNum = 0;
public TermDocMatrix(string IndexPath,string FileName)
{
if (_Path.EndsWith("\\") == false) _Path += "\\";
if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
LogManager.Configure(_Path + _FileName + ".txt", false);
// read all files
LoadFiles();
}
private void LoadFiles()
{
int count = 0;
if (File.Exists(_Path + _FileName + ".txt") == false)
return;
// load words
string b = File.ReadAllText(_Path + _FileName + ".txt");
String[] TermsCollection = RemoveStopsWords(b.ToUpper().Split(' '));
foreach (string term in TermsCollection)
{
//prepeare distinct terms collection
//remove stop words
if (!stopWords.Contains(term))
{
distinctTerm.Add(term);
}
}
//add document and their terms collection
documentCollection.Add(_FileName, TermsCollection.ToList());
//add document and its content for displaying the search result
documentContentList.Add(count, b);
count++;
}
public string ProcessFiles(string query)
{
termDocumentIncidenceMatrix = GetTermDocumentIncidenceMatrix(distinctTerm, documentCollection);
do
{
List<int> lst = ProcessQuery(query);
int count = 0;
if (lst != null)
{
foreach (int a in lst)
{
if (a == 1)
{
return documentContentList[count];
}
count++;
}
}
else
{
return "No search result found";
}
} while (1 == 1);
}
public int WordCount()
{
return documentCollection.Count;
}
public int DocumentCount
{
get
{
return _lastDocNum;
}
}
private void FilterQueryTerm(ref string[] str)
{
List<string> _queryTerm = new List<string>();
foreach (string queryTerm in str)
{
if (queryTerm.ToUpper().Equals("BUT") || termDocumentIncidenceMatrix.ContainsKey(queryTerm.ToUpper()) || booleanOperator.Contains(queryTerm))
{
_queryTerm.Add(queryTerm);
}
}
str = _queryTerm.ToArray();
}
//prepares Term Document Incidence Matrix
public Dictionary<string, List<int>> GetTermDocumentIncidenceMatrix(HashSet<string> distinctTerms, Dictionary<string, List<string>> documentCollection)
{
Dictionary<string, List<int>> termDocumentIncidenceMatrix = new Dictionary<string, List<int>>();
List<int> incidenceVector = new List<int>();
foreach (string term in distinctTerms)
{
//incidence vector for each terms
incidenceVector = new List<int>();
foreach (KeyValuePair<string, List<string>> p in documentCollection)
{
if (p.Value.Contains(term))
{
//document contains the term
incidenceVector.Add(1);
}
else
{
//document do not contains the term
incidenceVector.Add(0);
}
}
termDocumentIncidenceMatrix.Add(term, incidenceVector);
}
return termDocumentIncidenceMatrix;
}
//removes all stop words
public string[] RemoveStopsWords(string[] str)
{
List<string> terms = new List<string>();
foreach (string term in str)
{
if (!stopWords.Contains(term))
{
terms.Add(term);
}
}
return terms.ToArray();
}
//process the boolean query
public List<int> ProcessQuery(string query)
{
//query boolean operator
string bitWiseOp = string.Empty;
string[] queryTerm = RemoveStopsWords(query.ToUpper().Split(' '));
//remove query term that doesnot appears on document collection
FilterQueryTerm(ref queryTerm);
List<int> previousTermIncidenceV = null;
List<int> nextTermsIncidenceV = null;
//holds the bitwise operation result
List<int> resultSet = null;
//suppose on query X AND Y, X is previousTerm term and Y is nextTerm
Boolean hasPreviousTerm = false;
Boolean hasNotOperation = false;
foreach (string term in queryTerm)
{
//is a term
if (!booleanOperator.Contains(term) && !term.Equals("BUT"))
{
//query case: structure AND NOT analysis
if (hasNotOperation)
{
if (hasPreviousTerm)
{
nextTermsIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
}
//query case: eg.NOT analysis
else
{
previousTermIncidenceV = ProcessBooleanOperator("NOT", GetTermIncidenceVector(term), nextTermsIncidenceV);
resultSet = previousTermIncidenceV;
}
hasNotOperation = false;
}
else if (!hasPreviousTerm)
{
previousTermIncidenceV = GetTermIncidenceVector(term);
resultSet = previousTermIncidenceV;
hasPreviousTerm = true;
}
else
{
nextTermsIncidenceV = GetTermIncidenceVector(term);
}
}
else if (term.Equals("NOT"))
{
//indicates that the term in the next iteration should be complemented.
hasNotOperation = true;
}
else
{
//'BUT' also should be evaluated as AND eg. structure BUT NOT semantic should be evaluated as structure AND NOT semantic
if (term.Equals("BUT"))
{
bitWiseOp = "AND";
}
else
bitWiseOp = term;
}
if (nextTermsIncidenceV != null && !hasNotOperation)
{
resultSet = ProcessBooleanOperator(bitWiseOp, previousTermIncidenceV, nextTermsIncidenceV);
previousTermIncidenceV = resultSet;
hasPreviousTerm = true;
nextTermsIncidenceV = null;
}
}
return resultSet;
}
//Process Boolean operators
public List<int> ProcessBooleanOperator(string op, List<int> previousTermV, List<int> nextTermV)
{
List<int> resultSet = new List<int>();
if (op.Equals("NOT"))
{
foreach (int a in previousTermV)
{
if (a == 1)
{
resultSet.Add(0);
}
else
{
resultSet.Add(1);
}
}
}
else if (op.ToUpper().Equals("AND")) //bitwise AND operation
{
for (int a = 0; a < previousTermV.Count; a++)
{
if (previousTermV[a] == 1 && nextTermV[a] == 1)
{
resultSet.Add(1);
}
else
{
resultSet.Add(0);
}
}
}
else if (op.ToUpper().Equals("OR")) //bitwise OR operation
{
for (int a = 0; a < previousTermV.Count; a++)
{
if (previousTermV[a] == 0 && nextTermV[a] == 0)
{
resultSet.Add(0);
}
else
{
resultSet.Add(1);
}
}
}
return resultSet;
}
//returns term incidence vector
public List<int> GetTermIncidenceVector(string term)
{
return termDocumentIncidenceMatrix[term.ToUpper()];
}
}
的类,我在Class LogManager
上使用了它。在这里:
Class TermDocMatrix
}
它必须有效,但事实并非如此。请告诉我为什么它不起作用。当我要求回答时,我只是看到“找不到搜索结果”,无论我输入什么样的布尔查询。
答案 0 :(得分:1)
您的问题在这一行:(ProcessFiles
函数)
String[] termsCollection = RemoveStopsWords(file.ToUpper().Split(' '));
您要拆分文件的名称而不是内容 这就是你没有搜索结果的原因
你应该做这样的事情:
String[] termsCollection = RemoveStopsWords(File.ReadAllText(file).ToUpper().Split(' '));
现在更改您的TermDocMatrix
构造函数:
public TermDocMatrix(string IndexPath,string FileName)
{
if (!Directory.Exists(IndexPath)) Directory.CreateDirectory(IndexPath);
LogManager.Configure(System.IO.Path.Combine(_Path, _FileName + ".txt"), false);
// read all files
LoadFiles();
}
您的LoadFiles
功能:
private void LoadFiles()
{
int count = 0;
if (File.Exists(System.IO.Path.Combine(_Path, _FileName + ".txt")) == false)
return;
// load words
string b = File.ReadAllText(System.IO.Path.Combine(_Path, _FileName + ".txt"));
.....
}