我正在实现一个反向索引结构,特别是允许布尔查询和字级粒度的结构。
我有大量的文本数据库,我保留一个索引,告诉我,对于每个单词,它在哪个文件中(IDdoc
),以及它在文件中的位置(position
) 。 (一个单词可以在许多文件中,也可以在一个文件中的许多地方。)
因此,我为每个单词保留一个向量:
vector<pair<IDdoc,position>> occurences_of_word;
(向量按IDdoc排序,然后按位置按升序排序。)
我有一个由{strong>字组成的string
对象。这是我正在寻找的短语。
对于词组中的每个词,我想知道哪些文档包含此词组,因此返回IDdoc
s的向量。
以下是我尝试解决方案:
typedef std::string Word_t;
typedef unsigned int WordPosition_t;
typedef unsigned int IDdocument_t;
vector<pair<IDdocument_t,WordPosition_t> > IndiceInvertidoBooleanoConPosicion::_interseccion_dos_listas
(const vector<pair<IDdocument_t,WordPosition_t>> & v1,
const vector<pair<IDdocument_t,WordPosition_t>> & v2)
{
vector<pair<IDdocument_t,WordPosition_t> > intersection;
IDdocument_t ID_doc_one, ID_doc_two;
int i = 0;
int j = 0;
const int MAX_INDEX_V1 = v1.size() -1;
const int MAX_INDEX_V2 = v2.size() -1;
while(i <= MAX_INDEX_V1 && j <= MAX_INDEX_V2)
{
ID_doc_one = v1[i].first;
ID_doc_two = v2[j].first;
if (ID_doc_one < ID_doc_two)
i++;
else if (ID_doc_one > ID_doc_two)
j++;
else // The words were found in the same document!
{
WordPosition_t pos_word_one = v1[i].second;
WordPosition_t pos_word_two = v2[j].second;
// The words make a phrase! Return pos_two for the next intersection finding step
if (pos_word_one + 1 == pos_word_two)
{
intersection.push_back(make_pair(ID_doc_one,pos_word_two));
i++;
j++;
}
// Phrase not found
else
{
if (pos_word_one < pos_word_two)
i++;
else
j++;
}
}
}
return intersection;
}
int find_phrase(const string phrase, vector<IDdocument_t> & id_docs)
{
Word_t word;
id_docs.clear();
Text parsed_phrase;
// Extract the relevant words from the phrase
parsed_phrase.parse(phrase);
vector<pair<IDdocument_t,WordPosition_t> > intersection;
vector<pair<IDdocument_t,WordPosition_t> > second_vector;
while (parsed_phrase.get_next_word(word) != RES_END)
{
_find_vector_words(word,intersection);
while (parsed_phrase.get_next_word(word) != RES_END)
{
_find_vector_words(word,second_vector);
intersection = _intersect_two_words(intersection,second_vector);
}
}
for (unsigned int i = 0; i < intersection.size(); i ++)
{
IDdocument_t id_doc = intersection[i].first;
if(std::find(id_docs.begin(), id_docs.end(), id_doc) == id_docs.end())
id_docs.push_back(id_doc);
}
return RES_OK;
}
答案 0 :(得分:2)
要从字符串表示中查找特定的Word,您可能希望查看map之类的内容。要创建简单的结果联合,您可能需要set。这个实现更多地作为一个演示而不是一个非常理想的最终实现(c.f. sloppy短语解析)。
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <string>
typedef std::string IDdoc;
typedef int position;
typedef std::pair<IDdoc,position> Occurrence;
typedef std::vector<Occurrence> OccurrencesOfWord;
typedef std::map<std::string /*word*/, OccurrencesOfWord> Dictionary;
typedef std::set<IDdoc> Matches;
bool findMatchesForPhrase(const std::string& phrase, const Dictionary& dictionary, Matches& matches)
{
size_t pos = 0;
size_t len = 0;
while (pos < phrase.length()) {
size_t end = phrase.find(' ', pos);
size_t len = ((end == phrase.npos) ? phrase.length() : end) - pos;
std::string word(phrase, pos, len);
pos += len + 1; // to skip the space.
// ignore words not in the dictionary.
auto dictIt = dictionary.find(word);
if (dictIt == dictionary.end())
continue;
auto& occurrences = dictIt->second; // shortcut/alias,.
for (auto& occurIt : occurrences) {
// Add all the IDdoc's of this occurence to the set.
matches.insert(occurIt.first);
}
}
return !matches.empty();
}
void addToDictionary(Dictionary& dict, const char* word, const char* doc, int position)
{
dict[word].push_back(std::make_pair(std::string(doc), position));
}
int main(int argc, const char** argv)
{
std::string phrase("pizza is life");
Dictionary dict;
addToDictionary(dict, "pizza", "book1", 10);
addToDictionary(dict, "pizza", "book2", 30);
addToDictionary(dict, "life", "book1", 1);
addToDictionary(dict, "life", "book3", 1);
addToDictionary(dict, "goat", "book4", 99);
Matches matches;
bool result = findMatchesForPhrase(phrase, dict, matches);
std::cout << "result = " << result << std::endl;
for (auto& ent : matches) {
std::cout << ent << std::endl;
}
return 0;
}
跟进以解决您的更改:
while(i < SIZE_VECTOR_ONE && j < SIZE_VECTOR_TWO)
{
if (ID_doc_one < ID_doc_two)
{
ID_doc_one = v1[++i].first;
让我们说“SIZE_VECTOR 1”是1.这意味着向量中有一个元素,元素[0]。如果ID_doc_one为0且ID_doc_two为1,则
if (0 < 1) {
ID_doc_one = v1[1].first;
无效。你可能最好使用迭代器或指针:
while (oneIt != v1.end() && twoIt != v2.end()) {
if (oneIt->first < twoIt->first) {
++oneIt;
continue;
} else if (*twoIt < *oneIt) {
++twoIt;
continue;
}
// same documentId in both lists, snag positions.
...
}
接下来,这看起来有点破碎:
else {
} // To avoid "out of range" errors <-- but also ends the "else"
if (i < SIZE_VECTOR_ONE - 1)
ID_doc_one = v1[++i].first;
if (j < SIZE_VECTOR_TWO - 1)
ID_doc_two = v2[++j].first;
}
我想知道如果你有相同的文件但有多个职位会发生什么?
接下来是挑剔的,但我花了很长时间才解析
WordPosition_t pos_one = v1[i].second;
WordPosition_t pos_two = v2[j].second;
// The words make a phrase! Return pos_two for the next intersection finding step
if (pos_one + 1 == pos_two)
如你所说的那样写这个似乎非常清楚“(如果第二个单词在第一个单词之后的位置):
WordPosition_t posFirstWord = v1[i].second;
WordPosition_t posSecondWord = v2[j].second;
// The words make a phrase! Return pos_two for the next intersection finding step
if (posSecondWord == posFirstWord + 1)
下一部分有点令人困惑,因为这两个条款似乎都是为了增加i和j并更新ID_doc_one和2,所以在if块之后将该部分提升到公共部分是有意义的,但是else {}
让你很难说出你在做什么。
if (pos_one + 1 == pos_two)
{
intersection.push_back(make_pair(ID_doc_one,pos_two));
ID_doc_one = v1[++i].first;
ID_doc_two = v2[++j].first;
}
else {
} // To avoid "out of range" errors
if (i < SIZE_VECTOR_ONE - 1)
ID_doc_one = v1[++i].first;
if (j < SIZE_VECTOR_TWO - 1)
ID_doc_two = v2[++j].first;
}
当你匹配两个数组时,你总是希望增加i和j,它不是条件,我也不确定你为什么使用pos_two,因为这个短语实际上是在pos_one找到的?
这就是我写的方式:
#include<iostream>
#include<map>
#include<vector>
#include<string>
typedef std::string Word_t;
typedef unsigned int WordPosition_t;
typedef unsigned int IDdocument_t;
typedef std::pair<IDdocument_t, WordPosition_t> DocumentPosition_t;
typedef std::vector<DocumentPosition_t> WordReferences_t;
WordReferences_t _intersect_two_words(const WordReferences_t& v1, const WordReferences_t& v2)
{
// all the locations where the words occur one after the other.
WordReferences_t intersection;
auto firstIt = v1.begin();
auto secondIt = v2.begin();
while (firstIt != v1.end() && secondIt != v2.end())
{
if (firstIt->first < secondIt->first)
{
++firstIt;
continue;
}
// find the second word in the same document and AFTER the first word.
if (secondIt->first < firstIt->first || secondIt->second < firstIt->second + 1)
{
++secondIt;
continue;
}
// first word wasn't just before the second, it's not a phrase.
if (secondIt->second > firstIt->second + 1)
{
++firstIt;
continue;
}
// We found a phrase.
intersection.emplace_back(*firstIt);
++firstIt;
++secondIt;
}
return intersection;
}
int main()
{
WordReferences_t v1, v2;
v1.push_back(std::make_pair(10, 5));
v1.push_back(std::make_pair(10, 25));
v1.push_back(std::make_pair(11, 10));
v1.push_back(std::make_pair(12, 1));
v1.push_back(std::make_pair(12, 11));
v1.push_back(std::make_pair(12, 21));
v1.push_back(std::make_pair(12, 31));
v1.push_back(std::make_pair(15, 11));
v1.push_back(std::make_pair(100, 1));
v1.push_back(std::make_pair(100, 11));
v1.push_back(std::make_pair(100, 21));
v1.push_back(std::make_pair(101, 11));
v1.push_back(std::make_pair(102, 11));
v1.push_back(std::make_pair(102, 13));
v1.push_back(std::make_pair(102, 14));
v1.push_back(std::make_pair(103, 11));
v1.push_back(std::make_pair(103, 13));
v2.push_back(std::make_pair(10, 11));
v2.push_back(std::make_pair(12, 10));
v2.push_back(std::make_pair(12, 40));
v2.push_back(std::make_pair(16, 11));
v2.push_back(std::make_pair(100, 12)); // match
v2.push_back(std::make_pair(101, 12)); // match
v2.push_back(std::make_pair(101, 13));
v2.push_back(std::make_pair(101, 14));
v2.push_back(std::make_pair(102, 12)); //match
v2.push_back(std::make_pair(103, 1));
v2.push_back(std::make_pair(103, 10));
v2.push_back(std::make_pair(103, 12)); // match
v2.push_back(std::make_pair(103, 15));
auto intersection = _intersect_two_words(v1, v2);
for (auto entry : intersection)
{
std::cout << entry.first << ", " << entry.second << "+" << (entry.second + 1) << std::endl;
}
return 0;
}
答案 1 :(得分:0)
我不知道这是否最有效,但你可以从words[0]
的文件/位置开始。然后转到words[1]
并查找相同文档的位置等于words[0].position + words[0].length + 1
的交叉文档。然后同样迭代words
的其余部分。对于较长的短语,它应该很快缩小范围吗?
答案 2 :(得分:0)
正如您所说,您使用的数据结构实际上是一个完整的倒排索引,如维基百科所述:
反向索引有两种主要变体:记录级反向索引(或反向文件索引或仅反向文件)包含每个单词的文档引用列表。 单词级别倒排索引(或完整倒排索引或倒排列表)还包含文档中每个单词的位置。[2] 后一种形式提供更多功能(如短语搜索),但需要更多时间和空间要创造。
话虽这么说,你也可以尝试创建一个短语索引:
http://ww2.cs.mu.oz.au/~jz/fulltext/acmtois04.pdf
(参见图2作为演示)。
如果你没有创建一个短语索引,那么你可以做的(我相信),就是简单地检索包含特定单词的文档,当你将查询从单词增加到单词时,将与你所拥有的文档集相交短语,然后最后回到文档,看看你所回复的每个文件是否实际上包含“短语”而不是“在不同位置彼此分开的单词”。