我肯定在这里做错了什么。我正在尝试让我们在使用Elastic Search的搜索引擎项目中使用“ More Like This”查询。这个想法是,CMS可以在Meta标签或其他内容中将标签(如类别)写入页面,我们将这些标签读入Elastic并使用它们基于输入文档ID进行“更像这样”的搜索。 / p>
因此,如果输入文档的标签为catfish, chicken, goat
,我希望Elastic Search可以找到共享这些标签的其他文档,而不会为racecar
和airplane
返回标签。
我通过以下方式构建了概念验证控制台应用程序:
按照https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html
创建一个新的.NET Framework 4.6.1控制台应用程序
为NEST 6.5.0和ElasticSearch.Net 6.5.0添加NuGet软件包
然后,我创建了一个新的弹性索引,该索引包含具有“标签”属性的对象(“ MyThing”类型)。此标签是一组可能的值中随机逗号分隔的一组单词。在测试中,我已经在索引中插入了100到5000个项目。我尝试了越来越少的可能单词。
无论我尝试执行什么MoreLikeThis
查询,都永远不会返回任何内容,而且我也不明白为什么。
不返回结果的查询:
var result = EsClient.Search<MyThing>(s => s
.Index(DEFAULT_INDEX)
.Query(esQuery =>
{
var mainQuery = esQuery
.MoreLikeThis(mlt => mlt
.Include(true)
.Fields(f => f.Field(ff => ff.Tags, 5))
.Like(l => l.Document(d => d.Id(id)))
);
return mainQuery;
}
完整的“ program.cs”源:
using Nest;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Test_MoreLikeThis_ES6
{
class Program
{
public class MyThing
{
public string Tags { get; set; }
}
const string ELASTIC_SERVER = "http://localhost:9200";
const string DEFAULT_INDEX = "my_index";
const int NUM_RECORDS = 1000;
private static Uri es_node = new Uri(ELASTIC_SERVER);
private static ConnectionSettings settings = new ConnectionSettings(es_node).DefaultIndex(DEFAULT_INDEX);
private static ElasticClient EsClient = new ElasticClient(settings);
private static Random rnd = new Random();
static void Main(string[] args)
{
Console.WriteLine("Rebuild index? (y):");
var answer = Console.ReadLine().ToLower();
if (answer == "y")
{
RebuildIndex();
for (int i = 0; i < NUM_RECORDS; i++)
{
AddToIndex();
}
}
Console.WriteLine("");
Console.WriteLine("Getting a Thing...");
var aThingId = GetARandomThingId();
Console.WriteLine("");
Console.WriteLine("Looking for something similar to document with id " + aThingId);
Console.WriteLine("");
Console.WriteLine("");
GetMoreLikeAThing(aThingId);
}
private static string GetARandomThingId()
{
var firstdocQuery = EsClient
.Search<MyThing>(s =>
s.Size(1)
.Query(q => {
return q.FunctionScore(fs => fs.Functions(fn => fn.RandomScore(rs => rs.Seed(DateTime.Now.Ticks).Field("_seq_no"))));
})
);
if (!firstdocQuery.IsValid || firstdocQuery.Hits.Count == 0) return null;
var hit = firstdocQuery.Hits.First();
Console.WriteLine("Found a thing with id '" + hit.Id + "' and tags: " + hit.Source.Tags);
return hit.Id;
}
private static void GetMoreLikeAThing(string id)
{
var result = EsClient.Search<MyThing>(s => s
.Index(DEFAULT_INDEX)
.Query(esQuery =>
{
var mainQuery = esQuery
.MoreLikeThis(mlt => mlt
.Include(true)
.Fields(f => f.Field(ff => ff.Tags, 5))
.Like(l => l.Document(d => d.Id(id)))
);
return mainQuery;
}
));
if (result.IsValid)
{
if (result.Hits.Count > 0)
{
Console.WriteLine("These things are similar:");
foreach (var hit in result.Hits)
{
Console.WriteLine(" " + hit.Id + " : " + hit.Source.Tags);
}
}
else
{
Console.WriteLine("No similar things found.");
}
}
else
{
Console.WriteLine("There was an error running the ES query.");
}
Console.WriteLine("");
Console.WriteLine("Enter (y) to get another thing, or anything else to exit");
var y = Console.ReadLine().ToLower();
if (y == "y")
{
var aThingId = GetARandomThingId();
GetMoreLikeAThing(aThingId);
}
Console.WriteLine("");
Console.WriteLine("Any key to exit...");
Console.ReadKey();
}
private static void RebuildIndex()
{
var existsResponse = EsClient.IndexExists(DEFAULT_INDEX);
if (existsResponse.Exists) //delete existing mapping (and data)
{
EsClient.DeleteIndex(DEFAULT_INDEX);
}
var rebuildResponse = EsClient.CreateIndex(DEFAULT_INDEX, c => c.Settings(s => s.NumberOfReplicas(1).NumberOfShards(5)));
var response2 = EsClient.Map<MyThing>(m => m.AutoMap());
}
private static void AddToIndex()
{
var myThing = new MyThing();
var tags = new List<string> {
"catfish",
"tractor",
"racecar",
"airplane",
"chicken",
"goat",
"pig",
"horse",
"goose",
"duck"
};
var randNum = rnd.Next(0, tags.Count);
//get randNum random tags
var rand = tags.OrderBy(o => Guid.NewGuid().ToString()).Take(randNum);
myThing.Tags = string.Join(", ", rand);
var ir = new IndexRequest<MyThing>(myThing);
var indexResponse = EsClient.Index(ir);
Console.WriteLine("Index response: " + indexResponse.Id + " : " + string.Join(" " , myThing.Tags));
}
}
}
答案 0 :(得分:1)
这里的问题是原型文档的任何条款永远都无法满足默认min_term_freq
值2,因为所有文档一次仅包含每个标签( term )。如果将min_term_freq
减为1,则会得到结果。可能还希望将min_doc_freq
设置为1,并与排除原型文档的查询结合。
这是一个玩的例子
const string ELASTIC_SERVER = "http://localhost:9200";
const string DEFAULT_INDEX = "my_index";
const int NUM_RECORDS = 1000;
private static readonly Random _random = new Random();
private static readonly IReadOnlyList<string> Tags =
new List<string>
{
"catfish",
"tractor",
"racecar",
"airplane",
"chicken",
"goat",
"pig",
"horse",
"goose",
"duck"
};
private static ElasticClient _client;
private static void Main()
{
var pool = new SingleNodeConnectionPool(new Uri(ELASTIC_SERVER));
var settings = new ConnectionSettings(pool)
.DefaultIndex(DEFAULT_INDEX);
_client = new ElasticClient(settings);
Console.WriteLine("Rebuild index? (y):");
var answer = Console.ReadLine().ToLower();
if (answer == "y")
{
RebuildIndex();
AddToIndex();
}
Console.WriteLine();
Console.WriteLine("Getting a Thing...");
var aThingId = GetARandomThingId();
Console.WriteLine();
Console.WriteLine("Looking for something similar to document with id " + aThingId);
Console.WriteLine();
Console.WriteLine();
GetMoreLikeAThing(aThingId);
}
public class MyThing
{
public List<string> Tags { get; set; }
}
private static string GetARandomThingId()
{
var firstdocQuery = _client
.Search<MyThing>(s =>
s.Size(1)
.Query(q => q
.FunctionScore(fs => fs
.Functions(fn => fn
.RandomScore(rs => rs
.Seed(DateTime.Now.Ticks)
.Field("_seq_no")
)
)
)
)
);
if (!firstdocQuery.IsValid || firstdocQuery.Hits.Count == 0) return null;
var hit = firstdocQuery.Hits.First();
Console.WriteLine($"Found a thing with id '{hit.Id}' and tags: {string.Join(", ", hit.Source.Tags)}");
return hit.Id;
}
private static void GetMoreLikeAThing(string id)
{
var result = _client.Search<MyThing>(s => s
.Index(DEFAULT_INDEX)
.Query(esQuery => esQuery
.MoreLikeThis(mlt => mlt
.Include(true)
.Fields(f => f.Field(ff => ff.Tags))
.Like(l => l.Document(d => d.Id(id)))
.MinTermFrequency(1)
.MinDocumentFrequency(1)
) && !esQuery
.Ids(ids => ids
.Values(id)
)
)
);
if (result.IsValid)
{
if (result.Hits.Count > 0)
{
Console.WriteLine("These things are similar:");
foreach (var hit in result.Hits)
{
Console.WriteLine($" {hit.Id}: {string.Join(", ", hit.Source.Tags)}");
}
}
else
{
Console.WriteLine("No similar things found.");
}
}
else
{
Console.WriteLine("There was an error running the ES query.");
}
Console.WriteLine();
Console.WriteLine("Enter (y) to get another thing, or anything else to exit");
var y = Console.ReadLine().ToLower();
if (y == "y")
{
var aThingId = GetARandomThingId();
GetMoreLikeAThing(aThingId);
}
Console.WriteLine();
Console.WriteLine("Any key to exit...");
}
private static void RebuildIndex()
{
var existsResponse = _client.IndexExists(DEFAULT_INDEX);
if (existsResponse.Exists) //delete existing mapping (and data)
{
_client.DeleteIndex(DEFAULT_INDEX);
}
var rebuildResponse = _client.CreateIndex(DEFAULT_INDEX, c => c
.Settings(s => s
.NumberOfShards(1)
)
.Mappings(m => m
.Map<MyThing>(mm => mm.AutoMap())
)
);
}
private static void AddToIndex()
{
var bulkAllObservable = _client.BulkAll(GetMyThings(), b => b
.RefreshOnCompleted()
.Size(1000));
var waitHandle = new ManualResetEvent(false);
Exception exception = null;
var bulkAllObserver = new BulkAllObserver(
onNext: r =>
{
Console.WriteLine($"Indexed page {r.Page}");
},
onError: e =>
{
exception = e;
waitHandle.Set();
},
onCompleted: () => waitHandle.Set());
bulkAllObservable.Subscribe(bulkAllObserver);
waitHandle.WaitOne();
if (exception != null)
{
throw exception;
}
}
private static IEnumerable<MyThing> GetMyThings()
{
for (int i = 0; i < NUM_RECORDS; i++)
{
var randomTags = Tags.OrderBy(o => Guid.NewGuid().ToString())
.Take(_random.Next(0, Tags.Count))
.OrderBy(t => t)
.ToList();
yield return new MyThing { Tags = randomTags };
}
}
这是示例输出
Found a thing with id 'Ugg9LGkBPK3n91HQD1d5' and tags: airplane, goat
These things are similar:
4wg9LGkBPK3n91HQD1l5: airplane, goat
9Ag9LGkBPK3n91HQD1l5: airplane, goat
Vgg9LGkBPK3n91HQD1d5: airplane, goat, goose
sQg9LGkBPK3n91HQD1d5: airplane, duck, goat
lQg9LGkBPK3n91HQD1h5: airplane, catfish, goat
9gg9LGkBPK3n91HQD1l5: airplane, catfish, goat
FQg9LGkBPK3n91HQD1p5: airplane, goat, goose
Jwg9LGkBPK3n91HQD1p5: airplane, goat, goose
Fwg9LGkBPK3n91HQD1d5: airplane, duck, goat, tractor
Kwg9LGkBPK3n91HQD1d5: airplane, goat, goose, horse