我有以下管道:
// Load three sample messages into an in-memory data view.
var mlContext = new MLContext();
var samples = new[]
{
    new Input {Message = "one two three one two three"},
    new Input {Message = "one two"},
    new Input {Message = "two three"}
};
var data = mlContext.Data.LoadFromEnumerable(samples);

// Each estimator is named separately so the column wiring is easy to follow.
// "TextFeatures" is produced here but not consumed by any later step in this snippet.
var featurizer = mlContext.Transforms.Text.FeaturizeText(
    outputColumnName: "TextFeatures",
    inputColumnName: "Message");

// NOTE(review): TokenizeCharacters presumably emits character-level tokens, so the
// n-grams built below would be over characters rather than words - confirm.
var tokenizer = mlContext.Transforms.Text.TokenizeCharacters(
    outputColumnName: "MessageTokens",
    inputColumnName: "Message");

// n-grams of length 2 over "MessageTokens", written to "MessageNgrams".
var ngramExtractor = mlContext.Transforms.Text.ProduceNgrams(
    outputColumnName: "MessageNgrams",
    inputColumnName: "MessageTokens",
    ngramLength: 2);

var pipeline = featurizer
    .Append(tokenizer)
    .Append(ngramExtractor);

// Fit on the sample data, then apply the fitted model to the same data.
var model = pipeline.Fit(data);
var transformedData = model.Transform(data);
管道执行后,“MessageNgrams”列包含的是浮点向量。如何获得实际的 n-gram 文本,例如 "one two"、"two three"?
答案 0 :(得分:1)
/// <summary>
/// Extracts the textual n-grams of a string by running it through an ML.NET
/// word-tokenize / map-to-key / ProduceNgrams pipeline and reading the slot
/// names of the resulting "NgramFeatures" column.
/// </summary>
/// <param name="str">Text to split into words and turn into n-grams.</param>
/// <param name="nsize">n-gram length passed to ProduceNgrams; only this exact length is requested (useAllLengths is false).</param>
/// <returns>
/// One string per slot of the "NgramFeatures" column, with the '|' separator of each
/// slot name replaced by a space. An empty list when <paramref name="str"/> is null or blank.
/// </returns>
List<string> getNgram(string str, int nsize)
{
    // Nothing to tokenize: return early instead of fitting a pipeline on empty text.
    if (string.IsNullOrWhiteSpace(str))
    {
        return new List<string>();
    }

    var mlContext = new MLContext();
    var dataview = mlContext.Data.LoadFromEnumerable(new List<TextData>() { new TextData { Text = str } });

    // ProduceNgrams is fed the key-typed "Tokens" column, hence the MapValueToKey step.
    var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")
        .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
        .Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures", "Tokens",
            ngramLength: nsize,
            useAllLengths: false,
            weighting: NgramExtractingEstimator.WeightingCriteria.Tf));

    var textTransformer = textPipeline.Fit(dataview);
    var transformedDataView = textTransformer.Transform(dataview);

    // The slot names of the output column carry the n-gram text for each vector position.
    VBuffer<ReadOnlyMemory<char>> slotNames = default;
    transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);

    // The float values of the column are not needed here; only the slot names are read.
    return slotNames.GetValues()
        .ToArray()
        .Select(x => x.Span.ToString().Replace('|', ' '))
        .ToList();
}
答案 1 :(得分:0)
下面是我在 SDCA 模型上使用的代码段;您的情况应该非常类似,即使不完全相同,也能为您提供实现思路。
// Read the slot names attached to the "MessageNgrams" column.
VBuffer<ReadOnlyMemory<char>> ngramSlotNames = default;
transformedData.Schema["MessageNgrams"].GetSlotNames(ref ngramSlotNames);

// Map each slot index to its label text.
var slotLabels = new Dictionary<int, string>();
for (int slot = 0; slot < ngramSlotNames.Length; slot++)
{
    string label = ngramSlotNames.GetItemOrDefault(slot).ToString();
    slotLabels[slot] = label;
}
slotLabels 的键(索引)与浮点向量中各元素的位置一一对应:向量第 i 个元素对应的 n-gram 就是 slotLabels[i]。