/// <summary>
/// Searches an utterance for occurrences of the supplied values and returns the
/// surviving matches ordered by their position in the utterance.
/// </summary>
/// <param name="utterance">The text to search.</param>
/// <param name="values">
/// Candidate values to look for. Each entry supplies the text to match and the index
/// that is reported back on the match. The list is not modified. Must not be null.
/// </param>
/// <param name="options">
/// Optional settings (tokenizer, locale, maximum token distance). When null, a default
/// <c>FindValuesOptions</c> is used; the maximum token distance defaults to 2.
/// </param>
/// <returns>
/// Matches whose Start/End have been translated from token positions to the character
/// positions recorded on the tokens, with Text populated, sorted by Start ascending.
/// At most one match per value index is returned and no two matches share a token.
/// </returns>
public static List<ModelResult<FoundValue>> FindValues(string utterance, List<SortedValue> values, FindValuesOptions options = null)
{
    // Sort a private copy in descending order by length so that the longest value is
    // searched for first. Copying avoids reordering the caller's list as a side effect.
    var list = new List<SortedValue>(values);
    list.Sort((a, b) => b.Value.Length - a.Value.Length);

    // Resolve options and tokenize the utterance once up front.
    var matches = new List<ModelResult<FoundValue>>();
    var opt = options ?? new FindValuesOptions();
    var tokenizer = opt.Tokenizer ?? Tokenizer.DefaultTokenizer;
    var tokens = tokenizer(utterance, opt.Locale);
    var maxDistance = opt.MaxTokenDistance ?? 2;

    // Search for each value within the utterance.
    foreach (var entry in list)
    {
        // Find all matches for a value
        // - To match "last one" in "the last time I chose the last one" we need
        //   to re-search the utterance starting from the end of the previous match.
        // - The start & end position returned for the match are token positions.
        var startPos = 0;
        var vTokens = tokenizer(entry.Value.Trim(), opt.Locale);
        while (startPos < tokens.Count)
        {
            var found = MatchValue(tokens, maxDistance, opt, entry.Index, entry.Value, vTokens, startPos);
            if (found == null)
            {
                break;
            }

            startPos = found.End + 1;
            matches.Add(found);
        }
    }

    // Sort matches by score descending.
    // CompareTo is used instead of casting the difference to int: the cast truncates
    // toward zero, so any two scores less than 1 apart would compare as equal.
    matches.Sort((a, b) => b.Resolution.Score.CompareTo(a.Resolution.Score));

    // Filter out duplicate matching indexes and overlapping tokens.
    // - The start & end positions are token positions and need to be translated to
    //   character positions before returning. The "text" field is populated as well.
    var results = new List<ModelResult<FoundValue>>();
    var foundIndexes = new HashSet<int>();
    var usedTokens = new HashSet<int>();
    foreach (var candidate in matches)
    {
        // Reject a candidate whose value index was already accepted or that reuses
        // a token claimed by a previously accepted (higher ranked) match.
        var add = !foundIndexes.Contains(candidate.Resolution.Index);
        for (var i = candidate.Start; i <= candidate.End; i++)
        {
            if (usedTokens.Contains(i))
            {
                add = false;
                break;
            }
        }

        if (!add)
        {
            continue;
        }

        // Update filter info.
        foundIndexes.Add(candidate.Resolution.Index);
        for (var i = candidate.Start; i <= candidate.End; i++)
        {
            usedTokens.Add(i);
        }

        // Translate start & end from token positions to the tokens' character positions.
        candidate.Start = tokens[candidate.Start].Start;
        candidate.End = tokens[candidate.End].End;

        // Substring takes (start, length); End is treated as inclusive, hence the + 1.
        candidate.Text = utterance.Substring(candidate.Start, (candidate.End + 1) - candidate.Start);
        results.Add(candidate);
    }

    // Return the results sorted by position in the utterance.
    results.Sort((a, b) => a.Start - b.Start);
    return results;
}
/// <summary>
/// Attempts to match the tokens of a single value against the utterance tokens,
/// starting the search at <paramref name="startPos"/>, and scores the match.
/// </summary>
/// <param name="tokens">Tokens of the utterance being searched.</param>
/// <param name="maxDistance">Maximum number of utterance tokens that may be skipped between two consecutive matched value tokens.</param>
/// <param name="options">Search options; only <c>AllowPartialMatches</c> is read here.</param>
/// <param name="index">Index of the value, copied onto the result.</param>
/// <param name="value">Text of the value, copied onto the result.</param>
/// <param name="vTokens">Tokens of the value being searched for.</param>
/// <param name="startPos">Token position in <paramref name="tokens"/> at which to begin searching.</param>
/// <returns>
/// A result whose Start/End are token positions (the caller translates them and fills
/// in the text), or null when no token matched or when only some tokens matched and
/// partial matches are not allowed.
/// </returns>
private static ModelResult<FoundValue> MatchValue(List<Token> tokens, int maxDistance, FindValuesOptions options, int index, string value, List<Token> vTokens, int startPos)
{
    // Match value to utterance and calculate total deviation.
    // - The tokens are matched in order so "second last" will match in
    //   "the second from last one" but not in "the last from the second one".
    // - The total deviation is a count of the number of tokens skipped in the
    //   match so for the example above the number of tokens matched would be
    //   2 and the total deviation would be 1.
    var matched = 0;
    var totalDeviation = 0;
    var start = -1;
    var end = -1;
    foreach (var token in vTokens)
    {
        // Find the position of the token in the utterance. IndexOfToken is defined
        // elsewhere; a negative result is treated here as "not found".
        var pos = IndexOfToken(tokens, token, startPos);
        if (pos < 0)
        {
            continue;
        }

        // Distance between this token and the position right after the previously
        // matched token. The first matched token never contributes any deviation.
        var distance = matched > 0 ? pos - startPos : 0;
        if (distance > maxDistance)
        {
            continue;
        }

        // Update count of tokens matched and move the start pointer so the next
        // value token is searched for after the current one.
        matched++;
        totalDeviation += distance;
        startPos = pos + 1;

        // Track the span of the utterance that has been matched.
        if (start < 0)
        {
            start = pos;
        }

        end = pos;
    }

    // No usable match: nothing matched, or only some tokens matched while partial
    // matches are disallowed. (This also covers an empty vTokens list.)
    if (matched == 0 || (matched != vTokens.Count && !options.AllowPartialMatches))
    {
        return null;
    }

    // Percentage of tokens matched. If matching "second last" in
    // "the second from last one" the completeness would be 1.0 since
    // all tokens were found. Floating-point division is required: integer
    // division would collapse every partial match to 0.
    var completeness = (float)matched / vTokens.Count;

    // Accuracy of the match. The accuracy is reduced by utterance tokens that had
    // to be skipped between matched tokens. Matching "second last" in
    // "the second from last one" skips one token, giving 2 / (2 + 1) ~= 0.67.
    // As above, integer division would turn any deviation into an accuracy of 0.
    var accuracy = (float)matched / (matched + totalDeviation);

    // The final score is simply the completeness multiplied by the accuracy.
    // NOTE(review): a float converts implicitly if Score is declared as double - confirm.
    var score = completeness * accuracy;

    // Format result
    // - The start & end positions and the result's text field will be corrected by the caller.
    return new ModelResult<FoundValue>
    {
        Start = start,
        End = end,
        TypeName = "value",
        Resolution = new FoundValue
        {
            Value = value,
            Index = index,
            Score = score,
        },
    };
}