Ejemplo n.º 1
0
        /// <summary>
        /// Searches an utterance for occurrences of the supplied values and returns the
        /// best-scoring matches that neither repeat a value index nor overlap each other.
        /// </summary>
        /// <param name="utterance">The text to search.</param>
        /// <param name="values">The candidate values to look for. The list is copied before sorting, so the caller's list is left in its original order.</param>
        /// <param name="options">
        /// Optional search settings. When null a default <c>FindValuesOptions</c> is used; a missing
        /// tokenizer falls back to <c>Tokenizer.DefaultTokenizer</c> and a missing
        /// <c>MaxTokenDistance</c> falls back to 2.
        /// </param>
        /// <returns>
        /// The accepted matches ordered by their start position in the utterance. On return each match's
        /// <c>Start</c>/<c>End</c> have been translated from token positions to the character positions
        /// reported by the tokenizer, and <c>Text</c> holds the corresponding substring of the utterance
        /// (<c>End</c> is treated as inclusive).
        /// </returns>
        public static List <ModelResult <FoundValue> > FindValues(string utterance, List <SortedValue> values, FindValuesOptions options = null)
        {
            // Sort a copy of the values in descending order by length so that the longest value is
            // searched over first. Sorting a copy avoids reordering the caller's list as a side effect.
            var list = new List <SortedValue>(values);
            list.Sort((a, b) => b.Value.Length - a.Value.Length);

            // Resolve options and tokenize the utterance once up front.
            var opt         = options ?? new FindValuesOptions();
            var tokenizer   = opt.Tokenizer ?? Tokenizer.DefaultTokenizer;
            var tokens      = tokenizer(utterance, opt.Locale);
            var maxDistance = opt.MaxTokenDistance ?? 2;

            // Search for each value within the utterance.
            var matches = new List <ModelResult <FoundValue> >();
            foreach (var entry in list)
            {
                // Find all matches for a value
                // - To match "last one" in "the last time I chose the last one" we need
                //   to re-search the string starting from the end of the previous match.
                // - The start & end position returned for the match are token positions.
                var startPos = 0;
                var vTokens  = tokenizer(entry.Value.Trim(), opt.Locale);
                while (startPos < tokens.Count)
                {
                    var match = MatchValue(tokens, maxDistance, opt, entry.Index, entry.Value, vTokens, startPos);
                    if (match == null)
                    {
                        break;
                    }

                    matches.Add(match);
                    startPos = match.End + 1;
                }
            }

            // Sort matches by score descending.
            // - Scores are fractional, so subtracting and casting to int would truncate almost every
            //   difference to 0 and leave the list effectively unsorted. CompareTo orders them properly.
            matches.Sort((a, b) => b.Resolution.Score.CompareTo(a.Resolution.Score));

            // Filter out duplicate matching indexes and overlapping tokens.
            // - The start & end positions are token positions and need to be translated to
            //   character positions before returning. We also need to populate the "text"
            //   field as well.
            var results      = new List <ModelResult <FoundValue> >();
            var foundIndexes = new HashSet <int>();
            var usedTokens   = new HashSet <int>();

            foreach (var match in matches)
            {
                // Reject a match whose value index was already accepted...
                var add = !foundIndexes.Contains(match.Resolution.Index);

                // ...or that covers a token already claimed by a higher-ranked match.
                for (var i = match.Start; add && i <= match.End; i++)
                {
                    if (usedTokens.Contains(i))
                    {
                        add = false;
                    }
                }

                if (!add)
                {
                    continue;
                }

                // Update filter info
                foundIndexes.Add(match.Resolution.Index);
                for (var i = match.Start; i <= match.End; i++)
                {
                    usedTokens.Add(i);
                }

                // Translate start & end from token positions to character positions and populate the text field.
                match.Start = tokens[match.Start].Start;
                match.End   = tokens[match.End].End;

                // Note: .NET Substring takes (start, length), so the inclusive end is converted to a length.
                match.Text = utterance.Substring(match.Start, (match.End + 1) - match.Start);
                results.Add(match);
            }

            // Return the results sorted by position in the utterance
            results.Sort((a, b) => a.Start - b.Start);
            return results;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Tries to match the tokens of a single value against the utterance tokens, starting at
        /// <paramref name="startPos"/>, and scores the match.
        /// </summary>
        /// <param name="tokens">The tokenized utterance.</param>
        /// <param name="maxDistance">Maximum number of utterance tokens that may be skipped between two consecutively matched value tokens.</param>
        /// <param name="options">Search options; only <c>AllowPartialMatches</c> is read here.</param>
        /// <param name="index">Index of the value, copied into the result's <c>Resolution.Index</c>.</param>
        /// <param name="value">The value text, copied into the result's <c>Resolution.Value</c>.</param>
        /// <param name="vTokens">The tokenized value.</param>
        /// <param name="startPos">Token position in <paramref name="tokens"/> at which to start searching.</param>
        /// <returns>
        /// A result whose <c>Start</c>/<c>End</c> are token positions (the caller translates them and fills
        /// in the text), or null when no token matched or when only some tokens matched and partial
        /// matches are not allowed.
        /// </returns>
        private static ModelResult <FoundValue> MatchValue(List <Token> tokens, int maxDistance, FindValuesOptions options, int index, string value, List <Token> vTokens, int startPos)
        {
            // Match value to utterance and calculate total deviation.
            // - The tokens are matched in order so "second last" will match in
            //   "the second from last one" but not in "the last from the second one".
            // - The total deviation is a count of the number of tokens skipped in the
            //   match so for the example above the number of tokens matched would be
            //   2 and the total deviation would be 1.
            var matched        = 0;
            var totalDeviation = 0;
            var start          = -1;
            var end            = -1;

            foreach (var token in vTokens)
            {
                // Find the position of the token in the utterance.
                // NOTE(review): IndexOfToken is defined elsewhere; a negative result is treated as "not found".
                var pos = IndexOfToken(tokens, token, startPos);
                if (pos < 0)
                {
                    continue;
                }

                // Number of utterance tokens skipped since the previously matched token
                // (startPos points just past it). The first matched token has no deviation.
                var distance = matched > 0 ? pos - startPos : 0;
                if (distance > maxDistance)
                {
                    continue;
                }

                // Update count of tokens matched and move start pointer to search for next token after
                // the current token.
                matched++;
                totalDeviation += distance;
                startPos        = pos + 1;

                // Update start & end position that will track the span of the utterance that's matched.
                if (start < 0)
                {
                    start = pos;
                }

                end = pos;
            }

            // Nothing usable: no token matched, or only some did and partial matches are disallowed.
            if (matched == 0 || (matched != vTokens.Count && !options.AllowPartialMatches))
            {
                return null;
            }

            // Percentage of tokens matched. If matching "second last" in
            // "the second from last one" the completeness would be 1.0 since
            // all tokens were found.
            // - Cast to float first: integer division would collapse every partial match to 0.
            var completeness = (float)matched / vTokens.Count;

            // Accuracy of the match. The accuracy is reduced by utterance tokens that had to be
            // skipped between matched tokens. Matching "second last" in "the second from last one"
            // skips one token ("from"), giving an accuracy of 2 / (2 + 1), roughly 0.67.
            var accuracy = (float)matched / (matched + totalDeviation);

            // The final score is simply the completeness multiplied by the accuracy.
            var score = completeness * accuracy;

            // Format result
            // - The start & end positions and the results text field will be corrected by the caller.
            return new ModelResult <FoundValue>
            {
                Start      = start,
                End        = end,
                TypeName   = "value",
                Resolution = new FoundValue
                {
                    Value = value,
                    Index = index,
                    Score = score
                }
            };
        }