예제 #1
0
        /// <summary>
        /// Looks for instances of query terms within the given transcript text.
        /// </summary>
        /// <param name="queryTerms">A string of query terms separated by whitespace.</param>
        /// <param name="transcriptText">The segment's transcript text.</param>
        /// <returns>
        /// A list of MatchTerm instances ordered by start offset. The list is empty (never null) when either
        /// argument is null or blank, or when nothing matches.
        /// </returns>
        /// <remarks>
        /// Wildcard after a prefix (term*), negated term (-term) and phrase search ("phraseword1 phraseword2") are all supported.
        /// Phrases are matched directly against the transcript text with a regular expression; all other terms are
        /// matched by comparing the tokens returned by azureSearch.AnalyzeText for the query and for the transcript.
        /// At most one match is reported per transcript start offset.
        /// </remarks>
        private async Task<List<MatchTerm>> FindMatches(string queryTerms, string transcriptText)
        {
            var matches = new List<MatchTerm>();

            // If query and/or transcript are empty, no work is needed: there are no matches.
            if (string.IsNullOrWhiteSpace(queryTerms) || string.IsNullOrWhiteSpace(transcriptText))
            {
                return matches;
            }

            // In the Lucene-based simple query parser syntax, "-" at the start of a term negates that term,
            // e.g., buffalo -soldier matches buffalo and not soldier (with default and'ing of terms active).
            // Terms starting with - are not analyzed (they won't match anyway, and this saves a bit of work).
            //
            // A term ending with * is a prefix search, e.g., buff* matches buffalo, buffet, etc.
            //
            // Terms within a phrase are not tokenized and expanded; the phrase itself is matched in the transcript.
            var prefixTerms = new List<string>();
            var phraseTerms = new List<string>();
            string cleanedQueryTerms;

            if (!queryTerms.Contains("*") && !queryTerms.Contains("-") && !queryTerms.Contains("\""))
            {
                // Quick path: no prefix wildcards (*), no negated terms (-) and no phrases (").
                cleanedQueryTerms = queryTerms;
            }
            else
            {
                // TODO: By being very literal with phrase search, we will match "real union hall" if those 3 words are in order in a transcript,
                // and will not match "real 7union hall" if that phrase is not, but we mess up on whatever Azure Search is doing with a query
                // like "real &union hall" - more tuning would be necessary to accommodate unusual input cases.
                string remainingQuery = HandlePhraseExtractionWithNegation(queryTerms, phraseTerms);

                // TODO: Not sure what to do about the operator characters +|() ...for now, treating them as query term separators.
                char[] termSplitters = { ' ', '\t', '\n', '\r', '+', '|', '(', ')' };

                var plainTerms = new List<string>();
                foreach (var term in remainingQuery.Split(termSplitters, StringSplitOptions.RemoveEmptyEntries))
                {
                    if (term.StartsWith("-", StringComparison.Ordinal))
                    {
                        continue; // negated term: not considered further
                    }

                    if (term.EndsWith("*", StringComparison.Ordinal))
                    {
                        prefixTerms.Add(term.Substring(0, term.Length - 1)); // keep the term without its ending "*" as a prefix to match later
                    }
                    else
                    {
                        plainTerms.Add(term);
                    }
                }

                cleanedQueryTerms = string.Join(" ", plainTerms);
            }

            // All matches, keyed and sorted by their start offset in the transcript.  Phrase matches and analyzed
            // tokens share this one key space so that they cannot collide spuriously and the result order is consistent.
            var tokens = new SortedList<int, TokenInfo>();

            // Phrase matches are collected first, so a phrase wins over a single term starting at the same offset.
            foreach (var phrase in phraseTerms)
            {
                string phrasePattern = BuildPhrasePattern(phrase);
                if (phrasePattern == null)
                {
                    continue; // nothing to match; an empty pattern would otherwise match at every position
                }

                foreach (Match match in Regex.Matches(transcriptText, phrasePattern, RegexOptions.IgnoreCase))
                {
                    // End offset is inclusive: the index of the last character of the matched phrase.
                    AddTokenIfNew(tokens, new TokenInfo(match.Value, match.Index, match.Index + match.Length - 1));
                }
            }

            // NOTE: there might only be a phrase search; if so, there are no prefix terms and no plain terms,
            // and hence no need to tokenize the transcript.  Only tokenize if there are follow-up steps.
            if (prefixTerms.Count > 0 || cleanedQueryTerms.Length > 0)
            {
                AnalyzeResult transcriptAnalysis = await azureSearch.AnalyzeText(transcriptText);

                // Each word in the transcript may contain multiple tokens, for instance
                // running becomes both running and run.  Colored becomes colored, color, and colour.
                // This dictionary aggregates like tokens together for faster recall.
                var tokensByText = new Dictionary<string, List<TokenInfo>>();
                foreach (var transcriptToken in transcriptAnalysis.Tokens)
                {
                    List<TokenInfo> sameText;
                    if (!tokensByText.TryGetValue(transcriptToken.Token, out sameText))
                    {
                        sameText = new List<TokenInfo>();
                        tokensByText.Add(transcriptToken.Token, sameText);
                    }
                    sameText.Add(transcriptToken);
                }

                if (cleanedQueryTerms.Length > 0)
                {
                    AnalyzeResult queryTermsAnalysis = await azureSearch.AnalyzeText(cleanedQueryTerms);

                    // We only care about where a match occurred, so additional word forms
                    // matching at an already-recorded offset are ignored.
                    foreach (var queryToken in queryTermsAnalysis.Tokens)
                    {
                        List<TokenInfo> occurrences;
                        if (tokensByText.TryGetValue(queryToken.Token, out occurrences))
                        {
                            foreach (TokenInfo occurrence in occurrences)
                            {
                                AddTokenIfNew(tokens, occurrence);
                            }
                        }
                    }
                }

                // Also find all the prefix matches.  The comparison is ordinal and case-insensitive so that the
                // result does not depend on the current culture or on how the user capitalized the prefix.
                foreach (var prefix in prefixTerms)
                {
                    foreach (var entry in tokensByText)
                    {
                        if (entry.Key.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
                        {
                            foreach (TokenInfo occurrence in entry.Value)
                            {
                                AddTokenIfNew(tokens, occurrence);
                            }
                        }
                    }
                }
            }

            // Convert the sorted tokens into the proper return type.
            foreach (var token in tokens.Values)
            {
                matches.Add(new MatchTerm
                {
                    StartOffset = token.StartOffset ?? 0,
                    EndOffset   = token.EndOffset ?? 0
                });
            }

            return matches;
        }

        /// <summary>
        /// Builds the regular expression used to find a phrase in transcript text: the phrase's words, in order,
        /// separated by one or more non-word characters and not immediately followed by a word character.
        /// </summary>
        /// <param name="phrase">The phrase, with its words separated by spaces.</param>
        /// <returns>The pattern, or null when the phrase is null or contains no words.</returns>
        private static string BuildPhrasePattern(string phrase)
        {
            if (phrase == null)
            {
                return null;
            }

            char[] phraseTermSplitters = { ' ' }; // phrases are stored with their words separated by spaces
            string[] phraseWords = phrase.Split(phraseTermSplitters, StringSplitOptions.RemoveEmptyEntries);
            if (phraseWords.Length == 0)
            {
                return null;
            }

            // The words come from user input: escape them so regex metacharacters are matched literally.
            for (int i = 0; i < phraseWords.Length; i++)
            {
                phraseWords[i] = Regex.Escape(phraseWords[i]);
            }

            // The trailing lookahead requires the last word to end there without consuming the separator,
            // so a phrase at the very end of the text still matches and the match spans only the phrase itself.
            return string.Join(@"\W+", phraseWords) + @"(?!\w)";
        }

        /// <summary>
        /// Records a token under its start offset unless a match is already recorded at that offset.
        /// </summary>
        /// <param name="tokens">The matches collected so far, keyed by start offset.</param>
        /// <param name="token">The token to record; a missing start offset is treated as 0.</param>
        private static void AddTokenIfNew(SortedList<int, TokenInfo> tokens, TokenInfo token)
        {
            int startOffset = token.StartOffset ?? 0;

            // SortedList.Add throws on a duplicate key, so check first; the first match at an offset wins.
            if (!tokens.ContainsKey(startOffset))
            {
                tokens.Add(startOffset, token);
            }
        }