/// <summary>
/// Looks for instances of query terms within the given transcript text.
/// </summary>
/// <param name="queryTerms">A string of query terms separated by whitespace.</param>
/// <param name="transcriptText">The segment's transcript text.</param>
/// <returns>
/// A list of MatchTerm instances ordered by start offset. The list is empty (never null) when the query
/// or the transcript is blank, or when nothing matches.
/// </returns>
/// <remarks>
/// Wildcard after a prefix (term*), negated term (-term) and phrase search ("phraseword1 phraseword2") are all supported.
/// </remarks>
private async Task<List<MatchTerm>> FindMatches(string queryTerms, string transcriptText)
{
    var matches = new List<MatchTerm>();

    // If query and/or transcript are empty, no work is needed: there are no matches.
    if (string.IsNullOrWhiteSpace(queryTerms) || string.IsNullOrWhiteSpace(transcriptText))
    {
        return matches;
    }

    // In the Lucene-based simple query parser syntax, "-" at the start of a term negates that term,
    // e.g., buffalo -soldier matches buffalo and not soldier (with default and'ing of terms active).
    // Negated terms are not analyzed at all (they would not match anyway, so this saves a bit of work).
    //
    // More importantly, if a term ends with *, do a prefix search, e.g., buff* matches buffalo, buffet, etc.
    // Prefix searching is handled in its own loop below.
    //
    // Also, terms within a phrase are not tokenized and expanded; the phrase is matched directly in the transcript.
    var prefixTerms = new List<string>();
    var phraseTerms = new List<string>();
    string cleanedQueryTerms;

    if (!queryTerms.Contains("*") && !queryTerms.Contains("-") && !queryTerms.Contains("\""))
    {
        // Quick check: no prefix wildcard (*), no negated term (-) and no phrase ("), so use the query as is.
        cleanedQueryTerms = queryTerms;
    }
    else
    {
        // Process the query with its potential use of negated search, prefix/wildcard search, and/or phrase search.
        // TODO: By being very literal with phrase search, we will match "real union hall" if those 3 words are in order
        // in a transcript, and will not match "real 7union hall" if that phrase is not, but we mess up on whatever
        // Azure Search is doing with a query like "real &union hall" - more tuning would be necessary to accommodate
        // unusual input cases.
        queryTerms = HandlePhraseExtractionWithNegation(queryTerms, phraseTerms);

        // TODO: Not sure what to do about the operator characters +|() ...for now, treating them as query term separators.
        char[] termSplitters = { ' ', '\t', '\n', '\r', '+', '|', '(', ')' };
        var plainTerms = new List<string>();
        foreach (var term in queryTerms.Split(termSplitters, StringSplitOptions.RemoveEmptyEntries))
        {
            if (term.StartsWith("-", StringComparison.Ordinal))
            {
                continue; // do not consider a negated term further
            }

            if (term.EndsWith("*", StringComparison.Ordinal))
            {
                // Keep the term without its ending "*" as a prefix to match later.
                prefixTerms.Add(term.Substring(0, term.Length - 1));
            }
            else
            {
                plainTerms.Add(term);
            }
        }

        cleanedQueryTerms = string.Join(" ", plainTerms);
    }

    // All matches (phrase, plain term and prefix) are collected here keyed by their start offset in the transcript,
    // so that they come out in text order and so that one key space is shared by every kind of match.
    var tokens = new SortedList<int, TokenInfo>();

    // Find all the phrase matches.
    // Based on how phrases are put together (in HandlePhraseExtractionWithNegation) the splitting is kept simple.
    char[] phraseWordSplitters = { ' ' };
    foreach (var phrase in phraseTerms)
    {
        // TODO: This matching is likely still more strict than what Azure Search allows.
        // What is done here: take the phrase, break it into individual words, and use a regular expression to match
        // those words in order, separated by one or more non-word characters (\W+).
        string[] phraseWords = phrase.Split(phraseWordSplitters, StringSplitOptions.RemoveEmptyEntries);
        if (phraseWords.Length == 0)
        {
            // An empty pattern would "match" at every position of the transcript.
            continue;
        }

        // Phrase words are user input: escape them so regex metacharacters are matched literally.
        for (int i = 0; i < phraseWords.Length; i++)
        {
            phraseWords[i] = Regex.Escape(phraseWords[i]);
        }

        // The look-arounds require the phrase to start and end on a word boundary without consuming the neighbouring
        // characters, so a phrase at the very end of the transcript matches and the match span covers the phrase only.
        string phrasePattern = @"(?<!\w)" + string.Join(@"\W+", phraseWords) + @"(?!\w)";

        foreach (Match match in Regex.Matches(transcriptText, phrasePattern, RegexOptions.IgnoreCase))
        {
            // End offset is one past the last matched character.
            // NOTE(review): presumably this is the same convention as the analyzer tokens' EndOffset - confirm.
            AddTokenIfNew(tokens, new TokenInfo(match.Value, match.Index, match.Index + match.Length));
        }
    }

    // NOTE: there might only be a phrase search; if so, there are no prefix terms and no terms to search,
    // and hence no need to tokenize the transcript. Only tokenize if there are follow-up steps.
    if (prefixTerms.Count > 0 || cleanedQueryTerms.Length > 0)
    {
        AnalyzeResult transcriptAnalysis = await azureSearch.AnalyzeText(transcriptText);

        // Each word in the transcript may contain multiple tokens, for instance
        // running becomes both running and run. Colored becomes colored, color, and colour.
        // This dictionary aggregates like tokens together for faster recall.
        var tokensByText = new Dictionary<string, List<TokenInfo>>();
        foreach (var transcriptToken in transcriptAnalysis.Tokens)
        {
            List<TokenInfo> sameTextTokens;
            if (!tokensByText.TryGetValue(transcriptToken.Token, out sameTextTokens))
            {
                sameTextTokens = new List<TokenInfo>();
                tokensByText.Add(transcriptToken.Token, sameTextTokens);
            }

            sameTextTokens.Add(transcriptToken);
        }

        if (cleanedQueryTerms.Length > 0)
        {
            AnalyzeResult queryTermsAnalysis = await azureSearch.AnalyzeText(cleanedQueryTerms);

            // Find all the transcript tokens which match the query terms. We only care about where the match
            // occurred, so additional word forms that match at an already recorded location are ignored.
            foreach (var queryToken in queryTermsAnalysis.Tokens)
            {
                List<TokenInfo> hits;
                if (tokensByText.TryGetValue(queryToken.Token, out hits))
                {
                    foreach (TokenInfo hit in hits)
                    {
                        AddTokenIfNew(tokens, hit);
                    }
                }
            }
        }

        // Also find all the prefix matches.
        foreach (var prefix in prefixTerms)
        {
            foreach (var entry in tokensByText)
            {
                // The prefix is raw (un-analyzed) user input, so compare without regard to case.
                // NOTE(review): assumes the analyzer's tokens are case-normalized - confirm for the analyzer in use.
                if (entry.Key.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
                {
                    foreach (TokenInfo hit in entry.Value)
                    {
                        AddTokenIfNew(tokens, hit);
                    }
                }
            }
        }
    }

    // Convert the sorted tokens into the proper return type.
    foreach (var entry in tokens)
    {
        matches.Add(new MatchTerm
        {
            StartOffset = entry.Value.StartOffset ?? 0,
            EndOffset = entry.Value.EndOffset ?? 0
        });
    }

    return matches;
}

/// <summary>
/// Records a matching token unless a match starting at the same transcript offset has already been recorded.
/// </summary>
/// <param name="tokens">The matches found so far, keyed by start offset.</param>
/// <param name="token">The candidate match to record.</param>
private static void AddTokenIfNew(SortedList<int, TokenInfo> tokens, TokenInfo token)
{
    int startOffset = token.StartOffset ?? 0;
    if (!tokens.ContainsKey(startOffset))
    {
        tokens.Add(startOffset, token);
    }
}