/// <summary>
/// Convenience constructor that does not customise how quoted (precise-match) values are broken into tokens; it chains to the overload
/// that accepts an optional token breaker, passing null for that argument.
/// </summary>
/// <param name="standardMatchIndexData">Index data passed through as the standard-match index.</param>
/// <param name="preciseMatchIndexData">Index data passed through as the precise-match index.</param>
/// <param name="matchCombiner">Match combiner passed through to the chained constructor.</param>
public QueryTranslator(IIndexData<TKey> standardMatchIndexData, IIndexData<TKey> preciseMatchIndexData, MatchCombiner matchCombiner)
    : this(
        standardMatchIndexData,
        preciseMatchIndexData,
        optionalQuotedValueConsecutiveTermTokenBreaker: null,
        matchCombiner: matchCombiner)
{
}
 /// <summary>
 /// Convenience constructor that allows the token breaker for quoted values to be specified but leaves both consecutive-match weight
 /// combiners unspecified; it chains to the full overload, passing null for the two weight combiner arguments.
 /// </summary>
 /// <param name="standardMatchIndexData">Index data passed through as the standard-match index.</param>
 /// <param name="preciseMatchIndexData">Index data passed through as the precise-match index.</param>
 /// <param name="optionalQuotedValueConsecutiveTermTokenBreaker">Optional token breaker (may be null), passed through unchanged.</param>
 /// <param name="matchCombiner">Match combiner passed through to the chained constructor.</param>
 public QueryTranslator(
     IIndexData<TKey> standardMatchIndexData,
     IIndexData<TKey> preciseMatchIndexData,
     ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
     MatchCombiner matchCombiner)
     : this(
         standardMatchIndexData,
         preciseMatchIndexData,
         optionalQuotedValueConsecutiveTermTokenBreaker,
         optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns: null,
         optionalQuotedValueConsecutiveWeightCombinerForFinalMatches: null,
         matchCombiner: matchCombiner)
 {
 }
        /// <summary>
        /// Full constructor. The standard index is queried through its GetMatches method and the precise index through GetConsecutiveMatches;
        /// both lookups are wrapped in a CachingResultMatcher. Any of the three "optional" arguments may be null, in which case the corresponding
        /// default exposed by IndexData_Extensions_ConsecutiveMatches is used when a precise lookup is performed.
        /// </summary>
        /// <exception cref="ArgumentNullException">If standardMatchIndexData, preciseMatchIndexData or matchCombiner is null.</exception>
        /// <exception cref="ArgumentException">If preciseMatchIndexData does not report that source locations are available.</exception>
        public QueryTranslator(
            IIndexData<TKey> standardMatchIndexData,
            IIndexData<TKey> preciseMatchIndexData,
            ITokenBreaker optionalQuotedValueConsecutiveTermTokenBreaker,
            IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns,
            IndexGenerator.WeightedEntryCombiner optionalQuotedValueConsecutiveWeightCombinerForFinalMatches,
            MatchCombiner matchCombiner)
        {
            if (standardMatchIndexData == null)
            {
                throw new ArgumentNullException(nameof(standardMatchIndexData));
            }
            if (preciseMatchIndexData == null)
            {
                throw new ArgumentNullException(nameof(preciseMatchIndexData));
            }
            if (matchCombiner == null)
            {
                throw new ArgumentNullException(nameof(matchCombiner));
            }

            // Consecutive (quoted value) matching works from token positions, so the precise index is unusable without source locations
            if (!preciseMatchIndexData.SourceLocationsAvailable)
            {
                throw new ArgumentException($"The {nameof(preciseMatchIndexData)} must include source location data in order to use the Query Translator");
            }

            // There is no reliable way to confirm that the two indexes use equivalent key comparers (they may be distinct instances of the same
            // implementation, or even different classes that behave identically) so the caller is trusted on this point. The comparer from the
            // standard index is the one used whenever keys are combined, excluded or otherwise processed for the query segments.
            _keyComparer = standardMatchIndexData.KeyComparer;
            _matchCombiner = matchCombiner;

            _standardMatcher = new CachingResultMatcher(standardMatchIndexData.GetMatches);

            // The fallbacks for the optional arguments are resolved at the point that each precise lookup is performed
            _preciseMatcher = new CachingResultMatcher(
                quotedValue => preciseMatchIndexData.GetConsecutiveMatches(
                    quotedValue,
                    optionalQuotedValueConsecutiveTermTokenBreaker ?? IndexData_Extensions_ConsecutiveMatches.DefaultTokenBreaker,
                    optionalQuotedValueConsecutiveWeightCombinerForConsecutiveRuns ?? IndexData_Extensions_ConsecutiveMatches.DefaultConsecutiveRunsWeightCombiner,
                    optionalQuotedValueConsecutiveWeightCombinerForFinalMatches ?? IndexData_Extensions_ConsecutiveMatches.DefaultFinalMatchWeightCombiner));
        }
Exemplo n.º 4
0
        /// <summary>
        /// Creates a Querier that wraps a QueryTranslator built from the given standard-match index, precise-match index and match combiner.
        /// </summary>
        /// <exception cref="ArgumentNullException">If any of the three arguments is null.</exception>
        public Querier(
            IIndexData<TKey> standardMatchIndexData,
            IIndexData<TKey> preciseMatchIndexData,
            QueryTranslator<TKey>.MatchCombiner matchCombiner)
        {
            // Validate here (rather than relying on the QueryTranslator constructor) so that null arguments are reported against this constructor
            if (standardMatchIndexData == null)
            {
                throw new ArgumentNullException(nameof(standardMatchIndexData));
            }
            if (preciseMatchIndexData == null)
            {
                throw new ArgumentNullException(nameof(preciseMatchIndexData));
            }
            if (matchCombiner == null)
            {
                throw new ArgumentNullException(nameof(matchCombiner));
            }

            _queryTranslator = new QueryTranslator<TKey>(standardMatchIndexData, preciseMatchIndexData, matchCombiner);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Deserialisation constructor: rebuilds the search index and the auto-complete content from the byte arrays stored in the
        /// SerializationInfo. A boolean flag in the serialised data indicates whether the search index bytes were written by the
        /// IndexDataSerialiser or must be read back through the general Deserialise helper.
        /// </summary>
        /// <exception cref="ArgumentNullException">If info is null.</exception>
        private PostIndexContent(SerializationInfo info, StreamingContext context)
        {
            if (info == null)
            {
                throw new ArgumentNullException(nameof(info));
            }

            var serialisedSearchIndex = (byte[])info.GetValue(SearchIndexName, typeof(byte[]));
            var searchIndexWasCustomSerialised = info.GetBoolean(SearchIndexIsCustomSerialisedName);
            if (!searchIndexWasCustomSerialised)
            {
                _searchIndex = Deserialise<IIndexData<int>>(serialisedSearchIndex);
            }
            else
            {
                using (var serialisedSearchIndexStream = new MemoryStream(serialisedSearchIndex))
                {
                    _searchIndex = IndexDataSerialiser<int>.Deserialise(serialisedSearchIndexStream);
                }
            }

            var serialisedAutoCompleteContent = (byte[])info.GetValue(AutoCompleteContentName, typeof(byte[]));
            AutoCompleteContent = Deserialise<NonNullOrEmptyStringList>(serialisedAutoCompleteContent);
        }
Exemplo n.º 6
0
        /// <summary>
        /// This will break down a source search term into words (according to the logic of the specified token breaker) and then return matches where the words were found in a run in a
        /// content section. Unlike GetPartialMatches it is not possible for an entry to be considered a match because it contains all of the terms in its content, the terms must be
        /// present in one content field, together, in the order in which they are present in the search term. This allows for similar behaviour to that intended for the
        /// ConsecutiveTokenCombiningTokenBreaker, but this offers greater performance (constructing a TernarySearchTreeDictionary to back an IndexData instance can be expensive on
        /// processing time to generate, and disk / memory space to store, the runs of tokens). This also has the benefit that there is no cap on the number of tokens that can be
        /// matched consecutively (a limit on this had to be decided at index generation time when using the ConsecutiveTokenCombiningTokenBreaker). There are two sets of weight
        /// combining calculations required; the first (handled by the weightCombinerForConsecutiveRuns) determines a weight for a run of consecutive tokens - each run is considered
        /// a single match, effectively. Each call to the first weight combiner will have as many weights to combine as there are search terms, so if the "source" value is broken
        /// down into three words by the tokenBreaker then the weightCombinerForConsecutiveRuns will always be called with sets of three weights. The second weight combination
        /// is performed when multiple matches for a particular result must be combined to give a final match weight for that result.
        ///
        /// If the token breaker produces no tokens at all from the source value then an empty set of matches is returned. If it produces exactly one token then the result of
        /// calling index.GetMatches with the unbroken source value is returned directly.
        ///
        /// Note: This requires the index to have been built with source location data recorded - if the index's SourceLocationsAvailable property returns false then an ArgumentException
        /// will be thrown.
        /// </summary>
        /// <param name="index">The index to search, it must report that SourceLocationsAvailable is true.</param>
        /// <param name="source">The search term that will be broken into tokens that must appear consecutively.</param>
        /// <param name="tokenBreaker">Breaks the source value into the individual tokens to look up.</param>
        /// <param name="weightCombinerForConsecutiveRuns">Combines the weights of the tokens in a single consecutive run into one weight.</param>
        /// <param name="weightCombinerForFinalMatches">Combines the weights of all of the runs found for a single key into one weight.</param>
        /// <returns>One entry per matched key, each with one source location per consecutive run that was found.</returns>
        /// <exception cref="ArgumentNullException">If any argument is null.</exception>
        /// <exception cref="ArgumentException">If the index does not have source location data available.</exception>
        public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(
            this IIndexData<TKey> index,
            string source,
            ITokenBreaker tokenBreaker,
            IndexGenerator.WeightedEntryCombiner weightCombinerForConsecutiveRuns,
            IndexGenerator.WeightedEntryCombiner weightCombinerForFinalMatches)
        {
            if (index == null)
            {
                throw new ArgumentNullException(nameof(index));
            }
            if (source == null)
            {
                throw new ArgumentNullException(nameof(source));
            }
            if (tokenBreaker == null)
            {
                throw new ArgumentNullException(nameof(tokenBreaker));
            }
            if (weightCombinerForConsecutiveRuns == null)
            {
                throw new ArgumentNullException(nameof(weightCombinerForConsecutiveRuns));
            }
            if (weightCombinerForFinalMatches == null)
            {
                throw new ArgumentNullException(nameof(weightCombinerForFinalMatches));
            }

            if (!index.SourceLocationsAvailable)
            {
                throw new ArgumentException($"The {nameof(index)} must include source location data in order to use identify Consecutive token matches");
            }

            var weightAdjustedTokens = tokenBreaker.Break(source);

            // If the source value contains nothing that the token breaker considers to be a token (a blank search term, for example) then there is nothing to look for.
            // Without this check, the attempt to read the matches for the first search term further down would fail as there would be no first search term.
            if (weightAdjustedTokens.Count == 0)
            {
                return NonNullImmutableList<WeightedEntry<TKey>>.Empty;
            }

            // If the token breaker won't actually translate the source value into multiple words then we can avoid all of the below work and just call index.GetMatches directly
            if (weightAdjustedTokens.Count == 1)
            {
                return index.GetMatches(source);
            }

            // The index of this list will correspond to the index of the broken-down search terms. Each match has the token's weight multiplier applied and is then split
            // so that every entry describes a single source location. The split is performed once per search term here, rather than being repeated for every candidate
            // run in the loop below.
            var singleLocationMatchesForSearchTerms = new List<WeightedEntry<TKey>[]>();
            foreach (var weightAdjustedToken in weightAdjustedTokens)
            {
                singleLocationMatchesForSearchTerms.Add(
                    index.GetMatches(weightAdjustedToken.Token)
                        .Select(w => new WeightedEntry<TKey>(
                            w.Key,
                            w.Weight * weightAdjustedToken.WeightMultiplier,
                            w.SourceLocationsIfRecorded
                        ))
                        .SelectMany(m => BreakWeightedEntryIntoIndividualSourceLocations<TKey>(m))
                        .ToArray()
                );
            }

            // For each match of the first search term, try to identify a run of token matches for the same key and source field. Any such runs will be recorded in the consecutiveMatches
            // list - these represent content segments that match the entirety of the search term (the "source" argument).
            var consecutiveMatches = new List<WeightedEntry<TKey>>();
            foreach (var firstTermMatch in singleLocationMatchesForSearchTerms[0])
            {
                // Note: SourceLocationsIfRecorded should never be null because we checked that the index reported that SourceLocationsAvailable was true (so not checking for null
                // source locations here)
                var firstTermLocation = firstTermMatch.SourceLocationsIfRecorded.First();
                var matchesForEntireTerm = NonNullImmutableList<WeightedEntry<TKey>>.Empty.Add(firstTermMatch);
                for (var termIndex = 1; termIndex < weightAdjustedTokens.Count; termIndex++)
                {
                    // The nth search term must be found in the same field of the same result, exactly n tokens after the first search term
                    var requiredTokenIndex = firstTermLocation.TokenIndex + termIndex;
                    var nTermMatch = singleLocationMatchesForSearchTerms[termIndex].FirstOrDefault(m =>
                    {
                        if (!index.KeyComparer.Equals(m.Key, firstTermMatch.Key))
                        {
                            return false;
                        }
                        var candidateLocation = m.SourceLocationsIfRecorded.First();
                        return (candidateLocation.SourceFieldIndex == firstTermLocation.SourceFieldIndex)
                            && (candidateLocation.TokenIndex == requiredTokenIndex);
                    });
                    if (nTermMatch == null)
                    {
                        break;
                    }
                    matchesForEntireTerm = matchesForEntireTerm.Add(nTermMatch);
                }
                if (matchesForEntireTerm.Count < weightAdjustedTokens.Count)
                {
                    // If we didn't manage to get a full set of search terms then this isn't a full match
                    continue;
                }

                // Combine the WeightedEntry instances that represent a run of individual matches (one for each word in the "source" argument) into a single WeightedEntry that represents
                // the entirety of the search term (each of the matchesForEntireTerm WeightedEntry instances will have only a single Source Location since the match data was split up
                // above by calling BreakWeightedEntryIntoIndividualSourceLocations before trying to find the consecutive matches). See notes above about not checking whether
                // SourceLocationsIfRecorded is null (it shouldn't be because we checked index.SourceLocationsAvailable at the top of this method)
                var sourceLocationOfFirstTerm = matchesForEntireTerm.First().SourceLocationsIfRecorded.Single();
                var sourceLocationOfLastTerm = matchesForEntireTerm.Last().SourceLocationsIfRecorded.Single();
                var matchWeightForConsecutiveRunEntry = weightCombinerForConsecutiveRuns(
                    matchesForEntireTerm.Select(m => m.Weight).ToImmutableList()
                );
                consecutiveMatches.Add(
                    new WeightedEntry<TKey>(
                        matchesForEntireTerm.First().Key,
                        matchWeightForConsecutiveRunEntry,
                        new NonNullImmutableList<SourceFieldLocation>(new[]
                        {
                            // Since we're creating a new SourceFieldLocation instance that is derived from a run of multiple tokens, the TokenIndex is going to be an approximation -
                            // taking the TokenIndex from the first search term probably makes the most sense. The SourceIndex and SourceTokenLength will be taken such that the entire
                            // run is covered (from the start of the first search term to the end of the last). Since this is the only Source Location instance for the WeightedEntry,
                            // its MatchWeightContribution value is equal to the WeightedEntry's Weight.
                            new SourceFieldLocation(
                                sourceLocationOfFirstTerm.SourceFieldIndex,
                                sourceLocationOfFirstTerm.TokenIndex,
                                sourceLocationOfFirstTerm.SourceIndex,
                                (sourceLocationOfLastTerm.SourceIndex + sourceLocationOfLastTerm.SourceTokenLength) - sourceLocationOfFirstTerm.SourceIndex,
                                matchWeightForConsecutiveRunEntry
                            )
                        })
                    )
                );
            }

            // The matches need grouping by key before returning
            return consecutiveMatches
                .GroupBy(m => m.Key, index.KeyComparer)
                .Cast<IEnumerable<WeightedEntry<TKey>>>()
                .Select(matches => new WeightedEntry<TKey>(
                    matches.First().Key,
                    weightCombinerForFinalMatches(
                        matches.Select(match => match.Weight).ToImmutableList()
                    ),
                    matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
                ))
                .ToNonNullImmutableList();
        }
Exemplo n.º 7
0
 /// <summary>
 /// Simplest GetConsecutiveMatches signature: only the index and the search term are required. The DefaultTokenBreaker is supplied as the token
 /// breaker and the call is handed on to the overload that takes an explicit token breaker.
 /// </summary>
 public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(this IIndexData<TKey> index, string source)
     => GetConsecutiveMatches<TKey>(index, source, DefaultTokenBreaker);
Exemplo n.º 8
0
 /// <summary>
 /// GetConsecutiveMatches signature for callers that want to control token breaking but not weight combining: the
 /// DefaultConsecutiveRunsWeightCombiner is passed as the combiner for each run of consecutive tokens and the
 /// DefaultFinalMatchWeightCombiner as the combiner for the final per-result matches.
 /// </summary>
 public static NonNullImmutableList<WeightedEntry<TKey>> GetConsecutiveMatches<TKey>(
     this IIndexData<TKey> index,
     string source,
     ITokenBreaker tokenBreaker)
     => GetConsecutiveMatches<TKey>(
         index,
         source,
         tokenBreaker,
         DefaultConsecutiveRunsWeightCombiner,
         DefaultFinalMatchWeightCombiner);
Exemplo n.º 9
0
 /// <summary>
 /// Creates a PostIndexContent from a search index and the auto-complete content, neither of which may be null.
 /// </summary>
 /// <exception cref="ArgumentNullException">If searchIndex or autoCompleteContent is null.</exception>
 public PostIndexContent(IIndexData<int> searchIndex, NonNullOrEmptyStringList autoCompleteContent)
 {
     if (searchIndex == null)
     {
         throw new ArgumentNullException(nameof(searchIndex));
     }
     if (autoCompleteContent == null)
     {
         throw new ArgumentNullException(nameof(autoCompleteContent));
     }

     _searchIndex = searchIndex;
     AutoCompleteContent = autoCompleteContent;
 }
Exemplo n.º 10
0
        /// <summary>
        /// This generates the "SearchIndex-SummaryDictionary.js" and "SearchIndex-{PostId}-CompleteDictionary.lz.txt" files that are used to perform the
        /// full text site search. The first file maps token matches onto Posts by Key, specifying the match Weight. It doesn't contain the source locations
        /// which map the token back onto the source content in order to keep the file size down. The per-Post "CompleteDictionary" files contain the
        /// mappings with source locations for a single Post (their content is LZString-compressed). These only need to be accessed once a Post has been
        /// identified as matching the search term(s). In order to display matched content, the source locations must be mapped onto the plain text
        /// content generated by the PlainTextContentRecorder. These two classes are very specific to my Blog site implementation.
        /// </summary>
        /// <param name="searchIndex">The index to export, it must have source location data available.</param>
        /// <param name="destination">An existing folder that the files will be written into.</param>
        /// <exception cref="ArgumentNullException">If searchIndex or destination is null.</exception>
        /// <exception cref="ArgumentException">If destination does not exist or if searchIndex has no source location data.</exception>
        public static void Write(IIndexData<int> searchIndex, DirectoryInfo destination)
        {
            if (searchIndex == null)
            {
                throw new ArgumentNullException(nameof(searchIndex));
            }
            if (destination == null)
            {
                throw new ArgumentNullException(nameof(destination));
            }
            destination.Refresh();
            if (!destination.Exists)
            {
                throw new ArgumentException("destination does not exist");
            }

            // The per-Post files are built from the source locations of every match - fail clearly up front if they were not recorded, rather than
            // with a null reference part way through writing the files
            if (!searchIndex.SourceLocationsAvailable)
            {
                throw new ArgumentException($"The {nameof(searchIndex)} must include source location data in order to write the per-Post detail files");
            }

            // Get Search Index Data
            // - Generate "SearchIndex-SummaryDictionary.js"
            // - Generate all of "SearchIndex-{0}-CompleteDictionary.lz.txt"

            // Translate into combined detail data for all Posts. This is read more than once below (for the summary data and again for the per-Post
            // data) so it is fully evaluated here - otherwise the index would be queried and the data rebuilt on every enumeration.
            var matchData = searchIndex.GetAllTokens()
                .Select(token => new JsTokenMatch
                {
                    t = token,
                    l = searchIndex.GetMatches(token)
                        .Select(weightedEntry => new JsSourceLocation
                        {
                            k = weightedEntry.Key,
                            w = weightedEntry.Weight,
                            l = weightedEntry.SourceLocationsIfRecorded
                                .Select(sourceLocation => new JsSourceLocationDetail
                                {
                                    f = sourceLocation.SourceFieldIndex,
                                    w = sourceLocation.MatchWeightContribution,
                                    t = sourceLocation.TokenIndex,
                                    i = sourceLocation.SourceIndex,
                                    l = sourceLocation.SourceTokenLength
                                })
                                .ToArray()
                        })
                        .ToArray()
                })
                .ToArray();

            // The all-Post Summary data is going to be an associative array of token to Key/Weight matches (no Source Location data). This won't be
            // compressed so that the initial searching can be as quick as possible (the trade-off between valuable space at NeoCities hosting vs the
            // speed of native compression - ie. the gzip that happens over the wire but that doesn't benefit the backend storage - is worth it)
            var allPostsSummaryDictionary = matchData.ToDictionary(
                tokenMatch => tokenMatch.t,
                tokenMatch => tokenMatch.l.Select(weightedEntry => new JsSourceLocation
                {
                    k = weightedEntry.k,
                    w = weightedEntry.w
                })
            );
            var summaryFilename = "SearchIndex-SummaryDictionary.js";

            Console.WriteLine("Writing " + summaryFilename);
            File.WriteAllText(
                Path.Combine(destination.FullName, summaryFilename),
                SerialiseToJson(allPostsSummaryDictionary),
                new UTF8Encoding()
            );

            // The per-Post Detail data is going to be an associative array of token to Key/Weight matches (with Source Location) but only a single
            // Key will appear in each dictionary. This data WILL be compressed since it takes up a lot of space considering the NeoCities limits.
            // Each Post's token matches are accumulated in a list (appending to a list is cheap, unlike repeatedly wrapping a sequence in Concat).
            var perPostData = new Dictionary<int, List<JsTokenMatch>>();
            foreach (var entry in matchData)
            {
                foreach (var result in entry.l)
                {
                    if (!perPostData.TryGetValue(result.k, out var tokenMatchesForPost))
                    {
                        tokenMatchesForPost = new List<JsTokenMatch>();
                        perPostData.Add(result.k, tokenMatchesForPost);
                    }
                    tokenMatchesForPost.Add(new JsTokenMatch
                    {
                        t = entry.t,
                        l = new[] { result }
                    });
                }
            }
            foreach (var postIdAndTokenMatches in perPostData)
            {
                var detailFilename = "SearchIndex-" + postIdAndTokenMatches.Key + "-CompleteDictionary.lz.txt";
                Console.WriteLine("Writing " + detailFilename);
                File.WriteAllText(
                    Path.Combine(destination.FullName, detailFilename),
                    LZStringCompress.CompressToUTF16(
                        SerialiseToJson(
                            postIdAndTokenMatches.Value.ToDictionary(
                                entry => entry.t,
                                entry => entry.l
                            )
                        )
                    ),
                    new UTF8Encoding()
                );
            }
        }
 /// <summary>
 /// Convenience constructor for callers that only have the two indexes to provide; it chains to the overload that accepts a match combiner,
 /// passing DefaultMatchCombiner for that argument.
 /// </summary>
 /// <param name="standardMatchIndexData">Index data passed through as the standard-match index.</param>
 /// <param name="preciseMatchIndexData">Index data passed through as the precise-match index.</param>
 public QueryTranslator(IIndexData<TKey> standardMatchIndexData, IIndexData<TKey> preciseMatchIndexData)
     : this(
         standardMatchIndexData,
         preciseMatchIndexData,
         matchCombiner: DefaultMatchCombiner)
 {
 }
Exemplo n.º 12
0
        /// <summary>
        /// Breaks the source string into tokens, looks each token up in the index and combines the per-token matches for each result key into a single
        /// weighted entry. Results that match only some of the tokens may therefore be returned. The token breaker and the weightCombiner must both be
        /// specified by the caller - if the weightCombiner returns zero for a result then that result is left out of the returned data (it must never
        /// return a negative value). To require that all tokens in the source content be present for any returned results, the following weightCombiner
        /// could be specified:
        ///  (tokenMatches, allTokens) => (tokenMatches.Count &lt; allTokens.Count) ? 0 : tokenMatches.Sum(m => m.Weight)
        /// </summary>
        /// <exception cref="ArgumentNullException">If any argument is null.</exception>
        /// <exception cref="ArgumentException">If the weightCombiner returns a negative value.</exception>
        public static NonNullImmutableList<WeightedEntry<TKey>> GetPartialMatches<TKey>(
            this IIndexData<TKey> index,
            string source,
            ITokenBreaker tokenBreaker,
            WeightCombiner weightCombiner)
        {
            if (index == null)
            {
                throw new ArgumentNullException(nameof(index));
            }
            if (source == null)
            {
                throw new ArgumentNullException(nameof(source));
            }
            if (tokenBreaker == null)
            {
                throw new ArgumentNullException(nameof(tokenBreaker));
            }
            if (weightCombiner == null)
            {
                throw new ArgumentNullException(nameof(weightCombiner));
            }

            // Look up every token that the source value is broken into, pairing each match with details of the search term that produced it (the position
            // of the token within the broken-down source value and the token text itself). For a source value that is broken into three tokens, the
            // position will be between 0 and 2, inclusive. This lets a weightCombiner check which of the search terms were matched for a given result.
            // NOTE(review): the WeightMultiplier of each broken-down token is not applied to the match weights here - confirm that this is intended.
            var brokenDownTerms = tokenBreaker.Break(source);
            var matchesWithTermDetails = new List<Tuple<WeightedEntry<TKey>, SearchTermDetails>>();
            var nextTermPosition = 0;
            foreach (var brokenDownTerm in brokenDownTerms)
            {
                var termPosition = nextTermPosition;
                foreach (var match in index.GetMatches(brokenDownTerm.Token))
                {
                    matchesWithTermDetails.Add(Tuple.Create(match, new SearchTermDetails(termPosition, brokenDownTerm.Token)));
                }
                nextTermPosition++;
            }

            // Group the per-term matches by result key and ask the weightCombiner for a single weight for each group. The combiner doubles as a filter;
            // a group for which it returns zero is dropped (this could be used to drop results that match only two out of three search terms, say).
            var allSearchTerms = new NonNullOrEmptyStringList(brokenDownTerms.Select(term => term.Token));
            var combinedEntries = new List<WeightedEntry<TKey>>();
            var matchesByKey = matchesWithTermDetails
                .GroupBy(m => m.Item1.Key, index.KeyComparer)
                .Cast<IEnumerable<Tuple<WeightedEntry<TKey>, SearchTermDetails>>>();
            foreach (var matchesForKey in matchesByKey)
            {
                var weightsToCombine = matchesForKey
                    .Select(m => new MatchWeightWithSourceFieldLocations(m.Item1.Weight, m.Item2, m.Item1.SourceLocationsIfRecorded))
                    .ToNonNullImmutableList();
                var weightForKey = weightCombiner(weightsToCombine, allSearchTerms);
                if (weightForKey < 0)
                {
                    throw new ArgumentException("weightCombiner returned a negative value - invalid");
                }
                if (!(weightForKey > 0))
                {
                    continue;
                }

                // Source locations can only be reported for the combined entry if every contributing match has them
                NonNullImmutableList<SourceFieldLocation> sourceLocationsForKey;
                if (matchesForKey.Any(m => m.Item1.SourceLocationsIfRecorded == null))
                {
                    sourceLocationsForKey = null;
                }
                else
                {
                    sourceLocationsForKey = matchesForKey.SelectMany(m => m.Item1.SourceLocationsIfRecorded).ToNonNullImmutableList();
                }
                combinedEntries.Add(
                    new WeightedEntry<TKey>(matchesForKey.First().Item1.Key, weightForKey, sourceLocationsForKey)
                );
            }
            return combinedEntries.ToNonNullImmutableList();
        }