private AutomatedIndexGeneratorFactory<TSource, TKey>.WeightDeterminerGenerator GetDefaultTokenWeightDeterminerGenerator(IStringNormaliser stringNormaliser)
        {
            if (stringNormaliser == null)
            {
                throw new ArgumentNullException("stringNormaliser");
            }

            // Constructing a HashSet of the normalised versions of the stop words means that looking up whether normalised tokens are stop
            // words can be a lot faster (as neither the stop words nor the token need to be fed through the normaliser again)
            var hashSetOfNormalisedStopWords = new HashSet<string>(
                Constants.GetStopWords("en").Select(word => stringNormaliser.GetNormalisedString(word))
                );

            return property =>
            {
                // Reverse the propertyWeightAppliers so that later values added to the set take precedence (eg. if, for some reason, a x5 weight is
                // given to a property and then later it's set to be ignored, then we want to ignore it - which this will achieve)
                var propertyWeightApplier = _propertyWeightAppliers.Reverse().FirstOrDefault(p => p.AppliesTo(property));
                if ((propertyWeightApplier != null) && (propertyWeightApplier.WeightMultiplier == 0))
                {
                    // A weight multiplier of zero means ignore this property, as does returning null from a WeightDeterminerGenerator call
                    return null;
                }

                var weightMultiplier = (propertyWeightApplier != null) ? propertyWeightApplier.WeightMultiplier : 1;
                return normalisedToken => weightMultiplier * (hashSetOfNormalisedStopWords.Contains(normalisedToken) ? 0.01f : 1f);
            };
        }
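
The stop-word handling above deserves a concrete illustration. Below is a minimal, self-contained sketch of the same down-weighting idea; the StopWordWeightDemo class and its members are hypothetical stand-ins, not part of the FullTextIndexer library, and the stop words are assumed to be pre-normalised.

using System;
using System.Collections.Generic;

static class StopWordWeightDemo
{
    // Mirrors the lambda returned above: tokens found in the normalised stop-word set
    // receive 1% of the weight that an ordinary token would get
    static Func<string, float> BuildDeterminer(float weightMultiplier, ISet<string> normalisedStopWords)
    {
        return normalisedToken => weightMultiplier * (normalisedStopWords.Contains(normalisedToken) ? 0.01f : 1f);
    }

    static void Main()
    {
        var stopWords = new HashSet<string> { "the", "and", "of" }; // assumed already normalised
        var determiner = BuildDeterminer(5f, stopWords);
        Console.WriteLine(determiner("the"));    // 0.05 - stop word, almost ignored
        Console.WriteLine(determiner("search")); // 5    - ordinary token, full multiplier
    }
}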
        /// <summary>
        /// This will return true if the specified key was found and will set the value output parameter to the corresponding value. If it returns false then
        /// the value output parameter should not be considered to be defined.
        /// </summary>
        public bool TryGetValue(string key, out TValue value)
        {
            if (key == null)
            {
                throw new ArgumentNullException("key");
            }

            var normalisedKey = _keyNormaliser.GetNormalisedString(key);

            if (normalisedKey != "")
            {
                var nodeIndex = 0;
                var keyIndex  = 0;
                while (true)
                {
                    if (_nodes[nodeIndex].Character == normalisedKey[keyIndex])
                    {
                        keyIndex++;
                        if (keyIndex == normalisedKey.Length)
                        {
                            if (_nodes[nodeIndex].IsKey)
                            {
                                value = _values[_nodes[nodeIndex].ValueIndex];
                                return true;
                            }
                            break;
                        }
                        if (_nodes[nodeIndex].MiddleChildIndex == -1)
                        {
                            break;
                        }
                        nodeIndex = _nodes[nodeIndex].MiddleChildIndex;
                    }
                    else if (normalisedKey[keyIndex] < _nodes[nodeIndex].Character)
                    {
                        if (_nodes[nodeIndex].LeftChildIndex == -1)
                        {
                            break;
                        }
                        nodeIndex = _nodes[nodeIndex].LeftChildIndex;
                    }
                    else
                    {
                        if (_nodes[nodeIndex].RightChildIndex == -1)
                        {
                            break;
                        }
                        nodeIndex = _nodes[nodeIndex].RightChildIndex;
                    }
                }
            }
            value = default(TValue);
            return false;
        }
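
The Node struct itself is not included in this excerpt. As a hedged sketch of what the lookup above implies about its shape (the field names are taken from the usages; everything else is an assumption, not the library's actual declaration):

// Assumed layout for the array-backed ternary search tree node: child links are indexes
// into the _nodes array (-1 meaning "no child") and ValueIndex points into the parallel
// _values array when IsKey is true
private struct Node
{
    public char Character;        // the character this node matches
    public int LeftChildIndex;    // subtree for keys whose next character sorts before Character
    public int MiddleChildIndex;  // subtree entered after matching Character, for the key's next character
    public int RightChildIndex;   // subtree for keys whose next character sorts after Character
    public bool IsKey;            // true if a complete key terminates at this node
    public int ValueIndex;        // only meaningful when IsKey is true
}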
Example #3
        private static ContentRetriever<Post, int>.BrokenTokenWeightDeterminer GetTokenWeightDeterminer(float multiplier, IStringNormaliser sourceStringComparer)
        {
            if (multiplier <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(multiplier), "must be greater than zero");
            }
            if (sourceStringComparer == null)
            {
                throw new ArgumentNullException(nameof(sourceStringComparer));
            }

            // Constructing a HashSet of the normalised versions of the stop words means that looking up whether normalised tokens are stop
            // words can be a lot faster (as neither the stop words nor the token need to be fed through the normaliser again)
            var hashSetOfNormalisedStopWords = new HashSet<string>(
                FullTextIndexer.Core.Constants.GetStopWords("en").Select(word => sourceStringComparer.GetNormalisedString(word))
                );

            return normalisedToken => multiplier * (hashSetOfNormalisedStopWords.Contains(normalisedToken) ? 0.01f : 1f);
        }
Example #4
        /// <summary>
        /// This will never return null. It will throw an exception for null input.
        /// </summary>
        public IndexData<TKey> Generate(NonNullImmutableList<TSource> data)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }

            // Build up data about token occurrences in the data
            // - We'll be using the token values in the indexContent dictionary after they have been normalised by the sourceStringComparer, this means that we
            //   don't need to specify the sourceStringComparer as the comparer for indexContent which may save some work depending upon the implementation of
            //   the sourceStringComparer
            var timer = new Stopwatch();

            timer.Start();
            var indexContent = new Dictionary<string, Dictionary<TKey, List<WeightedEntry<TKey>>>>();
            var timeElapsedForNextUpdateMessage = TimeSpan.FromSeconds(5);

            for (var index = 0; index < data.Count; index++)
            {
                var entry            = data[index];
                var sourceFieldIndex = 0;
                foreach (var contentRetriever in _contentRetrievers)
                {
                    PreBrokenContent<TKey> preBrokenContent;
                    try
                    {
                        preBrokenContent = contentRetriever.InitialContentRetriever(entry);
                    }
                    catch (Exception e)
                    {
                        throw new Exception("contentRetriever.InitialContentRetriever threw exception", e);
                    }
                    if (preBrokenContent == null)
                    {
                        throw new Exception("contentRetriever.InitialContentRetriever returned null - this is invalid");
                    }

                    if (timer.Elapsed >= timeElapsedForNextUpdateMessage)
                    {
                        _logger.LogIgnoringAnyError(LogLevel.Debug, () => String.Format("Work completed: {0}%", ((index * 100f) / (float)data.Count).ToString("0.000")));
                        timeElapsedForNextUpdateMessage = timer.Elapsed.Add(TimeSpan.FromSeconds(5));
                    }

                    foreach (var contentSection in preBrokenContent.Content)
                    {
                        foreach (var weightedTokenMatch in _tokenBreaker.Break(contentSection))
                        {
                            // Strings that are reduced to "" by the normaliser have no meaning (they can't be searched for) and should be ignored
                            var normalisedToken = _sourceStringComparer.GetNormalisedString(weightedTokenMatch.Token);
                            if (normalisedToken == "")
                            {
                                continue;
                            }

                            Dictionary<TKey, List<WeightedEntry<TKey>>> allDataForToken;
                            if (!indexContent.TryGetValue(normalisedToken, out allDataForToken))
                            {
                                allDataForToken = new Dictionary<TKey, List<WeightedEntry<TKey>>>(_dataKeyComparer);
                                indexContent.Add(normalisedToken, allDataForToken);
                            }

                            if (!allDataForToken.ContainsKey(preBrokenContent.Key))
                            {
                                allDataForToken.Add(preBrokenContent.Key, new List<WeightedEntry<TKey>>());
                            }

                            // Each WeightedEntry requires a sourceLocation set which specifies a location in a content field - the SourceLocation
                            // returned by the Token Breaker has the token index, start point and length but it needs a distinct field index. The
                            // index of the current Content Retriever will do fine.
                            var matchWeight = contentRetriever.TokenWeightDeterminer(normalisedToken) * weightedTokenMatch.WeightMultiplier;
                            allDataForToken[preBrokenContent.Key].Add(
                                new WeightedEntry<TKey>(
                                    preBrokenContent.Key,
                                    matchWeight,
                                    _captureSourceLocations
                                        ? new[]
                                          {
                                              new SourceFieldLocation(
                                                  sourceFieldIndex,
                                                  weightedTokenMatch.SourceLocation.TokenIndex,
                                                  weightedTokenMatch.SourceLocation.SourceIndex,
                                                  weightedTokenMatch.SourceLocation.SourceTokenLength,
                                                  matchWeight
                                                  )
                                          }.ToNonNullImmutableList()
                                        : null
                                    )
                                );
                        }

                        // This has to be incremented for each content section successfully extracted from the source data, to ensure that each
                        // section gets a unique SourceFieldLocation.SourceFieldIndex assigned to it
                        sourceFieldIndex++;
                    }
                    if (sourceFieldIndex == 0)
                    {
                        // The sourceFieldIndex must advance at least once for the first content retriever (even if it didn't manage to extract any content) so
                        // that the index generator can be configured such that all source locations with SourceFieldIndex zero are guaranteed to have come from
                        // a particular property (if that retriever extracts no content then there will simply be no source location instances with a
                        // SourceFieldIndex value of zero). This can be used for search term highlighting. Only the first content retriever can be supported in
                        // this manner since, if it returns varying numbers of content sections, all bets are off for synchronising field index values for the
                        // subsequent retrievers.
                        sourceFieldIndex++;
                    }
                }
            }
            _logger.LogIgnoringAnyError(
                LogLevel.Debug,
                () => String.Format("Time taken to generate initial token data: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
                );
            timer.Restart();

            // Combine entries where Token and Key values match (as with the indexContent dictionary, we don't need to specify the sourceStringComparer as the
            // combinedContent dictionary comparer as all values were stored in indexContent after being normalised - this may save some work depending upon
            // the sourceStringComparer implementation)
            var combinedContent = new Dictionary<string, List<WeightedEntry<TKey>>>();

            foreach (var token in indexContent.Keys)
            {
                combinedContent.Add(token, new List<WeightedEntry<TKey>>());
                foreach (var key in indexContent[token].Keys)
                {
                    var matches = indexContent[token][key];
                    combinedContent[token].Add(
                        new WeightedEntry<TKey>(
                            key,
                            _weightedEntryCombiner(matches.Select(m => m.Weight).ToImmutableList()),
                            matches.Any(m => m.SourceLocationsIfRecorded == null) ? null : matches.SelectMany(m => m.SourceLocationsIfRecorded).ToNonNullImmutableList()
                            )
                        );
                }
            }
            _logger.LogIgnoringAnyError(
                LogLevel.Debug,
                () => String.Format("Time taken to combine token data sets: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
                );
            timer.Restart();

            // Translate this into an IndexData instance
            var indexData = new IndexData<TKey>(
                new TernarySearchTreeDictionary<NonNullImmutableList<WeightedEntry<TKey>>>(
                    combinedContent.Select(entry => new KeyValuePair<string, NonNullImmutableList<WeightedEntry<TKey>>>(entry.Key, entry.Value.ToNonNullImmutableList())),
                    _sourceStringComparer
                    ),
                _dataKeyComparer
                );

            _logger.LogIgnoringAnyError(
                LogLevel.Debug,
                () => String.Format("Time taken to generate final IndexData: {0}ms ({1:0.00}ms per item)", timer.ElapsedMilliseconds, (float)timer.ElapsedMilliseconds / (float)data.Count)
                );
            return indexData;
        }
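
The bookkeeping in Generate reduces to a two-phase pattern: first group every weighted match by token and then by source key, then collapse each group into a single combined weight. The following stripped-down, self-contained sketch shows that pattern with simplified types; the summing combiner merely stands in for whatever _weightedEntryCombiner actually does.

using System;
using System.Collections.Generic;
using System.Linq;

static class TwoPhaseIndexSketch
{
    static void Main()
    {
        // Phase 1: accumulate every match weight, grouped by token and then by source key
        var matchesByTokenAndKey = new Dictionary<string, Dictionary<int, List<float>>>();
        void Record(string token, int key, float weight)
        {
            if (!matchesByTokenAndKey.TryGetValue(token, out var byKey))
                matchesByTokenAndKey[token] = byKey = new Dictionary<int, List<float>>();
            if (!byKey.TryGetValue(key, out var weights))
                byKey[key] = weights = new List<float>();
            weights.Add(weight);
        }
        Record("cat", 1, 1f);
        Record("cat", 1, 0.5f); // same token and key: appended, not overwritten
        Record("cat", 2, 1f);

        // Phase 2: collapse each (token, key) group into one weight - summing is a stand-in
        // for the real generator's _weightedEntryCombiner
        var combined = matchesByTokenAndKey.ToDictionary(
            token => token.Key,
            token => token.Value.ToDictionary(key => key.Key, key => key.Value.Sum()));

        Console.WriteLine(combined["cat"][1]); // 1.5
        Console.WriteLine(combined["cat"][2]); // 1
    }
}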
        public TernarySearchTreeStructDictionary(IEnumerable<KeyValuePair<string, TValue>> data, IStringNormaliser keyNormaliser)
        {
            if (data == null)
            {
                throw new ArgumentNullException("data");
            }
            if (keyNormaliser == null)
            {
                throw new ArgumentNullException("keyNormaliser");
            }

            var nodes = new List<Node> {
                GetUnintialisedNode()
            };
            var values = new List<TValue>();
            var keys   = new HashSet<string>(keyNormaliser);

            foreach (var entry in data)
            {
                var key = entry.Key;
                if (key == null)
                {
                    throw new ArgumentException("Null key encountered in data");
                }
                var normalisedKey = keyNormaliser.GetNormalisedString(key);
                if (normalisedKey == "")
                {
                    throw new ArgumentException("key value results in blank string when normalised: " + key);
                }
                if (keys.Contains(normalisedKey))
                {
                    throw new ArgumentException("key value results in duplicate normalised key:" + key);
                }
                keys.Add(key);

                if (nodes[0].Character == (char)0)
                {
                    nodes[0] = SetCharacter(nodes[0], normalisedKey[0]);
                }

                var nodeIndex = 0;
                var keyIndex  = 0;
                while (true)
                {
                    if (nodes[nodeIndex].Character == normalisedKey[keyIndex])
                    {
                        keyIndex++;
                        if (keyIndex == normalisedKey.Length)
                        {
                            var newValueIndex = values.Count;
                            values.Add(entry.Value);
                            nodes[nodeIndex] = SetValueIndex(nodes[nodeIndex], newValueIndex);
                            break;
                        }
                        if (nodes[nodeIndex].MiddleChildIndex == -1)
                        {
                            var newNode      = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                            var newNodeIndex = nodes.Count;
                            nodes.Add(newNode);
                            nodes[nodeIndex] = SetMiddleChildIndex(nodes[nodeIndex], newNodeIndex);
                            nodeIndex        = newNodeIndex;
                        }
                        else
                        {
                            nodeIndex = nodes[nodeIndex].MiddleChildIndex;
                        }
                        continue;
                    }
                    else if (normalisedKey[keyIndex] < nodes[nodeIndex].Character)
                    {
                        if (nodes[nodeIndex].LeftChildIndex == -1)
                        {
                            var newNode      = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                            var newNodeIndex = nodes.Count;
                            nodes.Add(newNode);
                            nodes[nodeIndex] = SetLeftChildIndex(nodes[nodeIndex], newNodeIndex);
                            nodeIndex        = newNodeIndex;
                        }
                        else
                        {
                            nodeIndex = nodes[nodeIndex].LeftChildIndex;
                        }
                    }
                    else
                    {
                        if (nodes[nodeIndex].RightChildIndex == -1)
                        {
                            var newNode      = SetCharacter(GetUnintialisedNode(), normalisedKey[keyIndex]);
                            var newNodeIndex = nodes.Count;
                            nodes.Add(newNode);
                            nodes[nodeIndex] = SetRightChildIndex(nodes[nodeIndex], newNodeIndex);
                            nodeIndex        = newNodeIndex;
                        }
                        else
                        {
                            nodeIndex = nodes[nodeIndex].RightChildIndex;
                        }
                    }
                }
            }

            _nodes         = nodes.ToArray();
            _values        = values.ToArray();
            _keyNormaliser = keyNormaliser;
            _keys          = keys.ToList().AsReadOnly();
        }
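
A hedged usage sketch for this constructor together with the TryGetValue method shown earlier; DefaultStringNormaliser stands in for whichever IStringNormaliser implementation is available (treat it as an assumption), and the comment assumes the normaliser lower-cases its input.

using System;
using System.Collections.Generic;

static class TstLookupSketch
{
    static void Main()
    {
        var lookup = new TernarySearchTreeStructDictionary<int>(
            new[]
            {
                new KeyValuePair<string, int>("apple", 1),
                new KeyValuePair<string, int>("apricot", 2)
            },
            new DefaultStringNormaliser() // assumed normaliser; any IStringNormaliser would do
            );
        if (lookup.TryGetValue("APPLE", out var value))
            Console.WriteLine(value); // 1 - both stored keys and queries go through the normaliser before the tree walk
    }
}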
        private static Node Add(Node root, IStringNormaliser keyNormaliser, IEnumerable<KeyValuePair<string, TValue>> data, Combiner combine)
        {
            if (keyNormaliser == null)
            {
                throw new ArgumentNullException("keyNormaliser");
            }
            if (data == null)
            {
                throw new ArgumentNullException("keys");
            }
            if (combine == null)
            {
                throw new ArgumentNullException(nameof(combine));
            }

            if (!data.Any())
            {
                return root;
            }

            if (root != null)
            {
                root = root.Clone();
            }
            foreach (var entry in data)
            {
                var key = entry.Key;
                if (key == null)
                {
                    throw new ArgumentException("Null key encountered in data");
                }
                var normalisedKey = keyNormaliser.GetNormalisedString(key);
                if (normalisedKey == "")
                {
                    throw new ArgumentException("key value results in blank string when normalised: " + key);
                }

                if (root == null)
                {
                    root = new Node()
                    {
                        Character = normalisedKey[0]
                    };
                }

                var node  = root;
                var index = 0;
                while (true)
                {
                    if (node.Character == normalisedKey[index])
                    {
                        index++;
                        if (index == normalisedKey.Length)
                        {
                            node.Value = ((node.Value != null) && (entry.Value != null))
                                ? combine(node.Value, entry.Value)
                                : ((node.Value != null) ? node.Value : entry.Value);
                            node.Key = (node.Value == null) ? null : normalisedKey; // If we ended up with a null Value then the Combiner may have removed it, in which case we should set the Key to null as well
                            break;
                        }
                        if (node.MiddleChild == null)
                        {
                            node.MiddleChild = new Node()
                            {
                                Character = normalisedKey[index], Parent = node
                            };
                        }
                        node = node.MiddleChild;
                    }
                    else if (normalisedKey[index] < node.Character)
                    {
                        if (node.LeftChild == null)
                        {
                            node.LeftChild = new Node()
                            {
                                Character = normalisedKey[index], Parent = node
                            };
                        }
                        node = node.LeftChild;
                    }
                    else
                    {
                        if (node.RightChild == null)
                        {
                            node.RightChild = new Node()
                            {
                                Character = normalisedKey[index], Parent = node
                            };
                        }
                        node = node.RightChild;
                    }
                }
            }
            return root;
        }
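
The difference between this Add overload and the plain one below comes down to what happens when the same normalised key arrives twice: here the Combiner merges the existing and incoming values, whereas the overload below simply overwrites. A small self-contained illustration, with the delegate shape inferred from the call site above and a plain Dictionary standing in for the tree:

using System;
using System.Collections.Generic;

static class CombineVersusOverwriteSketch
{
    delegate int Combiner(int existing, int added); // shape inferred from combine(node.Value, entry.Value)

    static void Main()
    {
        Combiner sum = (existing, added) => existing + added;
        var store = new Dictionary<string, int>();

        void AddCombining(string key, int value) =>
            store[key] = store.TryGetValue(key, out var current) ? sum(current, value) : value;
        void AddOverwriting(string key, int value) => store[key] = value;

        AddCombining("cat", 3);
        AddCombining("cat", 2);
        Console.WriteLine(store["cat"]); // 5 - duplicate keys merged, as with the Combiner overload

        AddOverwriting("cat", 7);
        Console.WriteLine(store["cat"]); // 7 - last write wins, as with the plain Add overload
    }
}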
Example #7
        /// <summary>
        /// Combine additional data with an existing root's content, returning a new Node (unless zero data entries were specified, in which case the original
        /// reference will be returned). This will throw an exception for a null keyNormaliser or data (or if any keys in the data return null, empty string
        /// or duplicates when run through the keyNormaliser). If a null root is specified then a new root will be generated.
        /// </summary>
        private static Node Add(Node root, IStringNormaliser keyNormaliser, IEnumerable<KeyValuePair<string, TValue>> data)
        {
            if (keyNormaliser == null)
            {
                throw new ArgumentNullException("keyNormaliser");
            }
            if (data == null)
            {
                throw new ArgumentNullException("keys");
            }

            if (!data.Any())
            {
                return root;
            }

            if (root != null)
            {
                root = root.Clone();
            }
            foreach (var entry in data)
            {
                var key = entry.Key;
                if (key == null)
                {
                    throw new ArgumentException("Null key encountered in data");
                }
                var normalisedKey = keyNormaliser.GetNormalisedString(key);
                if (normalisedKey == "")
                {
                    throw new ArgumentException("key value results in blank string when normalised: " + key);
                }

                if (root == null)
                {
                    root = new Node()
                    {
                        Character = normalisedKey[0]
                    };
                }

                var node  = root;
                var index = 0;
                while (true)
                {
                    if (node.Character == normalisedKey[index])
                    {
                        index++;
                        if (index == normalisedKey.Length)
                        {
                            node.Key   = normalisedKey;
                            node.Value = entry.Value;
                            break;
                        }
                        if (node.MiddleChild == null)
                        {
                            node.MiddleChild = new Node()
                            {
                                Character = normalisedKey[index], Parent = node
                            };
                        }
                        node = node.MiddleChild;
                    }
                    else if (normalisedKey[index] < node.Character)
                    {
                        if (node.LeftChild == null)
                        {
                            node.LeftChild = new Node()
                            {
                                Character = normalisedKey[index], Parent = node
                            };
                        }
                        node = node.LeftChild;
                    }
                    else
                    {
                        if (node.RightChild == null)
                        {
                            node.RightChild = new Node()
                            {
                                Character = normalisedKey[index], Parent = node
                            };
                        }
                        node = node.RightChild;
                    }
                }
            }
            return root;
        }