Code Example #1
        private IList<string> OrderByFrequency(string query)
        {
            var tokenizer = new Tokenizer();
            var tokens    = tokenizer.TokenizeOneLine(query).Where(t => t.TokenType != CharClass.Symbol);

            var frequencyByToken = new Dictionary<string, int>();

            foreach (var token in tokens)
            {
                var frequency = 0;
                if (PositionsByToken.TryGetValue(token.NormalizedText, out var list))
                {
                    frequency = list.Count;
                }

                if (frequency != 0)
                {
                    frequencyByToken[token.NormalizedText] = frequency;
                }
            }

            if (frequencyByToken.Count == 0)
            {
                return new List<string>();
            }

            // most significant (least frequent) tokens first; dictionary keys are already distinct
            return frequencyByToken.OrderBy(p => p.Value).Select(p => p.Key).ToList();
        }
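The least-frequent-first ordering matters at query time: intersecting position sets starting from the rarest token keeps the candidate set as small as possible. Below is a minimal sketch, not part of the original source, of how a caller might exploit it; the Find method name is an assumption.

        // Sketch only: assumes PositionsByToken maps each token to its HashSet<LinePointer>,
        // as in the examples below. OrderByFrequency only returns tokens that are present
        // in the index, so indexing orderedTokens[0] is safe.
        private IList<LinePointer> Find(string query)
        {
            var orderedTokens = OrderByFrequency(query);
            if (orderedTokens.Count == 0)
            {
                return new List<LinePointer>();
            }

            // seed the result with the positions of the rarest (most selective) token
            var result = new HashSet<LinePointer>(PositionsByToken[orderedTokens[0]]);

            // intersect with the remaining tokens, rarest first
            foreach (var token in orderedTokens.Skip(1))
            {
                result.IntersectWith(PositionsByToken[token]);
                if (result.Count == 0)
                {
                    break; // no line can contain all the tokens
                }
            }

            return result.ToList();
        }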
Code Example #2
        /// <summary>
        ///     Add a tokenized line to the full-text index
        /// </summary>
        /// <param name="line">tokenized line to index</param>
        /// <param name="lineIndex">position of the line inside the document</param>
        /// <param name="primaryKey">primary key of the document containing the line</param>
        private void IndexLine(TokenizedLine line, int lineIndex, KeyValue primaryKey)
        {
            var pointer = new LinePointer(lineIndex, primaryKey);

            foreach (var token in line.Tokens)
            {
                var tooFrequentToken = false;

                if (!PositionsByToken.TryGetValue(token, out var positions))
                {
                    positions = new HashSet<LinePointer>();
                    PositionsByToken[token] = positions;
                }
                else if (positions.Count == 0)
                {
                    // an empty position set is the sentinel left by IgnoreToken:
                    // the token was already dropped as too frequent, so skip it
                    tooFrequentToken = true;
                }

                if (!tooFrequentToken && positions.Add(pointer))
                {
                    Entries++;

                    AddToSecondaryIndex(pointer);
                }
            }

            // If the index grows too big, drop the most frequent (least discriminating) tokens.
            // Aim for at most MaxCapacity entries in total, but never ignore more than MaxTokensToIgnore tokens.
            if (NeedsCleanup())
            {
                string mostFrequentToken = null;

                var maxFrequency = 0;

                foreach (var p in PositionsByToken)
                {
                    if (p.Value.Count > maxFrequency)
                    {
                        mostFrequentToken = p.Key;
                        maxFrequency      = p.Value.Count;
                    }
                }

                Debug.Assert(mostFrequentToken != null);

                IgnoreToken(mostFrequentToken);

                Entries -= maxFrequency;

                IgnoredTokens++;
            }
        }
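NeedsCleanup and IgnoreToken are called above but not shown in these examples. A plausible sketch, assuming the MaxCapacity and MaxTokensToIgnore limits named in the comment are properties of the index:

        // Sketch only: the real implementations may differ.
        private bool NeedsCleanup()
        {
            // trim only while over capacity and still allowed to drop tokens
            return Entries > MaxCapacity && IgnoredTokens < MaxTokensToIgnore;
        }

        private void IgnoreToken(string token)
        {
            // keep the key but empty its position set; the empty set is the
            // sentinel that makes IndexLine skip this token from then on
            PositionsByToken[token].Clear();
        }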
Code Example #3
        public void Clear()
        {
            Entries       = 0;
            IgnoredTokens = 0;

            PositionsByToken.Clear();
            PositionsByDocument.Clear();
        }