Ejemplo n.º 1
0
        /// <summary>
        /// Construct a new object using configuration file and custom tokenizer
        /// </summary>
        /// <param name="config">
        /// Configuration supplying the index folder (<c>IndexPath</c>), the bitmap file
        /// name prefix (<c>FileName</c>) and the <c>DocMode</c> switch. Must not be null.
        /// </param>
        /// <param name="tokenizer">
        /// Custom tokenizer used to parse text; when null a default <c>tokenizer</c> is created.
        /// </param>
        /// <exception cref="System.ArgumentNullException"><paramref name="config"/> is null.</exception>
        public Hoot(IHootConfig config, ITokenizer tokenizer)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // on the first config.IndexPath access further down.
            if (config == null)
            {
                throw new System.ArgumentNullException(nameof(config));
            }

            HootConfOptions = config;

            // Fall back to the default tokenizer when the caller did not supply one.
            _tokenizer = tokenizer ?? new tokenizer();

            // Make sure the index folder exists before anything is opened inside it.
            if (!Directory.Exists(config.IndexPath))
            {
                Directory.CreateDirectory(config.IndexPath);
            }

            _log.Debug("Starting hOOt....");
            _log.Debug($"Storage Folder = {config.IndexPath}");

            _tokenizer.InitializeStopList(config.IndexPath);

            _log.Debug($"Stop List Words saved to {config.IndexPath}");

            // The document store and the deleted-flags index are only opened in document mode.
            if (config.DocMode)
            {
                _docs = new KeyStoreString(Path.Combine(config.IndexPath, "files.docs"), false);
                //
                // read deleted
                //
                _deleted    = new BoolIndex(Path.Combine(config.IndexPath, "_deleted.hoot"));
                _lastDocNum = (int)_docs.Count();
            }
            _bitmaps = new BitmapIndex(Path.Combine(config.IndexPath, $"{config.FileName}_hoot.bmp"));
            //
            // read words
            //
            LoadWords();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Pass a word on to <c>addword</c> unless one of the filters rejects it.
        /// A word is rejected when it is null, longer than <c>Global.DefaultStringKeySize</c>,
        /// shorter than two characters, numeric while <c>config.IgnoreNumerics</c> is set,
        /// or contained in <c>m_stopWords</c> while <c>config.UseStopList</c> is set.
        /// </summary>
        /// <param name="dic">Dictionary handed to <c>addword</c></param>
        /// <param name="word">Candidate word</param>
        /// <param name="config">Configuration providing the <c>IgnoreNumerics</c> and <c>UseStopList</c> switches</param>
        private static void AddDictionary(Dictionary <string, int> dic, string word, IHootConfig config)
        {
            if (word == null)
            {
                return;
            }

            int length = word.Length;

            // Reject words that are too long or too short.
            if (length > Global.DefaultStringKeySize || length < 2)
            {
                return;
            }

            // Optionally skip numeric words.
            if (config.IgnoreNumerics && wordIsNumeric(word))
            {
                return;
            }

            // Optionally skip stop words.
            if (config.UseStopList && m_stopWords.Contains(word))
            {
                return;
            }

            addword(dic, word);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Split <paramref name="text"/> into runs of consecutive characters that share the same
        /// <c>langtype</c> value and hand every run whose value is greater than -1 to
        /// <c>ParseString</c>, which fills the returned dictionary.
        /// </summary>
        /// <param name="text">Text to tokenize; null or empty text yields an empty dictionary.</param>
        /// <param name="config">Configuration forwarded to <c>ParseString</c>.</param>
        /// <returns>
        /// Dictionary keyed by word as populated through <c>ParseString</c>
        /// (values are presumably occurrence counts — confirm against <c>addword</c>).
        /// </returns>
        public Dictionary <string, int> GenerateWordFreq(string text, IHootConfig config)
        {
            Dictionary <string, int> dic = new Dictionary <string, int>(500);

            // The scan below reads chars[0] unconditionally, so there is nothing to do
            // (and nothing safe to read) for null or empty input.
            if (string.IsNullOrEmpty(text))
            {
                return(dic);
            }

            char[] chars    = text.ToCharArray();
            int    index    = 0;
            int    look     = 0;
            int    count    = chars.Length;
            int    lastlang = langtype(chars[0]);

            while (index < count)
            {
                int lang = -1;

                // Advance 'look' to the first character whose type differs from the current run.
                while (look < count)
                {
                    char c = chars[look];
                    lang = langtype(c);
                    if (lang == lastlang)
                    {
                        look++;
                    }
                    else
                    {
                        break;
                    }
                }

                // Runs typed -1 are skipped; every other run [index, look) is parsed for words.
                if (lastlang > -1)
                {
                    ParseString(dic, chars, look, index, config);
                }

                // The next run starts where this one stopped and carries the type just seen.
                index    = look;
                lastlang = lang;
            }
            return(dic);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Extract words from the character span [start, end) and pass them to <c>AddDictionary</c>.
        /// A span that is not entirely uppercase is split in front of every uppercase character
        /// after the first one; when no such split point exists, or the span is entirely
        /// uppercase, the whole span is added as one word. Every word is lower-cased
        /// (invariant culture) and trimmed before it is added.
        /// </summary>
        /// <param name="dic">Dictionary forwarded to <c>AddDictionary</c></param>
        /// <param name="chars">Character buffer containing the span</param>
        /// <param name="end">Exclusive end index of the span</param>
        /// <param name="start">Inclusive start index of the span</param>
        /// <param name="config">Configuration forwarded to <c>AddDictionary</c></param>
        private static void ParseString(Dictionary <string, int> dic, char[] chars, int end, int start, IHootConfig config)
        {
            // check if upper lower case mix -> extract words
            int  length = end - start;
            int  uppers = 0;
            bool found  = false;

            for (int i = start; i < end; i++)
            {
                if (char.IsUpper(chars[i]))
                {
                    uppers++;
                }
            }
            // not all uppercase
            // The span holds exactly (end - start) characters, so that is the count meaning
            // "all uppercase". Comparing against (end - start - 1) made fully uppercase words
            // such as acronyms fall into the splitting branch, where they were broken into
            // one-letter fragments and never added as a whole word.
            if (uppers != length)
            {
                int lastUpper = start;

                string word = "";

                for (int i = start + 1; i < end; i++)
                {
                    char c = chars[i];
                    if (char.IsUpper(c))
                    {
                        // An uppercase character closes the word that began at lastUpper.
                        found = true;
                        word  = new string(chars, lastUpper, i - lastUpper).ToLowerInvariant().Trim();

                        AddDictionary(dic, word, config);
                        lastUpper = i;
                    }
                }

                // Add the trailing word after the last split point, unless it merely
                // repeats the word that was added immediately before it.
                if (lastUpper > start)
                {
                    string last = new string(chars, lastUpper, end - lastUpper).ToLowerInvariant().Trim();

                    if (word != last)
                    {
                        AddDictionary(dic, last, config);
                    }
                }
            }
            // No split happened: the span is a single word.
            if (found == false)
            {
                string s = new string(chars, start, length).ToLowerInvariant().Trim();
                AddDictionary(dic, s, config);
            }
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Construct a new object from the configuration alone by delegating to the
 /// two-argument constructor with a newly created <c>tokenizer</c>.
 /// </summary>
 /// <param name="config">Configuration passed through unchanged</param>
 public Hoot(IHootConfig config) : this(config, new tokenizer()) { }