/// <summary>
/// Creates a Hoot instance from a configuration and a custom tokenizer.
/// </summary>
/// <param name="config">Configuration supplying the index path, file name and document mode.</param>
/// <param name="tokenizer">Tokenizer used to parse text; when null, a new <c>tokenizer</c> is created instead.</param>
public Hoot(IHootConfig config, ITokenizer tokenizer)
{
    HootConfOptions = config;

    // Fall back to the built-in tokenizer when the caller supplies none.
    _tokenizer = tokenizer ?? new tokenizer();

    // Make sure the index folder is present before anything is opened inside it.
    if (!Directory.Exists(config.IndexPath))
    {
        Directory.CreateDirectory(config.IndexPath);
    }

    _log.Debug("Starting hOOt....");
    _log.Debug($"Storage Folder = {config.IndexPath}");

    _tokenizer.InitializeStopList(config.IndexPath);
    _log.Debug($"Stop List Words saved to {config.IndexPath}");

    if (config.DocMode)
    {
        string docsFile = Path.Combine(config.IndexPath, "files.docs");
        _docs = new KeyStoreString(docsFile, false);

        // read deleted
        string deletedFile = Path.Combine(config.IndexPath, "_deleted.hoot");
        _deleted = new BoolIndex(deletedFile);

        _lastDocNum = (int)_docs.Count();
    }

    string bitmapFile = Path.Combine(config.IndexPath, $"{config.FileName}_hoot.bmp");
    _bitmaps = new BitmapIndex(bitmapFile);

    // read words
    LoadWords();
}
/// <summary>
/// Adds a word to the dictionary unless one of the filters rejects it.
/// </summary>
/// <param name="dic">Dictionary passed to <c>addword</c> when the word is accepted.</param>
/// <param name="word">Candidate word; null is ignored.</param>
/// <param name="config">Options that switch the numeric and stop-list filters on or off.</param>
private static void AddDictionary(Dictionary<string, int> dic, string word, IHootConfig config)
{
    if (word == null)
    {
        return;
    }

    // Reject words longer than the key size limit or shorter than two characters.
    int length = word.Length;
    bool tooLong = length > Global.DefaultStringKeySize;
    bool tooShort = length < 2;
    if (tooLong || tooShort)
    {
        return;
    }

    // Optionally skip numeric words.
    if (config.IgnoreNumerics && wordIsNumeric(word))
    {
        return;
    }

    // Optionally skip words that are on the stop list.
    if (config.UseStopList && m_stopWords.Contains(word))
    {
        return;
    }

    addword(dic, word);
}
/// <summary>
/// Splits the text into runs of consecutive characters for which <c>langtype</c>
/// returns the same value and passes every run with a non-negative value to
/// <c>ParseString</c>, which fills the resulting dictionary.
/// </summary>
/// <param name="text">Text to analyze. Null or empty text produces an empty dictionary.</param>
/// <param name="config">Options forwarded unchanged to <c>ParseString</c>.</param>
/// <returns>The dictionary populated while parsing the runs of <paramref name="text"/>.</returns>
public Dictionary<string, int> GenerateWordFreq(string text, IHootConfig config)
{
    Dictionary<string, int> dic = new Dictionary<string, int>(500);

    // Nothing to scan: avoids dereferencing null and indexing chars[0] on an empty array.
    if (string.IsNullOrEmpty(text))
    {
        return dic;
    }

    char[] chars = text.ToCharArray();
    int count = chars.Length;
    int index = 0;

    while (index < count)
    {
        // The class of the first character defines the current run.
        int lang = langtype(chars[index]);

        // Extend the run while the following characters share that class.
        int look = index + 1;
        while (look < count && langtype(chars[look]) == lang)
        {
            look++;
        }

        // Runs classified as -1 are skipped; ParseString takes (end, start) in that order.
        if (lang > -1)
        {
            ParseString(dic, chars, look, index, config);
        }

        index = look;
    }

    return dic;
}
/// <summary>
/// Extracts words from the character run <c>[start, end)</c> and hands them to
/// <c>AddDictionary</c>. A run that mixes cases is split in front of every uppercase
/// character after the first; a run without such a split point, or one that is
/// entirely uppercase, is added as a single lower-cased word.
/// </summary>
/// <param name="dic">Dictionary forwarded to <c>AddDictionary</c>.</param>
/// <param name="chars">Character buffer containing the run.</param>
/// <param name="end">Exclusive end index of the run.</param>
/// <param name="start">Inclusive start index of the run.</param>
/// <param name="config">Options forwarded to <c>AddDictionary</c>.</param>
private static void ParseString(Dictionary<string, int> dic, char[] chars, int end, int start, IHootConfig config)
{
    int length = end - start;

    // check if upper lower case mix -> extract words
    int uppers = 0;
    for (int i = start; i < end; i++)
    {
        if (char.IsUpper(chars[i]))
        {
            uppers++;
        }
    }

    bool found = false;

    // not all uppercase: the run length is end - start (end is exclusive). Comparing
    // against length - 1 sent fully uppercase runs (e.g. acronyms) through the splitter,
    // which broke them into single letters instead of keeping the word whole.
    if (uppers != length)
    {
        int lastUpper = start;
        string word = "";

        for (int i = start + 1; i < end; i++)
        {
            if (char.IsUpper(chars[i]))
            {
                found = true;
                word = new string(chars, lastUpper, i - lastUpper).ToLowerInvariant().Trim();
                AddDictionary(dic, word, config);
                lastUpper = i;
            }
        }

        // Add the trailing piece after the last split point, unless it equals the
        // piece that was added just before it.
        if (lastUpper > start)
        {
            string last = new string(chars, lastUpper, end - lastUpper).ToLowerInvariant().Trim();
            if (word != last)
            {
                AddDictionary(dic, last, config);
            }
        }
    }

    // No split happened: treat the whole run as one word.
    if (!found)
    {
        string s = new string(chars, start, length).ToLowerInvariant().Trim();
        AddDictionary(dic, s, config);
    }
}
/// <summary>
/// Creates a Hoot instance from a configuration, parsing text with a newly
/// created <c>tokenizer</c>.
/// </summary>
/// <param name="config">Configuration handed on to the two-argument constructor.</param>
public Hoot(IHootConfig config)
    : this(config, new tokenizer())
{
}