Exemplo n.º 1
0
 /// <summary>
 /// Inserts <paramref name="word"/> below <paramref name="node"/>, one character per level.
 /// A child is created through <c>TrieFactory.CreateTrieNode</c> whenever <c>GetChild</c>
 /// returns null, and <c>WordCount</c> is incremented on the node reached after the last
 /// character (the starting node itself when the word is empty).
 /// </summary>
 /// <param name="node">Node the insertion starts from.</param>
 /// <param name="word">Word whose characters are added in order.</param>
 private void AddWord(TrieNode node, string word)
 {
     TrieNode current = node;

     foreach (char letter in word)
     {
         // Children are looked up by the single-character string
         string   key  = letter.ToString();
         TrieNode next = current.GetChild(key);

         if (next == null)
         {
             next = TrieFactory.CreateTrieNode(key);
             current.SetChild(next);
         }

         // Descend one level for the next character
         current = next;
     }

     current.WordCount++;
 }
Exemplo n.º 2
0
        /// <summary>
        /// Creates a trie through <c>TrieFactory.CreateTrie</c> and adds a fixed set of
        /// sample strings to it, in order. The set deliberately contains "1" twice.
        /// </summary>
        /// <returns>The populated trie.</returns>
        static ITrie BuildTestTrie()
        {
            string[] samples =
            {
                "123", "1", "23", "1", "this", "test", "the", "TEMP", "TOKEN", "TAKE", "THUMP", "Microsoft Inc"
            };

            ITrie result = TrieFactory.CreateTrie();

            for (int i = 0; i < samples.Length; i++)
            {
                result.AddWord(samples[i]);
            }

            return result;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Console entry point. Builds a trie from the tab-separated company names found in
        /// "companies.dat", reads an article from standard input until a chunk containing
        /// two consecutive dots arrives (or input ends), matches the article words against
        /// the trie and prints a hit count and relevance percentage per parent company.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            string       _strCompaniesFileName = "companies.dat";
            //Two consecutive dots terminate the article
            Regex        _lastLine             = new Regex(@"\.[.]", RegexOptions.Compiled);
            //Everything except digits, ASCII letters and spaces is stripped from names and words
            Regex        _flattenNames         = new Regex(@"[^0-9a-zA-Z ]+", RegexOptions.Compiled);
            List<string> _lstExclusion         = new List<string>()
            {
                "a", "an", "the", "and", "or", "but"
            };
            int _nTotalWordCound = 0;

            //Check file existence
            if (File.Exists(_strCompaniesFileName))
            {
                string _strCompaniesFileText = File.ReadAllText(_strCompaniesFileName);

                //Check for contents
                if (!string.IsNullOrEmpty(_strCompaniesFileText))
                {
                    string[] _strCompanyLines = _strCompaniesFileText.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

                    //Proceed only if at least one line is present
                    if (_strCompanyLines.Length > 0)
                    {
                        ITrie _trie = TrieFactory.CreateTrie();
                        List<CompanyNodes> _lstNodes = new List<CompanyNodes>();

                        foreach (string _line in _strCompanyLines)
                        {
                            string _strParentName = string.Empty;

                            //Break each line by \t to get synonyms
                            string[] _namesToAdd = _line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);

                            foreach (string _name in _namesToAdd)
                            {
                                //Remove special chars from names and then add to trie
                                string _flattenedName = _flattenNames.Replace(_name, string.Empty);
                                _trie.AddWord(_flattenedName);
                                //To calculate the frequency
                                CompanyNodes _node = new CompanyNodes()
                                {
                                    Name       = _flattenedName,
                                    ParentName = !string.IsNullOrEmpty(_strParentName) ? _strParentName : null
                                };
                                //The first name is parent for synonyms
                                if (string.IsNullOrEmpty(_strParentName))
                                {
                                    _strParentName = _flattenedName;
                                }

                                _lstNodes.Add(_node);
                            }
                        }

                        //If trie is correctly created
                        if (_trie.GetAllWords().Count > 0)
                        {
                            Console.WriteLine("Enter article:");
                            List<string> _lstFoundWords       = new List<string>();
                            //Deliberately declared outside the word loop: the flag survives across words until it is consumed
                            bool         _bIncrementFrequency = false;

                            StringBuilder _sbInArticle = new StringBuilder();
                            //To increase the buffer size of the console
                            using (Stream inputStream = Console.OpenStandardInput(READLINE_BUFFER_SIZE))
                            {
                                byte[]  bytes   = new byte[READLINE_BUFFER_SIZE];
                                //Stateful decoder for the console's own encoding, so a multi-byte
                                //character split across two reads is still decoded correctly
                                Decoder decoder = Console.InputEncoding.GetDecoder();
                                char[]  chars   = new char[Console.InputEncoding.GetMaxCharCount(READLINE_BUFFER_SIZE)];
                                string  temp    = "";

                                //The terminator is looked for in the most recent chunk only
                                while (!_lastLine.IsMatch(temp))
                                {
                                    int outputLength = inputStream.Read(bytes, 0, READLINE_BUFFER_SIZE);
                                    if (outputLength <= 0)
                                    {
                                        //End of input without a terminator: stop instead of spinning forever
                                        break;
                                    }

                                    int charCount = decoder.GetChars(bytes, 0, outputLength, chars, 0);
                                    temp = new string(chars, 0, charCount);
                                    //Plain append: the article text must never be used as a format string
                                    _sbInArticle.Append(temp);
                                }
                            }
                            string   _strInArticle    = _lastLine.Replace(_sbInArticle.ToString(), " ");
                            string[] _strArticleWords = _strInArticle.Split(new[] { ' ', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

                            //1. Create the list of strings
                            //2. search the string in trie
                            //3. if found add to the list
                            //4. join the words in the list to form a string and search it again.
                            //5. continue till mismatch occurs
                            //6. if the mismatch is due to a, an, the, and, or, but search again without them
                            //7. When mismatch, increment the count of string in the list in _lstNodes and clear the list

                            for (int _nWordIndex = 0; _nWordIndex < _strArticleWords.Length; _nWordIndex++)
                            {
                                string _strArticleWord = _strArticleWords[_nWordIndex];
                                //Position, not reference identity, decides which word is the last one
                                bool   _bLastWord      = _nWordIndex == _strArticleWords.Length - 1;

                                string _strFlattenedArticleWord = _flattenNames.Replace(_strArticleWord, string.Empty);

                                string _strCurrentWord = _strFlattenedArticleWord;
                                if (_lstFoundWords.Count > 0)
                                {
                                    //Candidate phrase = words matched so far + current word
                                    _strFlattenedArticleWord = (string.Join(" ", _lstFoundWords) + " " + _strFlattenedArticleWord).Trim();
                                }

                                _nTotalWordCound++;
                                if (_trie.ContainsWord(_strFlattenedArticleWord) && !string.IsNullOrEmpty(_strCurrentWord))
                                {
                                    if (!_bLastWord)
                                    {
                                        _lstFoundWords.Add(_strCurrentWord);
                                    }
                                    else
                                    {
                                        IncrementFrequency(_lstNodes, _lstFoundWords, _strFlattenedArticleWord);
                                    }
                                }
                                else
                                {
                                    if (!_lstExclusion.Contains(_strCurrentWord))
                                    {
                                        _bIncrementFrequency = true;
                                    }

                                    //Drop the current word again to get back the phrase that did match
                                    _strFlattenedArticleWord = _strFlattenedArticleWord.Remove(_strFlattenedArticleWord.LastIndexOf(_strCurrentWord), _strCurrentWord.Length).Trim();

                                    if (!string.IsNullOrEmpty(_strFlattenedArticleWord) && _bIncrementFrequency)
                                    {
                                        IncrementFrequency(_lstNodes, _lstFoundWords, _strFlattenedArticleWord);
                                        _bIncrementFrequency = false;
                                    }

                                    //if current word is another company, then proceed with that
                                    if (!string.IsNullOrEmpty(_strFlattenedArticleWord) &&
                                        !string.IsNullOrEmpty(_strCurrentWord) &&
                                        !_strFlattenedArticleWord.Equals(_strCurrentWord))
                                    {
                                        _strFlattenedArticleWord = _strCurrentWord;
                                        if (!_bLastWord)
                                        {
                                            _lstFoundWords.Add(_strCurrentWord);
                                        }
                                        else
                                        {
                                            IncrementFrequency(_lstNodes, _lstFoundWords, _strFlattenedArticleWord);
                                        }
                                    }
                                }
                            }

                            int _nHitCount = 0;
                            //Only entries without a parent are reported
                            List<CompanyNodes> _lstParentNodes = _lstNodes.Where(x => x.ParentName == null).ToList();
                            int    _maxLength = _lstParentNodes.Max(x => x.Name.Trim().Length);
                            string _spaces    = new string(' ', _maxLength);
                            Console.WriteLine($"Company{_spaces}\tHit Count\tRelevance");
                            foreach (CompanyNodes item in _lstParentNodes)
                            {
                                _nHitCount += item.Frequency;
                                //No words read means no relevance; avoids printing NaN
                                double _dRelevance = _nTotalWordCound > 0 ? ((double)item.Frequency / _nTotalWordCound) * 100 : 0;
                                //PadRight never throws, even when the untrimmed name is longer than the column
                                Console.WriteLine($"{item.Name.PadRight(_maxLength)}\t\t{item.Frequency}\t\t{_dRelevance}%");
                            }
                            double _dTotalRelevance = _nTotalWordCound > 0 ? ((double)_nHitCount / _nTotalWordCound) * 100 : 0;
                            Console.WriteLine();
                            Console.WriteLine($"Total{_spaces}\t\t{_nHitCount}\t\t{_dTotalRelevance}%");
                            Console.WriteLine($"Total Words{_spaces}\t{_nTotalWordCound}");
                        }
                        else
                        {
                            Console.WriteLine("Somthing went wrong while creating trie");
                        }
                    }
                    else
                    {
                        Console.WriteLine($"something went wrong with reading text of {_strCompaniesFileName}");
                    }
                }
                else
                {
                    Console.WriteLine($"'{_strCompaniesFileName}' is empty.");
                }
            }
            else
            {
                Console.WriteLine($"'{_strCompaniesFileName}' file not found.");
            }
            //Keep the console window open until a key is pressed
            Console.ReadKey();
        }