Пример #1
0
        /// <summary>
        /// Normalizes a raw text line, splits it into tokens, stems them and builds a
        /// <see cref="Document"/> from the terms that survive filtering.
        /// </summary>
        /// <remarks>
        /// Side effects: replaces the <c>termFreqDict</c> field with a new dictionary, calls
        /// <c>addToLocalDict</c> for every kept term occurrence, and calls
        /// <c>MegaDictionary.AddToDictionary</c> once per distinct kept term in this call.
        /// </remarks>
        /// <param name="line">Raw text to parse. Must not be null.</param>
        /// <param name="id">Identifier passed through to the resulting <see cref="Document"/>.</param>
        /// <returns>A new <see cref="Document"/> built from <c>termFreqDict</c> and <paramref name="id"/>.</returns>
        public Document parseDocument(string line, string id)
        {
            // Each parsed document starts with its own, empty term table.
            // NOTE(review): presumably addToLocalDict fills this field - confirm.
            termFreqDict = new Dictionary<string, int>();

            // Invariant casing keeps the ASCII filter below culture-independent
            // (e.g. a Turkish culture would lower-case 'I' to a non-ASCII dotless i).
            string text = line.ToLowerInvariant();

            // Everything that is not a lower-case ASCII letter, digit or space becomes a space.
            // Tabs and line breaks are handled here as well, so they act as word separators
            // instead of being deleted and gluing neighbouring words together.
            text = Regex.Replace(text, "[^a-z0-9 ]", " ");

            // Remove runs of 12 or more letters.
            text = Regex.Replace(text, @"\p{L}{12,}", "");

            // Remove words that have three characters or fewer.
            text = Regex.Replace(text, @"\b\w{1,3}\b", "");

            // Only spaces separate tokens at this point; empty entries absorb repeated spaces.
            string[] tokens = text.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Terms already reported to the global dictionary during this call.
            HashSet<string> seenTerms = new HashSet<string>();

            Stemmer stemmer = new Stemmer();

            foreach (string token in tokens)
            {
                string term = stemmer.stem(token);

                // Skip stop words (checked after stemming) and any term containing a digit.
                if (StopWords.stopWordsSet.Contains(term) || term.Any(c => char.IsDigit(c)))
                {
                    continue;
                }

                addToLocalDict(term);

                // HashSet.Add returns true only on first insertion, so the global
                // dictionary is notified once per distinct term.
                if (seenTerms.Add(term))
                {
                    MegaDictionary.AddToDictionary(term);
                }
            }

            return new Document(termFreqDict, id);
        }