Пример #1
0
        /// <summary>
        /// Factory intended for tests: constructs an <see cref="EnglishWord"/> with its
        /// parameterless constructor and then initializes it by calling <c>Create</c>.
        /// </summary>
        /// <param name="text">The text passed to <c>Create</c> on the new instance.</param>
        /// <returns>The newly constructed and initialized <see cref="EnglishWord"/>.</returns>
        internal static EnglishWord CreateForTest(string text)
        {
            EnglishWord instance = new EnglishWord();
            instance.Create(text);

            return instance;
        }
Пример #2
0
        /// <summary>
        /// Stems every document and builds the vocabulary of stems that occur at least
        /// <paramref name="vocabularyThreshold"/> times across all documents.
        /// </summary>
        /// <param name="docs">The raw documents to process.</param>
        /// <param name="stemmedDocs">
        /// Receives one list per input document (in input order) holding the non-empty stems
        /// of that document's tokens that are not stop words.
        /// </param>
        /// <param name="vocabularyThreshold">
        /// Minimum total number of occurrences a stem needs in order to be included in the result.
        /// </param>
        /// <returns>The stems whose total occurrence count reaches the threshold.</returns>
        private static List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs,
                                                  int vocabularyThreshold)
        {
            var stemCounts = new Dictionary<string, int>();

            stemmedDocs = new List<List<string>>();

            var docIndex = 0;

            foreach (var doc in docs)
            {
                var stemmedDoc = new List<string>();

                docIndex++;

                // Progress report every 100 documents.
                if (docIndex % 100 == 0)
                {
                    Console.WriteLine("Processing " + docIndex + "/" + docs.Length);
                }

                foreach (var part in WordsProcessing(doc))
                {
                    // Keep ASCII letters and digits only.
                    var stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                    // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a
                    // dotless 'ı' under Turkish cultures, which would let stop words slip through.
                    if (StopWords.StopWordsList.Contains(stripped.ToLowerInvariant()))
                    {
                        continue;
                    }

                    try
                    {
                        var stem = new EnglishWord(stripped).Stem;

                        if (stem.Length > 0)
                        {
                            // A missing key leaves count at 0, so the first occurrence is
                            // recorded as 1 (it was previously stored as 0, undercounting
                            // every stem by one).
                            int count;
                            stemCounts.TryGetValue(stem, out count);
                            stemCounts[stem] = count + 1;

                            stemmedDoc.Add(stem);
                        }
                    }
                    catch (Exception e)
                    {
                        // Log for diagnostics, then propagate to the caller with the
                        // original stack trace intact.
                        Console.WriteLine(e);
                        throw;
                    }
                }

                stemmedDocs.Add(stemmedDoc);
            }

            return stemCounts.Where(entry => entry.Value >= vocabularyThreshold)
                             .Select(entry => entry.Key)
                             .ToList();
        }