Пример #1
0
        /// <summary>
        /// Converts each document into the distinct vocabulary indices of the stemmed words it contains.
        /// </summary>
        /// <param name="docs">Documents to index. A null entry produces an empty index array.</param>
        /// <param name="vocabulary">Vocabulary; a word's index is its first position in this list.</param>
        /// <returns>
        /// One array per entry of <paramref name="docs"/>, in the same order, holding the distinct
        /// vocabulary indices found in that document in order of first appearance.
        /// </returns>
        public static int[][] GetWordIndexStemmedDocs(string[] docs, List<string> vocabulary)
        {
            // Map every word to its first position once, so each token lookup gives the
            // same answer as List.IndexOf without a linear scan of the vocabulary.
            Dictionary<string, int> vocabularyIndex = new Dictionary<string, int>();
            for (int i = 0; i < vocabulary.Count; i++)
            {
                string word = vocabulary[i];
                if (word != null && !vocabularyIndex.ContainsKey(word))
                {
                    vocabularyIndex.Add(word, i);
                }
            }

            int[][] wordIndex = new int[docs.Length][];

            // Index by position so that row i of the result always belongs to docs[i],
            // even when some documents are null.
            for (int docIndex = 0; docIndex < docs.Length; docIndex++)
            {
                string doc = docs[docIndex];
                if (doc == null)
                {
                    wordIndex[docIndex] = new int[0];
                    continue;
                }

                List<int> wordIndexDoc = new List<int>();
                foreach (string part in Tokenize(doc.ToLower()))
                {
                    // Strip non-alphanumeric characters.
                    string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                    try
                    {
                        string stem = new EnglishWord(stripped).Stem;

                        int index;
                        if (stem != null && vocabularyIndex.TryGetValue(stem, out index))
                        {
                            wordIndexDoc.Add(index);
                        }
                    }
                    catch
                    {
                        // Best effort: a token that cannot be stemmed is skipped
                        // rather than aborting the whole document.
                    }
                }

                wordIndex[docIndex] = wordIndexDoc.Distinct().ToArray();
            }

            return wordIndex;
        }
Пример #2
0
        /// <summary>
        /// Parses and tokenizes a list of documents, returning a vocabulary of the words
        /// that occur often enough across all documents.
        /// </summary>
        /// <param name="docs">Documents to process. A null entry contributes no words.</param>
        /// <param name="stemmedDocs">
        /// Receives one list per entry of <paramref name="docs"/>, in the same order, holding the
        /// non-stop-word tokens kept from that document.
        /// </param>
        /// <param name="vocabularyThreshold">
        /// Minimum number of occurrences (summed over all documents) a word needs to enter the vocabulary.
        /// </param>
        /// <returns>Vocabulary (list of strings)</returns>
        private static List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
        {
            Dictionary<string, int> wordCountList = new Dictionary<string, int>();

            stemmedDocs = new List<List<string>>();

            // Stop words are read from Data/stopwords.txt relative to the working directory.
            // A set gives constant-time membership checks for every token.
            HashSet<string> stopWords = new HashSet<string>(File.ReadAllLines(Path.Combine("Data", "stopwords.txt")));
            int docIndex = 0;

            foreach (var doc in docs)
            {
                List<string> stemmedDoc = new List<string>();

                docIndex++;

                if (docIndex % 10000 == 0)
                {
                    Console.WriteLine("Processing " + docIndex + "/" + docs.Length);
                }

                if (doc == null)
                {
                    // Keep stemmedDocs aligned with docs: a null document yields an empty list.
                    stemmedDocs.Add(stemmedDoc);
                    continue;
                }

                foreach (string part in Tokenize(doc.ToLower()))
                {
                    // Strip non-alphanumeric characters.
                    string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                    if (stopWords.Contains(stripped.ToLower()))
                    {
                        continue;
                    }

                    try
                    {
                        // NOTE(review): this stores EnglishWord.Original, while GetWordIndexStemmedDocs
                        // looks words up by EnglishWord.Stem - confirm which form the vocabulary should hold.
                        var english = new EnglishWord(stripped);
                        string stem = english.Original;

                        if (stem.Length > 0)
                        {
                            // Build the word count list; the first occurrence counts as 1.
                            int count;
                            wordCountList.TryGetValue(stem, out count);
                            wordCountList[stem] = count + 1;

                            stemmedDoc.Add(stem);
                        }
                    }
                    catch
                    {
                        // Best effort: a token that EnglishWord cannot process is skipped
                        // rather than aborting the whole run.
                    }
                }

                stemmedDocs.Add(stemmedDoc);
            }

            // Keep the words that reach the occurrence threshold.
            return wordCountList.Where(w => w.Value >= vocabularyThreshold)
                                .Select(w => w.Key)
                                .ToList();
        }