Esempio n. 1
0
 /// <summary>
 /// Merges the token keys of <paramref name="doc"/> into <c>Vocabulary</c>,
 /// adding only those keys that <c>Vocabulary</c> does not already contain.
 /// </summary>
 /// <param name="doc">Document whose <c>Tokens</c> keys are added to the vocabulary.</param>
 private void IncVocabulary(FrequencyDocument doc)
 {
     foreach (var term in doc.Tokens.Keys)
     {
         bool alreadyKnown = Vocabulary.Contains(term);
         if (alreadyKnown)
         {
             continue;
         }

         Vocabulary.Add(term);
     }
 }
Esempio n. 2
0
        /// <summary>
        /// Loads every file returned by <c>Directory.GetFiles</c> for <paramref name="folder"/>,
        /// converts its text with <c>TransformText</c>, labels the resulting document with
        /// <paramref name="classification"/> and feeds it to <c>IncVocabulary</c>.
        /// </summary>
        /// <param name="folder">Directory whose files are read.</param>
        /// <param name="classification">Label assigned to every produced document.</param>
        /// <returns>One labelled document per file, in the order the files were returned.</returns>
        private List <FrequencyDocument> TransformClassificatedDocuments(string folder, string classification)
        {
            var result = new List <FrequencyDocument>();

            foreach (string path in Directory.GetFiles(folder))
            {
                FrequencyDocument parsed = TransformText(File.ReadAllText(path));

                parsed.Classification = classification;
                IncVocabulary(parsed);
                result.Add(parsed);
            }

            return result;
        }
Esempio n. 3
0
        /// <summary>
        /// Rescales the token weights of <paramref name="doc"/> in place so that the weight
        /// vector has unit Euclidean (L2) length.
        /// </summary>
        /// <param name="doc">Document whose <c>Tokens</c> values are divided by the vector length.</param>
        /// <returns>
        /// The same <paramref name="doc"/> instance. If the vector length is zero (no tokens, or
        /// every weight is 0) the document is returned unchanged.
        /// </returns>
        private FrequencyDocument NormalizeDocument(FrequencyDocument doc)
        {
            double sumOfSquares = 0;

            foreach (double weight in doc.Tokens.Values)
            {
                sumOfSquares += weight * weight;
            }

            double length = Math.Sqrt(sumOfSquares);

            // A zero-length vector cannot be normalized: 0 / 0 would overwrite every
            // weight with NaN, which then poisons any later similarity computation.
            if (length == 0)
            {
                return doc;
            }

            // Iterate over a snapshot of the keys, because the dictionary entries are
            // reassigned inside the loop.
            foreach (string key in doc.Tokens.Keys.ToArray())
            {
                doc.Tokens[key] /= length;
            }

            return doc;
        }
Esempio n. 4
0
        /// <summary>
        /// Builds a term-frequency document from raw text.
        /// </summary>
        /// <remarks>
        /// Each token produced by <c>Tokenize</c> has every character outside <c>[a-zA-Z0-9]</c>
        /// removed. Tokens that are empty after stripping, or whose lower-cased form is in
        /// <c>StopWords.stopWordsList</c>, are skipped. The remaining tokens are counted under
        /// the value of <c>EnglishWord.Stem</c>, so tokens that differ only in stripped
        /// punctuation share one counter.
        /// NOTE(review): whether <c>Stem</c> also folds case is not visible here - confirm.
        /// </remarks>
        /// <param name="text">Raw text to convert.</param>
        /// <returns>
        /// A new <see cref="FrequencyDocument"/> with <c>Classification</c> set to "-" and
        /// <c>Tokens</c> mapping each stem to its number of occurrences in <paramref name="text"/>.
        /// </returns>
        public FrequencyDocument TransformText(string text)
        {
            var document = new FrequencyDocument
            {
                Classification = "-",
                Tokens         = new Dictionary <string, double>()
            };

            foreach (var token in Tokenize(text))
            {
                if (string.IsNullOrEmpty(token))
                {
                    continue;
                }

                string stripped = Regex.Replace(token, "[^a-zA-Z0-9]", "");

                // Tokens made up only of punctuation/symbols carry no term to count.
                if (stripped.Length == 0)
                {
                    continue;
                }

                // stripped is pure ASCII at this point, so invariant lower-casing is exact
                // and does not depend on the current culture (e.g. Turkish 'I' -> dotless i).
                if (StopWords.stopWordsList.Contains(stripped.ToLowerInvariant()))
                {
                    continue;
                }

                // Count under the stem rather than the raw token; previously the stem was
                // computed and then discarded, so stemming had no effect on the result.
                string stem = new EnglishWord(stripped).Stem;

                if (string.IsNullOrEmpty(stem))
                {
                    continue;
                }

                // TryGetValue leaves count at 0 for an unseen stem, so a single lookup
                // covers both the "first occurrence" and "increment" cases.
                double count;
                document.Tokens.TryGetValue(stem, out count);
                document.Tokens[stem] = count + 1;
            }

            return document;
        }