/// <summary>
/// Merges the document's tokens into the global <c>Vocabulary</c>,
/// adding only those not already present.
/// </summary>
/// <param name="doc">Document whose token keys are folded into the vocabulary.</param>
private void IncVocabulary(FrequencyDocument doc)
{
    foreach (var token in doc.Tokens.Keys)
    {
        // Guard against duplicates; Vocabulary's concrete type may not dedupe on Add.
        if (Vocabulary.Contains(token))
        {
            continue;
        }
        Vocabulary.Add(token);
    }
}
/// <summary>
/// Reads every file in <paramref name="folder"/>, converts each to a
/// <see cref="FrequencyDocument"/>, tags it with
/// <paramref name="classification"/>, and registers its tokens in the vocabulary.
/// </summary>
/// <param name="folder">Directory whose files are treated as training texts.</param>
/// <param name="classification">Class label assigned to every resulting document.</param>
/// <returns>The transformed documents, one per file.</returns>
private List<FrequencyDocument> TransformClassificatedDocuments(string folder, string classification)
{
    var documents = new List<FrequencyDocument>();
    foreach (var path in Directory.GetFiles(folder))
    {
        FrequencyDocument document = TransformText(File.ReadAllText(path));
        document.Classification = classification;
        IncVocabulary(document);
        documents.Add(document);
    }
    return documents;
}
/// <summary>
/// Scales the document's term frequencies to unit Euclidean (L2) length,
/// mutating <paramref name="doc"/> in place.
/// </summary>
/// <param name="doc">Document whose token weights are normalized.</param>
/// <returns>The same (mutated) document, for call chaining.</returns>
private FrequencyDocument NormalizeDocument(FrequencyDocument doc)
{
    // L2 norm of the term-frequency vector.
    double length = 0;
    foreach (var token in doc.Tokens)
    {
        length += token.Value * token.Value;
    }
    length = Math.Sqrt(length);

    // BUG FIX: an empty document (or all-zero weights) has zero length;
    // dividing by it would turn every entry into NaN. Leave it untouched.
    if (length == 0)
    {
        return doc;
    }

    // Snapshot the keys so the dictionary can be mutated while "iterating".
    foreach (var key in doc.Tokens.Keys.ToArray())
    {
        doc.Tokens[key] /= length;
    }
    return doc;
}
/// <summary>
/// Tokenizes <paramref name="text"/> and builds a raw term-frequency document:
/// non-alphanumeric characters are stripped, stop words discarded, and the
/// surviving tokens stemmed before counting.
/// </summary>
/// <param name="text">Raw input text to vectorize.</param>
/// <returns>A document with classification "-" and stem → count entries.</returns>
public FrequencyDocument TransformText(string text)
{
    string[] tokens = Tokenize(text);
    var document = new FrequencyDocument
    {
        Classification = "-",
        Tokens = new Dictionary<string, double>()
    };
    foreach (var token in tokens)
    {
        if (token == String.Empty)
        {
            continue;
        }
        string stripped = Regex.Replace(token, "[^a-zA-Z0-9]", "");
        // ROBUSTNESS FIX: pure-punctuation tokens strip down to "" and
        // previously slipped past the stop-word filter into the counts.
        if (stripped.Length == 0)
        {
            continue;
        }
        if (StopWords.stopWordsList.Contains(stripped.ToLower()))
        {
            continue;
        }
        var english = new EnglishWord(stripped);
        string stemedToken = english.Stem;
        // BUG FIX: the stemmed form was computed but the raw token was used
        // as the dictionary key, so e.g. "run" and "running" were counted as
        // distinct terms and the stemmer had no effect. Count by the stem,
        // using a single TryGetValue lookup instead of Contains + indexer.
        document.Tokens[stemedToken] =
            document.Tokens.TryGetValue(stemedToken, out double count) ? count + 1 : 1;
    }
    return document;
}