/// <summary>
/// Builds a <see cref="TextSource"/> in which every input string becomes one
/// <see cref="DocumentSource"/> holding that string's space-delimited tokens.
/// </summary>
/// <remarks>
/// Periods, commas and line breaks (a CR/LF pair or a lone LF) are padded with spaces
/// so that each of them ends up as a token of its own. Empty tokens are discarded.
/// </remarks>
/// <param name="documents">The raw text of each document, one string per document.</param>
/// <param name="name">Value assigned to the <c>Name</c> of the returned source.</param>
/// <returns>A new <see cref="TextSource"/> with one document per input string, in input order.</returns>
public TextSource Process(IEnumerable<string> documents, string name)
{
    TextSource result = new TextSource();
    foreach (var item in documents)
    {
        var doc = new DocumentSource();

        // Split on CR/LF first so the lone-LF padding below cannot break a CR/LF pair apart.
        // This avoids a placeholder marker, which would also rewrite any input text that
        // happens to contain the marker itself.
        var lines = item.Split(new[] { "\r\n" }, StringSplitOptions.None);
        for (int i = 0; i < lines.Length; i++)
        {
            lines[i] = lines[i].Replace("\n", " \n ");
        }

        // Re-join with a padded CR/LF so the pair survives as a single token, then pad
        // punctuation so it is treated as a separate word.
        var spaced = string.Join(" \r\n ", lines)
            .Replace(".", " . ")
            .Replace(",", " , ");

        foreach (var segment in spaced.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            doc.LanguageSegments.Add(segment);
        }
        result.Documents.Add(doc);
    }
    result.Name = name;
    return result;
}
/// <summary>
/// Tokenizes each input string on spaces and collects the tokens into one
/// <see cref="DocumentSource"/> per string, all gathered in a new <see cref="TextSource"/>.
/// </summary>
/// <remarks>
/// A period, a comma, a CR/LF pair and a lone LF are each surrounded with spaces before
/// splitting, so they appear as standalone tokens. Empty tokens are dropped.
/// </remarks>
/// <param name="documents">The raw text of each document, one string per document.</param>
/// <param name="name">Value assigned to the <c>Name</c> of the returned source.</param>
/// <returns>A new <see cref="TextSource"/> with one document per input string, in input order.</returns>
public TextSource Process(IEnumerable<string> documents, string name)
{
    TextSource result = new TextSource();
    foreach (var item in documents)
    {
        var doc = new DocumentSource();

        // Handle CR/LF by splitting on it up front: a temporary marker string would also
        // be substituted wherever the input already contains that marker literally.
        string[] chunks = item.Split(new[] { "\r\n" }, StringSplitOptions.None);
        for (int index = 0; index < chunks.Length; index++)
        {
            // Only lone LF characters remain inside a chunk at this point.
            chunks[index] = chunks[index].Replace("\n", " \n ");
        }

        // Padded CR/LF keeps the pair together as one token; punctuation is padded so it
        // is considered a separate word.
        string padded = string.Join(" \r\n ", chunks)
            .Replace(".", " . ")
            .Replace(",", " , ");

        foreach (var segment in padded.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            doc.LanguageSegments.Add(segment);
        }
        result.Documents.Add(doc);
    }
    result.Name = name;
    return result;
}
/// <summary>
/// Sums base-10 logarithms: the logarithm of <paramref name="prob_c"/> plus, for every
/// window of <paramref name="n"/> consecutive segments in <paramref name="testData"/>,
/// the logarithm of the probability that <paramref name="trainingDistribution"/> reports
/// for that window.
/// </summary>
/// <param name="trainingDistribution">Distribution queried through <c>GetProbability</c> for each n-gram.</param>
/// <param name="testData">Document whose <c>LanguageSegments</c> are scanned.</param>
/// <param name="n">Length of each n-gram window.</param>
/// <param name="prob_c">Probability whose base-10 logarithm seeds the sum.</param>
/// <returns>The accumulated base-10 log value.</returns>
public double P_c(CategoryProbabilityDistribution trainingDistribution, DocumentSource testData, int n, double prob_c)
{
    double logScore = Math.Log10(prob_c);
    var segments = testData.LanguageSegments.ToArray();

    // Last index at which a full window of n segments still fits; when n exceeds the
    // segment count this is negative and the loop body never runs.
    var lastStart = segments.Length - n;

    int start = 0;
    while (start <= lastStart)
    {
        string[] window = segments.GetNGram(start, n);
        logScore += Math.Log10(trainingDistribution.GetProbability(window));
        start++;
    }

    return logScore;
}
/// <summary>
/// Builds a <see cref="TextSource"/> in which every input string becomes one
/// <see cref="DocumentSource"/> whose segments are the string's individual characters.
/// </summary>
/// <param name="documents">The raw text of each document, one string per document.</param>
/// <param name="name">Value assigned to the <c>Name</c> of the returned source.</param>
/// <returns>A new <see cref="TextSource"/> with one document per input string, in input order.</returns>
public TextSource Process(IEnumerable<string> documents, string name)
{
    var source = new TextSource();

    foreach (string text in documents)
    {
        var document = new DocumentSource();

        // Each character is stored as its own one-character segment.
        for (int position = 0; position < text.Length; position++)
        {
            document.LanguageSegments.Add(text[position].ToString());
        }

        source.Documents.Add(document);
    }

    source.Name = name;
    return source;
}
/// <summary>
/// Splits every input string into single-character segments and returns them as a
/// <see cref="TextSource"/> containing one <see cref="DocumentSource"/> per string.
/// </summary>
/// <param name="documents">The raw text of each document, one string per document.</param>
/// <param name="name">Value assigned to the <c>Name</c> of the returned source.</param>
/// <returns>A new <see cref="TextSource"/> with one document per input string, in input order.</returns>
public TextSource Process(IEnumerable<string> documents, string name)
{
    TextSource collected = new TextSource();

    foreach (string rawText in documents)
    {
        DocumentSource current = new DocumentSource();

        // Character-level segmentation: one segment per character of the input.
        foreach (char symbol in rawText)
        {
            current.LanguageSegments.Add(char.ToString(symbol));
        }

        collected.Documents.Add(current);
    }

    collected.Name = name;
    return collected;
}