/// <summary>
/// Builds a <see cref="TextSource"/> in which every input document is split into
/// word-level segments.
/// </summary>
/// <remarks>
/// Periods, commas and line breaks ("\r\n" or "\n") are emitted as segments of their own.
/// All remaining text is split on the space character and empty entries are dropped.
/// </remarks>
/// <param name="documents">Raw text of the documents; one document is produced per entry.</param>
/// <param name="name">Value assigned to <c>Name</c> on the returned source.</param>
/// <returns>A new <see cref="TextSource"/> holding one tokenized document per input string.</returns>
public TextSource Process(IEnumerable<string> documents, string name)
{
    TextSource result = new TextSource();

    foreach (var item in documents)
    {
        var doc = new DocumentSource();

        // Consider punctuation marks and line breaks as separate words by padding them
        // with spaces before splitting.
        //
        // "\n" is padded first, which turns every "\r\n" into "\r \n ". That exact
        // sequence cannot come from anything else (a literal "\r \n" in the input becomes
        // "\r  \n " with two spaces), so re-joining it restores "\r\n" as a single token
        // without an in-band placeholder that could collide with real document text.
        var sItem = item.Replace(".", " . ")
                        .Replace(",", " , ")
                        .Replace("\n", " \n ")
                        .Replace("\r \n", " \r\n");

        foreach (var segment in sItem.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        {
            doc.LanguageSegments.Add(segment);
        }

        result.Documents.Add(doc);
    }

    result.Name = name;
    return result;
}
/// <summary>
/// Computes a base-10 log score for a document: log10 of <paramref name="prob_c"/> plus the
/// log10 probability of every window of <paramref name="n"/> consecutive segments.
/// </summary>
/// <param name="trainingDistribution">Distribution queried for the probability of each n-gram.</param>
/// <param name="testData">Document whose <c>LanguageSegments</c> are scored.</param>
/// <param name="n">Number of consecutive segments in each n-gram window.</param>
/// <param name="prob_c">Probability whose log10 seeds the score (presumably the category prior; confirm with callers).</param>
/// <returns>The accumulated log10 score.</returns>
public double P_c(CategoryProbabilityDistribution trainingDistribution, DocumentSource testData, int n, double prob_c)
{
    // Lower bound for a single n-gram's log10 probability; keeps log10(0) = -Infinity
    // from swallowing the whole sum.
    const double MinLogProbability = -1000;

    double logScore = Math.Log10(prob_c);

    var segments = testData.LanguageSegments;
    int lastStart = segments.Count - n;

    for (int start = 0; start <= lastStart; start++)
    {
        IEnumerable<string> window = segments.Skip(start).Take(n).ToArray();
        double logProbability = Math.Log10(trainingDistribution.GetProbability(window));

        // Math.Max propagates NaN and lifts -Infinity to the floor, matching an explicit
        // "if below the floor, use the floor" check.
        logScore += Math.Max(logProbability, MinLogProbability);
    }

    return logScore;
}
/// <summary>
/// Builds a <see cref="TextSource"/> in which every input document is split into
/// character-level segments.
/// </summary>
/// <param name="documents">Raw text of the documents; one document is produced per entry.</param>
/// <param name="name">Value assigned to <c>Name</c> on the returned source.</param>
/// <returns>
/// A new <see cref="TextSource"/> whose documents each hold one single-character string
/// segment per character of the corresponding input, in order.
/// </returns>
public TextSource Process(IEnumerable<string> documents, string name)
{
    TextSource source = new TextSource();

    foreach (var text in documents)
    {
        var document = new DocumentSource();

        // Every character, including whitespace, becomes its own segment.
        for (int index = 0; index < text.Length; index++)
        {
            document.LanguageSegments.Add(text[index].ToString());
        }

        source.Documents.Add(document);
    }

    source.Name = name;
    return source;
}