/// <summary>
/// Groups <c>Documents</c> into three clusters (news, reviews, editorials).
/// Three documents are picked as initial centroids, every document is labelled via
/// <c>AssignLabel</c>, and then centroid correction and relabelling alternate until
/// none of the three centroid references changes during a pass.
/// </summary>
/// <returns>
/// The <c>Documents</c> collection itself. It is returned untouched when it is null
/// or holds fewer than three documents.
/// </returns>
public List<Document> Classify()
{
    // One seed per cluster is needed, so anything smaller is handed back as-is.
    if (Documents == null)
    {
        return Documents;
    }

    int documentCount = Documents.Count;
    if (documentCount < 3)
    {
        return Documents;
    }

    distance = new Distance();

    // Seeds: first, middle and last document, each stamped with its cluster label.
    newsCentroid = Documents[0];
    newsCentroid.Label = Label.news;

    reviewsCentroid = Documents[documentCount / 2];
    reviewsCentroid.Label = Label.reviews;

    editorialCentroid = Documents[documentCount - 1];
    editorialCentroid.Label = Label.editorials;

    AssignLabel();

    // Centroids as they were at the start of the previous pass; null before the first pass.
    Document previousNews = null;
    Document previousReviews = null;
    Document previousEditorial = null;

    while (true)
    {
        // Converged once a full pass left all three centroids where they were.
        if (previousNews == newsCentroid
            && previousReviews == reviewsCentroid
            && previousEditorial == editorialCentroid)
        {
            break;
        }

        previousNews = newsCentroid;
        previousReviews = reviewsCentroid;
        previousEditorial = editorialCentroid;

        FixCentroids(Label.news);
        FixCentroids(Label.reviews);
        FixCentroids(Label.editorials);

        AssignLabel();
    }

    return Documents;
}
/// <summary>
/// Distance from <paramref name="di"/> to <paramref name="dj"/>, scaled by the distance
/// from <paramref name="di"/> to an empty document built over the same vocabulary.
/// </summary>
/// <param name="di">Document the distance is measured from; its vocabulary defines the empty baseline.</param>
/// <param name="dj">Document the distance is measured to.</param>
/// <returns>
/// The ratio of the two distances when the baseline distance is strictly positive;
/// otherwise the raw, unscaled distance between the two documents.
/// </returns>
public double GetNormalizedDistance(Document di, Document dj)
{
    var baselineDocument = Document.GetEmptyDocument(di.Vocabulary);

    var pairDistance = GetDistance(di, dj);
    var baselineDistance = GetDistance(di, baselineDocument);

    // A non-positive baseline cannot be used as a divisor, so fall back to the raw value.
    return baselineDistance > 0
        ? pairDistance / baselineDistance
        : pairDistance;
}
/// <summary>
/// Builds a document that contains no term counts: its frequency table is empty and
/// every vocabulary entry is given the probability <c>EPSILONPROBABILITY</c>.
/// </summary>
/// <param name="vocabulary">
/// Vocabulary the document is defined over. The list is stored by reference, not copied.
/// </param>
/// <returns>A new document with one probability entry per vocabulary term.</returns>
public static Document GetEmptyDocument(List<string> vocabulary)
{
    Document emptyDocument = new Document();
    emptyDocument.FreqDistributionOfTermsFromVocabulary = new Dictionary<string, int>();
    emptyDocument.Vocabulary = vocabulary;
    emptyDocument.Probabilities = new List<double>();

    // One epsilon entry per vocabulary term, in vocabulary order.
    for (int termIndex = 0; termIndex < vocabulary.Count; termIndex++)
    {
        emptyDocument.Probabilities.Add(EPSILONPROBABILITY);
    }

    return emptyDocument;
}
/// <summary>
/// Sums, over every term in the vocabulary of <paramref name="di"/>, the quantity
/// (p_i - p_j) * ln(p_i / p_j), where p_i and p_j are the probabilities the two
/// documents report for that term.
/// </summary>
/// <param name="di">First document; its vocabulary drives the iteration.</param>
/// <param name="dj">Second document; asserted (debug builds only) to have a vocabulary of the same size.</param>
/// <returns>The accumulated sum over all terms.</returns>
public double GetDistance(Document di, Document dj)
{
    Debug.Assert(di.Vocabulary.Count() == dj.Vocabulary.Count());

    var terms = di.Vocabulary;
    double accumulated = 0;

    for (int position = 0; position < terms.Count; position++)
    {
        var currentTerm = terms[position];

        var probabilityInFirst = di.GetProbability(currentTerm);
        var probabilityInSecond = dj.GetProbability(currentTerm);

        var difference = probabilityInFirst - probabilityInSecond;
        var logRatio = Math.Log(probabilityInFirst / probabilityInSecond);

        accumulated += difference * logRatio;
    }

    return accumulated;
}
/// <summary>
/// Re-selects the centroid of one cluster. Every document currently labelled
/// <paramref name="category"/> is measured against that cluster's current centroid with
/// <c>GetNormalizedDistance</c>; the resulting list is handed to <c>GetNewCentroidIndex</c>,
/// and the member at the returned position becomes the new centroid.
/// </summary>
/// <param name="category">
/// Cluster to update. Only news, reviews and editorials are handled; any other value
/// leaves all centroids unchanged.
/// </param>
/// <remarks>
/// The centroid is left as it was when the returned index falls outside the member list
/// (for example when the cluster is empty).
/// </remarks>
internal void FixCentroids(Label category)
{
    // Pick the centroid that belongs to the requested cluster.
    Document currentCentroid;
    switch (category)
    {
        case Label.reviews:
            currentCentroid = reviewsCentroid;
            break;
        case Label.news:
            currentCentroid = newsCentroid;
            break;
        case Label.editorials:
            currentCentroid = editorialCentroid;
            break;
        default:
            return;
    }

    var members = Documents.Where(x => x.Label == category).ToList();
    var measure = members
        .Select(x => distance.GetNormalizedDistance(x, currentCentroid))
        .ToList();

    int index = GetNewCentroidIndex(measure);

    // Index 0 is a valid member position. The previous "index > 0" check skipped it,
    // so a cluster whose best candidate was its first member never got updated.
    if (index < 0 || index >= members.Count)
    {
        return;
    }

    Document replacement = members[index];
    switch (category)
    {
        case Label.reviews:
            reviewsCentroid = replacement;
            break;
        case Label.news:
            newsCentroid = replacement;
            break;
        case Label.editorials:
            editorialCentroid = replacement;
            break;
    }
}