Esempio n. 1
0
        /// <summary>
        /// Groups <c>Documents</c> into three labelled clusters (news, reviews,
        /// editorials) by repeatedly re-selecting each cluster's centroid and
        /// re-assigning labels until no centroid changes between passes.
        /// </summary>
        /// <returns>
        /// The same <c>Documents</c> list. It is returned untouched when it is
        /// null or holds fewer than three documents.
        /// </returns>
        public List<Document> Classify()
        {
            // Three seeds are needed, one per cluster; bail out when that is impossible.
            if (Documents == null || Documents.Count < 3)
            {
                return Documents;
            }

            distance = new Distance();

            // Seed the clusters with the first, middle and last documents.
            int total = Documents.Count;

            newsCentroid = Documents[0];
            newsCentroid.Label = Label.news;

            reviewsCentroid = Documents[total / 2];
            reviewsCentroid.Label = Label.reviews;

            editorialCentroid = Documents[total - 1];
            editorialCentroid.Label = Label.editorials;

            AssignLabel();

            // Centroids as they were at the start of the previous pass; null
            // means "no pass has run yet", which forces the first iteration.
            Document previousNews = null;
            Document previousReviews = null;
            Document previousEditorial = null;

            while (previousNews != newsCentroid
                   || previousReviews != reviewsCentroid
                   || previousEditorial != editorialCentroid)
            {
                // Snapshot the current centroids so the loop condition can
                // detect whether this pass moved any of them.
                previousNews = newsCentroid;
                previousReviews = reviewsCentroid;
                previousEditorial = editorialCentroid;

                FixCentroids(Label.news);
                FixCentroids(Label.reviews);
                FixCentroids(Label.editorials);

                AssignLabel();
            }

            return Documents;
        }
Esempio n. 2
0
 /// <summary>
 /// Computes the distance between <paramref name="di"/> and <paramref name="dj"/>,
 /// scaled by the distance between <paramref name="di"/> and an empty document
 /// built over <paramref name="di"/>'s vocabulary.
 /// </summary>
 /// <param name="di">Document whose vocabulary defines the empty baseline.</param>
 /// <param name="dj">Document to compare against.</param>
 /// <returns>
 /// The ratio of the pair distance to the baseline distance when the baseline
 /// is strictly positive; otherwise the unscaled pair distance.
 /// </returns>
 public double GetNormalizedDistance(Document di, Document dj)
 {
     var baseline = Document.GetEmptyDocument(di.Vocabulary);
     var pairDistance = GetDistance(di, dj);
     var baselineDistance = GetDistance(di, baseline);

     // A non-positive baseline cannot be used as a divisor; fall back to the raw value.
     return baselineDistance > 0
         ? pairDistance / baselineDistance
         : pairDistance;
 }
Esempio n. 3
0
 /// <summary>
 /// Builds a document that shares the given vocabulary, has an empty term
 /// frequency table, and carries one <c>EPSILONPROBABILITY</c> entry per
 /// vocabulary term.
 /// </summary>
 /// <param name="vocabulary">Vocabulary the new document refers to (not copied).</param>
 /// <returns>The newly created document.</returns>
 public static Document GetEmptyDocument(List<string> vocabulary)
 {
     var empty = new Document
     {
         FreqDistributionOfTermsFromVocabulary = new Dictionary<string, int>(),
         Vocabulary = vocabulary,
         Probabilities = new List<double>()
     };

     // Only the number of terms matters here; each one gets the same floor probability.
     for (int remaining = vocabulary.Count; remaining > 0; remaining--)
     {
         empty.Probabilities.Add(EPSILONPROBABILITY);
     }

     return empty;
 }
Esempio n. 4
0
        /// <summary>
        /// Sums, over every term in <paramref name="di"/>'s vocabulary, the quantity
        /// (p_i - p_j) * ln(p_i / p_j), where p_i and p_j are the probabilities the
        /// two documents report for that term.
        /// </summary>
        /// <param name="di">First document; its vocabulary drives the iteration.</param>
        /// <param name="dj">Second document.</param>
        /// <returns>The accumulated distance (0 for an empty vocabulary).</returns>
        public double GetDistance(Document di, Document dj)
        {
            // Debug-only sanity check: both documents are expected to span equally sized vocabularies.
            Debug.Assert(di.Vocabulary.Count() == dj.Vocabulary.Count());

            return di.Vocabulary.Sum(term =>
            {
                var pi = di.GetProbability(term);
                var pj = dj.GetProbability(term);
                return (pi - pj) * Math.Log(pi / pj);
            });
        }
Esempio n. 5
0
        /// <summary>
        /// Re-selects the centroid of one cluster. Every document currently
        /// labelled <paramref name="category"/> is measured against that cluster's
        /// present centroid, and the member chosen by <c>GetNewCentroidIndex</c>
        /// becomes the new centroid.
        /// </summary>
        /// <param name="category">
        /// Cluster to update. Labels other than news, reviews and editorials are ignored.
        /// </param>
        internal void FixCentroids(Label category)
        {
            Document currentCentroid;
            switch (category)
            {
                case Label.reviews:
                    currentCentroid = reviewsCentroid;
                    break;

                case Label.news:
                    currentCentroid = newsCentroid;
                    break;

                case Label.editorials:
                    currentCentroid = editorialCentroid;
                    break;

                default:
                    // No centroid is tracked for any other label.
                    return;
            }

            var members = Documents.Where(x => x.Label == category).ToList();

            // measure[i] is the normalized distance from members[i] to the current centroid.
            var measure = members
                .Select(x => distance.GetNormalizedDistance(x, currentCentroid))
                .ToList();

            int index = GetNewCentroidIndex(measure);

            // Index 0 is a legitimate member position and must be accepted; the
            // previous "index > 0" guard made the first member unelectable.
            // NOTE(review): assumes GetNewCentroidIndex signals "no result" with a
            // negative value - confirm against its implementation.
            if (index < 0 || index >= members.Count)
            {
                return;
            }

            var replacement = members[index];
            switch (category)
            {
                case Label.reviews:
                    reviewsCentroid = replacement;
                    break;

                case Label.news:
                    newsCentroid = replacement;
                    break;

                case Label.editorials:
                    editorialCentroid = replacement;
                    break;
            }
        }