Example No. 1
        public override string Classify(DocumentVector Vector)
        {
            List<KeyValuePair<LabeledDocumentVector,double>> Sorted = Sort(Vector);

            Dictionary<String, double> LabelCount = new Dictionary<string, double>();
            int startIndex = Sorted.Count - 1;

            // vote among the K most similar labeled documents (they sit at the tail of Sorted)
            for (int i = 0; i < K; i++)
            {
                String Classification = Sorted[startIndex - i].Key.Classification;
                if (!LabelCount.ContainsKey(Classification))
                    LabelCount.Add(Classification, 0);

                // TODO: weight each vote by its similarity instead of adding a flat 1 (see the sketch after this example)
                LabelCount[Classification]++;
            }

            double Max = double.MinValue;
            String Output = null;
            foreach (KeyValuePair<String, double> pair in LabelCount)
            {
                if (pair.Value > Max)
                {
                    Max = pair.Value;
                    Output = pair.Key;
                }
            }

            return Output;
        }
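
The TODO above points at a natural refinement: let each of the K nearest neighbours vote with its similarity score instead of a flat count. A minimal sketch of that variant, reusing the same Sorted list and K field; the helper name WeightedClassify is hypothetical:

        // Hypothetical variant: each of the K nearest neighbours votes with its similarity score.
        private string WeightedClassify(List<KeyValuePair<LabeledDocumentVector, double>> Sorted)
        {
            Dictionary<string, double> labelWeight = new Dictionary<string, double>();
            int startIndex = Sorted.Count - 1;

            for (int i = 0; i < K && i < Sorted.Count; i++)
            {
                KeyValuePair<LabeledDocumentVector, double> neighbour = Sorted[startIndex - i];
                string label = neighbour.Key.Classification;
                if (!labelWeight.ContainsKey(label))
                    labelWeight.Add(label, 0);

                labelWeight[label] += neighbour.Value;   // similarity-weighted vote instead of +1
            }

            string output = null;
            double max = double.MinValue;
            foreach (KeyValuePair<string, double> pair in labelWeight)
            {
                if (pair.Value > max)
                {
                    max = pair.Value;
                    output = pair.Key;
                }
            }
            return output;
        }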
Example No. 2
        // Farthest-first selection used by the k-means++ style seeding: pick the document
        // whose closest existing centroid is as far away as possible (cosSimilarity is
        // treated as a distance here, as it is in getClosestCentroid).
        private DocumentVector nextCentroidVector(List <DocumentVector> vectors, List <CentroidVector> centroids)
        {
            double         maxClosedDistance = 0.0;
            DocumentVector maxClosedVector   = null;

            foreach (DocumentVector vector in vectors)
            {
                double minDistance = Double.MaxValue;
                foreach (CentroidVector centroid in centroids)
                {
                    double currentDistance = cosSimilarity(vector, centroid);
                    if (minDistance > currentDistance)
                    {
                        minDistance = currentDistance;
                    }
                }

                if (maxClosedDistance < minDistance)
                {
                    maxClosedDistance = minDistance;
                    maxClosedVector   = vector;
                }
            }

            return(maxClosedVector);
        }
Example No. 3
        private NormalizedVector Normalize(String id, DocumentVector doc)
        {
            NormalizedVector vec = new NormalizedVector(id);

            // find max value
            int max = 0;

            foreach (KeyValuePair <string, int> term in doc)
            {
                if (term.Value > max)
                {
                    max = term.Value;
                }
            }

            lengths.Add(id, 0);

            // add normalized frequencies
            foreach (KeyValuePair <string, int> term in doc)
            {
                double tf = term.Value / (double)max;
                vec.Add(term.Key, tf);
                lengths[id] += Math.Pow(tf, 2);
            }

            lengths[id] = Math.Sqrt(lengths[id]);

            return(vec);
        }
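
For intuition: a document with raw counts {the: 6, vector: 3, space: 2} has max = 6, so the normalised frequencies are 1.0, 0.5 and 0.333, and the stored length is sqrt(1.0^2 + 0.5^2 + 0.333^2) ≈ 1.17. A standalone sketch of the same normalisation over plain dictionaries (DocumentVector and NormalizedVector are not required here):

        // Standalone sketch: maximum-frequency normalisation plus the Euclidean length used later for cosine.
        private static Dictionary<string, double> NormalizeCounts(Dictionary<string, int> counts, out double length)
        {
            int max = 0;
            foreach (int value in counts.Values)
            {
                if (value > max) max = value;
            }

            var normalized = new Dictionary<string, double>();
            double sumOfSquares = 0;
            foreach (KeyValuePair<string, int> term in counts)
            {
                double tf = term.Value / (double)max;   // divide by the most frequent term's count
                normalized[term.Key] = tf;
                sumOfSquares += tf * tf;
            }

            length = Math.Sqrt(sumOfSquares);
            return normalized;
        }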
Example No. 4
        // optimised bubble sort: only K passes are made, so the cost is O(Kn),
        // i.e. linear in n for a fixed K; only the K most similar entries are
        // guaranteed to end up in their final positions at the tail of the list
        private List<KeyValuePair<LabeledDocumentVector, double>> Sort( DocumentVector item )
        {
            // first calculate the comparison (similarity) values
            List<KeyValuePair<LabeledDocumentVector,double>> Values = new List<KeyValuePair<LabeledDocumentVector,double>>();
            for (int i = 0; i < TrainingData.Count; i++)
            {
                Values.Add( new KeyValuePair<LabeledDocumentVector,double>(
                                TrainingData[i], Comparer.Compare(TrainingData[i].Document, item)));
            }

            int j = 0;
            bool flag = false;

            do
            {
                j++;
                flag = false;
                for (int i = 0; i < Values.Count - j; i++)
                {
                    if ( Values[i].Value > Values[i+1].Value  )
                    {
                        KeyValuePair<LabeledDocumentVector,double> dummy = Values[i];
                        Values[i] = Values[i + 1];
                        Values[i + 1] = dummy;
                        flag = true;
                    }
                }

            } while (flag && j<K);

            return Values;
        }
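
For comparison, a fully sorted LINQ equivalent of the selection above (a hypothetical helper; it assumes the same TrainingData and Comparer members plus System.Linq, and sorts everything rather than just the K tail entries):

        // Hypothetical LINQ equivalent: sorts everything, which is a superset of what Classify needs.
        private List<KeyValuePair<LabeledDocumentVector, double>> SortWithLinq(DocumentVector item)
        {
            return TrainingData
                   .Select(t => new KeyValuePair<LabeledDocumentVector, double>(t, Comparer.Compare(t.Document, item)))
                   .OrderBy(pair => pair.Value)   // ascending, so the most similar documents end up last
                   .ToList();
        }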
        private static DocDetails GetAllDetails(List <DocumentVector> docCollection, DocumentVector seedPoint, DocDetails docDetails)
        {
            float[] Weights = new float[docCollection.Count];
            float   minD    = float.MaxValue;
            float   Sum     = 0;
            int     i       = 0;

            foreach (DocumentVector point in docCollection)
            {
                if (point == seedPoint) // distance to itself is 0; record it so Weights stays aligned with docCollection
                {
                    Weights[i] = 0;
                    i++;
                    continue;
                }

                Weights[i] = KMeansPlus.GetEucliedeanDistance(point, seedPoint);
                Sum       += Weights[i];
                if (Weights[i] < minD)
                {
                    minD = Weights[i];
                }
                i++;
            }

            docDetails.SeedDocVect = seedPoint;
            docDetails.Weights     = Weights;
            docDetails.Sum         = Sum;
            docDetails.MinD        = minD;

            return(docDetails);
        }
 public SearchResultDocument(Uri searchUri, String originatingFoodName, Uri documentUri, DocumentVector documentVector)
 {
     SearchUri = searchUri;
     OriginatingFoodName = originatingFoodName;
     DocumentUri = documentUri;
     DocumentVector = documentVector;
 }
Example No. 7
        public IList <DocumentVector <T> > BuildVectorSpace(IList <Document <T> > documents)
        {
            var distinctTerms       = new HashSet <string>();
            var documentVectorSpace = new List <DocumentVector <T> >();
            DocumentVector <T> _documentVector;

            float[] space;

            documentValues = documents.Select(d => SplitExpression.Split(d.ToString().ToLower())).Where(x => x.Length >= 2).ToList();

            foreach (var documentContent in documents)
            {
                foreach (string term in SplitExpression.Split(documentContent.ToString()).Where(t => t.Length >= 2))
                {
                    distinctTerms.Add(term);
                }
            }

            // Materialise the term set once so every document maps a given term to the same index.
            var termList = distinctTerms.ToList();

            foreach (var document in documents)
            {
                space = new float[termList.Count];
                // The original Parallel.ForEach incremented a shared counter, which races and
                // scrambles term positions; Parallel.For over fixed indices keeps them stable.
                Parallel.For(0, termList.Count, i =>
                {
                    space[i] = FindTFIDF(document.ToString(), termList[i]);
                });
                _documentVector             = new DocumentVector <T>();
                _documentVector.Content     = document.GetData();
                _documentVector.VectorSpace = space;
                documentVectorSpace.Add(_documentVector);
            }

            return(documentVectorSpace);
        }
Example No. 8
        private NormalizedVector Normalize(String id, DocumentVector doc)
        {
            NormalizedVector vec = new NormalizedVector(id);

            // find max value
            int max = 0;
            foreach (KeyValuePair<string, int> term in doc)
            {
                if (term.Value > max)
                {
                    max = term.Value;
                }
            }

            lengths.Add(id, 0);

            // add normalized frequencies
            foreach (KeyValuePair<string, int> term in doc)
            {
                double tf = term.Value / (double)max;
                vec.Add(term.Key, tf);
                lengths[id] += Math.Pow(tf, 2);
            }

            lengths[id] = Math.Sqrt(lengths[id]);

            return vec;
        }
Example No. 9
        private static List <DocumentVector> transformTFIDFs2Vectors(List <string> documents, Dictionary <string, Dictionary <string, double> > tfidfs)
        {
            ISet <string> wordSet = getWordSetFromTFIDFs(tfidfs);

            if (wordSet == null)
            {
                return(null);
            }

            List <DocumentVector> vectors = new List <DocumentVector>(tfidfs.Count);
            int segmentIndex = 0;

            foreach (var tfidfItem in tfidfs)
            {
                DocumentVector vector = new DocumentVector();
                foreach (string word in wordSet)
                {
                    if (tfidfItem.Value.ContainsKey(word))
                    {
                        vector.addWeight(tfidfItem.Value[word]);
                    }
                    else
                    {
                        vector.addWeight(0.0);
                    }
                }
                vector.setLabel(documents[segmentIndex]);
                vectors.Add(vector);
                segmentIndex++;
            }

            return(vectors);
        }
        public override void Compute()
        {
            int              numDocs = (int)Workspace.Load("NumberOfDocuments");
            DocumentVector   df      = (DocumentVector)Workspace.Load("DocumentFrequencies");
            NormalizedVector idf     = Models.InverseDocumentFrequency.Compute(df, numDocs);

            Workspace.Store("InverseDocumentFrequencies", idf);
        }
Example No. 11
        private CentroidVector transform2CentroidVector(DocumentVector documentVector)
        {
            CentroidVector centroid = new CentroidVector();

            foreach (double weight in documentVector.getWeightVector())
            {
                centroid.addWeight(weight);
            }

            return(centroid);
        }
        private static float GetDocumentDistance(DocumentVector doc1, DocumentVector doc2)
        {
            var dist = 0.0f;

            for (var i = 0; i < doc1.VectorSpace.Length; i++)
            {
                dist += (float)Math.Pow((double)(doc1.VectorSpace[i] - doc2.VectorSpace[i]), 2.0);
            }
            dist = (float)Math.Pow((double)dist, 0.5);
            return(dist);
        }
        public static NormalizedVector Compute(DocumentVector df, int numDocs)
        {
            NormalizedVector idf = new NormalizedVector("InverseDocumentFrequencies");

            foreach (KeyValuePair<string, int> kvp in df)
            {
                idf.Add(kvp.Key, Math.Log(numDocs / (double) kvp.Value, 2));
            }

            return idf;
        }
 /*
  * Instead of looking at every term across all documents,
  * only look at the terms in the query, because all other terms
  * will be 0, resulting in q*d=0.
  * Typically the number of terms in a query is less than
  * the number of terms in a document.
  */
 private static double ComputeProduct(DocumentVector query, NormalizedVector doc)
 {
     double val = 0;
     foreach (KeyValuePair<string, int> term in query)
     {
         double d;
         doc.TryGetValue(term.Key, out d);
         val += term.Value * d;
     }
     return val;
 }
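
A quick worked check of the sparse product: for a query {vector: 2, space: 1} and a normalised document {vector: 0.5, cluster: 0.3}, only "vector" is shared, so q·d = 2 × 0.5 = 1.0; "space" contributes nothing because TryGetValue leaves d at its default of 0. A hypothetical call sketch, using the Add methods seen in the other examples:

 // Hypothetical usage: only the shared term "vector" contributes to the product.
 DocumentVector query = new DocumentVector("query");
 query.Add("vector", 2);
 query.Add("space", 1);

 NormalizedVector doc = new NormalizedVector("doc1");
 doc.Add("vector", 0.5);
 doc.Add("cluster", 0.3);

 double product = ComputeProduct(query, doc);   // 2 * 0.5 = 1.0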
Example No. 15
        public static NormalizedVector Compute(DocumentVector df, int numDocs)
        {
            NormalizedVector idf = new NormalizedVector("InverseDocumentFrequencies");

            foreach (KeyValuePair <string, int> kvp in df)
            {
                idf.Add(kvp.Key, Math.Log(numDocs / (double)kvp.Value, 2));
            }

            return(idf);
        }
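
In other words, idf(t) = log2(numDocs / df(t)): with numDocs = 8, a term found in 2 documents gets idf = log2(8/2) = 2, and a term found in all 8 gets idf = 0. A hypothetical usage sketch, built with the DocumentVector constructor and Add method that appear in the Vectorizer examples:

        // Hypothetical usage: document frequencies for three terms over an 8-document corpus.
        DocumentVector df = new DocumentVector("DocumentFrequencies");
        df.Add("vector", 2);     // idf = log2(8/2) = 2
        df.Add("the", 8);        // idf = log2(8/8) = 0
        df.Add("cluster", 1);    // idf = log2(8/1) = 3

        NormalizedVector idf = Compute(df, 8);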
Example No. 16
        public Vectorizer(TLArtifactsCollection artifacts, String representation)
        {
            vectors = new DocumentVectorCollection();
            freq = new DocumentVector("DocumentFrequencies");

            foreach (KeyValuePair<string, TLArtifact> kvp in artifacts)
            {
                // vars
                String docID = kvp.Value.Id;
                String[] words = kvp.Value.Text.Split(' ');

                // create new document representation
                DocumentVector vec = new DocumentVector(docID);
                List<String> addedWords = new List<String>();

                // loop over each word and update its frequency
                foreach (String word in words)
                {
                    // update term-doc frequency only ONCE per document
                    if (!freq.ContainsKey(word))
                    {
                        freq.Add(word, 1);
                        addedWords.Add(word);
                    }
                    else if (!addedWords.Contains(word))
                    {
                        freq[word]++;
                        addedWords.Add(word);
                    }

                    // update word frequency
                    if (!vec.ContainsKey(word))
                    {
                        vec.Add(word, 1);
                    }
                    else
                    {
                        if (representation == "Ordinal")
                        {
                            vec[word]++;
                        }
                    }
                    // update MaxFreq
                    if (vec[word] > vec.MaxFreq.Value)
                    {
                        vec.MaxFreq = new KeyValuePair<string, int>(word, vec[word]);
                    }
                }

                // add document to vector collection
                vectors.Add(vec);
            }
        }
Example No. 17
        private double distance(DocumentVector vector1, DocumentVector vector2)
        {
            // Euclidean distance between the two weight vectors.
            var    weights1   = vector1.getWeightVector();
            var    weights2   = vector2.getWeightVector();
            int    dimensions = weights1.Count;
            double distance   = 0.0;

            for (int index = 0; index < dimensions; index++)
            {
                double diff = weights1[index] - weights2[index];
                distance += diff * diff;
            }

            return(Math.Sqrt(distance));
        }
Example No. 18
        /*
         * Instead of looking at every term across all documents,
         * only look at the terms in the query, because all other terms
         * will be 0, resulting in q*d=0.
         * Typically the number of terms in a query is less than
         * the number of terms in a document.
         */
        private static double ComputeProduct(DocumentVector query, NormalizedVector doc)
        {
            double val = 0;

            foreach (KeyValuePair <string, int> term in query)
            {
                double d;
                doc.TryGetValue(term.Key, out d);
                val += term.Value * d;
            }
            return(val);
        }
        private static Centroid chose_Random_Centroid(List <string> docCollection, List <DocumentVector> vSpace, int document_Collection_length)
        {
            Centroid firstCentroid = new Centroid();

            firstCentroid.GroupedDocument = new List <DocumentVector>();
            Random         rand        = new Random();
            int            index       = rand.Next(0, document_Collection_length);
            DocumentVector firstvector = vSpace[index];

            firstCentroid.GroupedDocument.Add(firstvector);
            return(firstCentroid);
        }
Example No. 20
        public Vectorizer(TLArtifactsCollection artifacts, String representation)
        {
            vectors = new DocumentVectorCollection();
            freq    = new DocumentVector("DocumentFrequencies");

            foreach (KeyValuePair <string, TLArtifact> kvp in artifacts)
            {
                // vars
                String   docID = kvp.Value.Id;
                String[] words = kvp.Value.Text.Split(' ');

                // create new document representation
                DocumentVector vec        = new DocumentVector(docID);
                List <String>  addedWords = new List <String>();

                // loop over each word and update its frequency
                foreach (String word in words)
                {
                    // update term-doc frequency only ONCE per document
                    if (!freq.ContainsKey(word))
                    {
                        freq.Add(word, 1);
                        addedWords.Add(word);
                    }
                    else if (!addedWords.Contains(word))
                    {
                        freq[word]++;
                        addedWords.Add(word);
                    }

                    // update word frequency
                    if (!vec.ContainsKey(word))
                    {
                        vec.Add(word, 1);
                    }
                    else
                    {
                        if (representation == "Ordinal")
                        {
                            vec[word]++;
                        }
                    }
                    // update MaxFreq
                    if (vec[word] > vec.MaxFreq.Value)
                    {
                        vec.MaxFreq = new KeyValuePair <string, int>(word, vec[word]);
                    }
                }

                // add document to vector collection
                vectors.Add(vec);
            }
        }
Example No. 21
        /*
         * public double ComputeTDFDistance(DetailedDocumentVector doc2)
         * {
         *  double result = 0.0;
         *  if (this.GetTDFDimensions() != doc2.GetTDFDimensions())
         *      throw new ArgumentOutOfRangeException();
         *  for (var i = 0; i < doc2.GetTDFDimensions(); i++)
         *      result += Math.Pow(Math.Abs(tDF[i] - doc2.TDF[i]), 2.0);
         *  return result;
         *
         * }
         *
         * public double ComputeIDFDistance(DetailedDocumentVector doc2)
         * {
         *  double result = 0.0;
         *  if (this.GetIDFDimensions() != doc2.GetIDFDimensions())
         *      throw new ArgumentOutOfRangeException();
         *  for (var i = 0; i < doc2.GetIDFDimensions(); i++)
         *      result += Math.Pow(Math.Abs(iDF[i] - doc2.IDF[i]), 2.0);
         *  return result;
         *
         * }
         */
        #endregion

        public float ComputeTFIDFDistance(DocumentVector doc2)
        {
            float result = 0;

            if (this.GetTFIDFDimensions() != doc2.VectorSpace.Length)
            {
                throw new ArgumentOutOfRangeException();
            }
            for (int i = 0; i < doc2.VectorSpace.Length; i++)
            {
                result += (float)Math.Pow(Math.Abs(tfIDF[i] - doc2.VectorSpace[i]), 2);
            }
            return(result);
        }
Example No. 22
 protected CentroidsKMeansPPKP FindNearestClusterCenter(DocumentVector doc)
 {
     // dimensions acts as the initial upper bound for the nearest-centre search.
     var minDistance = (double)dimensions;
     CentroidsKMeansPPKP bestClusterCenter = clusters.First();
     foreach (var cluster in clusters)
     {
         var distance = cluster.ComputeTFIDFDistance(doc);
         if (distance < minDistance)
         {
             bestClusterCenter = cluster;
             minDistance = distance;
         }
     }
     return bestClusterCenter;
 }
        public void FactMethodName()
        {
            var sut = Sys.ActorOf(Props.Create(() => new DocumentScoringActor()));
            var searchUri = new Uri("http://google.com");
            var foodTerms = new FoodNameTerms("food stuff, mucho");
            var documentUri = new Uri("http://example.com");
            var documentVector = new DocumentVector(new[] { "foo", "bar" });
            var document = new SearchResultDocument(searchUri, foodTerms.FoodName, documentUri, documentVector);
            var compareTerms = foodTerms;
            var originatingTerms = foodTerms;
            sut.Tell(new ScoreDocumentRequestMessage(document, compareTerms, originatingTerms));

            var result = ExpectMsg<ScoreDocumentResultMessage>(duration: TimeSpan.FromMinutes(2));
            Assert.NotNull(result);
        }
Example No. 24
        private CentroidVector getClosestCentroid(List <CentroidVector> centroids, DocumentVector documentVector)
        {
            double         minDistance       = Double.MaxValue;
            CentroidVector minCentroidVector = null;

            foreach (CentroidVector centroid in centroids)
            {
                double currentDistance = cosSimilarity(documentVector, centroid);
                if (minDistance > currentDistance)
                {
                    minDistance       = currentDistance;
                    minCentroidVector = centroid;
                }
            }

            return(minCentroidVector);
        }
        private static float Move(DocumentVector documentVector1, DocumentVector documentVector2, float G)
        {
            int length = documentVector1.VectorSpace.Count();

            float[] d        = new float[length];
            var     distance = GetDocumentDistance(documentVector1, documentVector2);

            for (int i = 0; i < length; i++)
            {
                d[i] = documentVector2.VectorSpace[i] - documentVector1.VectorSpace[i];
            }
            for (var i = 0; i < length; i++)
            {
                documentVector1.VectorSpace[i] = documentVector1.VectorSpace[i] + d[i] * (G / (float)Math.Pow(distance, 3.0));
            }
            return(distance);
        }
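
Move pulls documentVector1 toward documentVector2 by (v2[i] - v1[i]) * G / distance^3, so the attraction falls off quickly: with G = 7e-6 and a distance of 0.1, each coordinate shifts by the coordinate difference times 7e-6 / 0.001 = 0.007. When the two vectors coincide the distance is 0 and the scale factor divides by zero, so a caller may want a guard like the hypothetical one below:

        // Hypothetical guard: skip the move for (near-)coincident vectors to avoid division by zero.
        private static float SafeMove(DocumentVector a, DocumentVector b, float G, float epsilon)
        {
            var distance = GetDocumentDistance(a, b);
            if (distance <= epsilon)
            {
                return distance;   // already effectively merged; leave a unchanged
            }
            return Move(a, b, G);
        }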
Example No. 26
        public void FactMethodName()
        {
            var sut              = Sys.ActorOf(Props.Create(() => new DocumentScoringActor()));
            var searchUri        = new Uri("http://google.com");
            var foodTerms        = new FoodNameTerms("food stuff, mucho");
            var documentUri      = new Uri("http://example.com");
            var documentVector   = new DocumentVector(new[] { "foo", "bar" });
            var document         = new SearchResultDocument(searchUri, foodTerms.FoodName, documentUri, documentVector);
            var compareTerms     = foodTerms;
            var originatingTerms = foodTerms;

            sut.Tell(new ScoreDocumentRequestMessage(document, compareTerms, originatingTerms));

            var result = ExpectMsg <ScoreDocumentResultMessage>(duration: TimeSpan.FromMinutes(2));

            Assert.NotNull(result);
        }
Example No. 27
        /// <summary>
        /// Takes an input document vector and returns the perceptron's
        /// guess at its classification, using the training data provided
        /// </summary>
        /// <param name="Vector">Input document vector to classify</param>
        /// <returns>String label classification</returns>
        public override string Classify(DocumentVector Vector)
        {
            double Max = double.MinValue;
            string classification = null;

            foreach( KeyValuePair<string, int> pair in LabelsDictionary )
            {
                VectorN w = W[pair.Value];
                double result = Mult(w,Vector.Vector);

                if (result > Max)
                {
                    classification = pair.Key;
                    Max = result;
                }
            }

            return classification;
        }
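
Classification here is the argmax over the per-label scores w·x. Mult itself is not shown in these snippets; a hypothetical stand-in over plain double arrays illustrates the dot product it is assumed to compute:

        // Hypothetical stand-in for Mult: dot product of a weight vector and a feature vector.
        private static double DotProduct(double[] w, double[] x)
        {
            double sum = 0;
            int n = Math.Min(w.Length, x.Length);

            for (int i = 0; i < n; i++)
            {
                sum += w[i] * x[i];
            }
            return sum;
        }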
Example No. 28
        private List <CentroidVector> randomCentroidVectorList2(List <DocumentVector> vectors)
        {
            List <CentroidVector> centroids = new List <CentroidVector>();
            Random randomSeed = new Random(1);

            for (int index = 0; index < k; index++)
            {
                int            randomIndex    = randomSeed.Next(vectors.Count);
                DocumentVector documentVector = vectors[randomIndex];
                CentroidVector centroid       = new CentroidVector();
                foreach (double weight in documentVector.getWeightVector())
                {
                    centroid.addWeight(weight);
                }
                centroids.Add(centroid);
            }

            return(centroids);
        }
Example No. 29
        private double cos(DocumentVector vector1, DocumentVector vector2)
        {
            // Cosine similarity: dot product divided by the product of the vector lengths.
            var    weights1         = vector1.getWeightVector();
            var    weights2         = vector2.getWeightVector();
            int    dimensions       = weights1.Count;
            double numerator        = 0.0;
            double denominatorLeft  = 0.0;
            double denominatorRight = 0.0;

            for (int index = 0; index < dimensions; index++)
            {
                numerator        += (weights1[index] * weights2[index]);
                denominatorLeft  += (weights1[index] * weights1[index]);
                denominatorRight += (weights2[index] * weights2[index]);
            }

            double denominator = Math.Sqrt(denominatorLeft) * Math.Sqrt(denominatorRight);

            return(numerator / denominator);
        }
        private int FindClosestClusterCenter(List <Centeroid <T> > clusterCenter, DocumentVector <T> obj)
        {
            float[] similarityMeasure = new float[clusterCenter.Count()];
            for (int i = 0; i < clusterCenter.Count(); i++)
            {
                similarityMeasure[i] = SimilarityMatrics.FindCosineSimilarity(clusterCenter[i].GroupedDocument[0].VectorSpace, obj.VectorSpace);
            }
            int   index    = 0;
            float maxValue = similarityMeasure[0];

            for (int i = 0; i < similarityMeasure.Count(); i++)
            {
                if (similarityMeasure[i] > maxValue)
                {
                    maxValue = similarityMeasure[i];
                    index    = i;
                }
            }
            return(index);
        }
        public static List <DocumentVector> GetSeedPoints2v(List <DocumentVector> docCollection, int k)
        {
            List <DocumentVector> seedPoints = new List <DocumentVector>(k);
            DocDetails            docDetails;
            List <DocDetails>     docDetailsList = new List <DocDetails>();
            int index = 0;

            int            firstIndex = KMeansPlus.GenerateRandomNumber(0, docCollection.Count);
            DocumentVector FirstPoint = docCollection[firstIndex];

            seedPoints.Add(FirstPoint);

            for (int i = 0; i < k - 1; i++)
            {
                if (seedPoints.Count >= 2)
                {
                    DocDetails minpd = GetMinimalPointDistance(docDetailsList);
                    index = GetWeightedProbDist(minpd.Weights, minpd.Sum);
                    DocumentVector SubsequentPoint = docCollection[index];

                    docDetails = new DocDetails();
                    docDetails = GetAllDetails(docCollection, SubsequentPoint, docDetails);
                    docDetailsList.Add(docDetails);
                }
                else
                {
                    docDetails = new DocDetails();
                    docDetails = GetAllDetails(docCollection, FirstPoint, docDetails);
                    docDetailsList.Add(docDetails);
                    index = GetWeightedProbDist(docDetails.Weights, docDetails.Sum);
                    DocumentVector SecondPoint = docCollection[index];
                    seedPoints.Add(SecondPoint);

                    docDetails = new DocDetails();
                    docDetails = GetAllDetails(docCollection, SecondPoint, docDetails);
                    docDetailsList.Add(docDetails);
                }
            }
            return(seedPoints);
        }
Example No. 32
        public IList <DocumentVector <T> > Build(IList <Document <T> > documents)
        {
            distinctTerms      = new HashSet <string>();
            documentCollection = documents;
            foreach (var documentContent in documents)
            {
                foreach (string term in r.Split(documentContent.ToString()))
                {
                    distinctTerms.Add(term);
                }
            }
            List <string> removeList = new List <string>()
            {
                "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", ","
            };

            foreach (string s in removeList)
            {
                distinctTerms.Remove(s);
            }
            List <DocumentVector <T> > documentVectorSpace = new List <DocumentVector <T> >();
            DocumentVector <T>         _documentVector;

            float[] space;
            foreach (var document in documentCollection)
            {
                int count = 0;
                space = new float[distinctTerms.Count];
                foreach (string term in distinctTerms)
                {
                    space[count] = FindTFIDF(document.ToString(), term);
                    count++;
                }
                _documentVector             = new DocumentVector <T>();
                _documentVector.Content     = document.GetData();
                _documentVector.VectorSpace = space;
                documentVectorSpace.Add(_documentVector);
            }
            return(documentVectorSpace);
        }
Example No. 33
        /// <summary>
        /// K-means++
        /// </summary>
        private List <CentroidVector> randomCentroidVectorList3(List <DocumentVector> vectors)
        {
            List <CentroidVector> centroids = new List <CentroidVector>();
            int randomIndex = new Random(1).Next(vectors.Count);

            for (int index = 0; index < k; index++)
            {
                DocumentVector documentVector = null;
                if (index == 0)
                {
                    documentVector = vectors[randomIndex];
                }
                else
                {
                    documentVector = nextCentroidVector(vectors, centroids);
                }

                centroids.Add(transform2CentroidVector(documentVector));
            }

            return(centroids);
        }
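
Combined with nextCentroidVector (Example No. 2), this is a farthest-first seeding: the first centroid is a randomly chosen document, and every later centroid is the document whose nearest existing centroid is as far away as possible. A hypothetical usage sketch, assuming this method and getClosestCentroid (Example No. 24) live on the same class, as the snippets suggest:

        // Hypothetical usage: seed the centroids, then assign every document to its closest one.
        private Dictionary<DocumentVector, CentroidVector> SeedAndAssign(List<DocumentVector> vectors)
        {
            List<CentroidVector> centroids = randomCentroidVectorList3(vectors);
            var assignment = new Dictionary<DocumentVector, CentroidVector>();

            foreach (DocumentVector vector in vectors)
            {
                assignment[vector] = getClosestCentroid(centroids, vector);
            }

            return assignment;
        }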
Example No. 34
        private int FindClosestClusterCenter(List <Centeroid <T> > clusterCenter, DocumentVector <T> docVector)
        {
            float[] similarityMeasure = new float[clusterCenter.Count];

            // Compute the similarities in parallel, but pick the maximum sequentially:
            // updating maxValue/index from several threads at once is a data race.
            Parallel.For(0, clusterCenter.Count, i =>
            {
                if (clusterCenter[i].GroupedDocument.Count > 0)
                {
                    similarityMeasure[i] = SimilarityMatrics
                                           .FindCosineSimilarity(clusterCenter[i].GroupedDocument[0].VectorSpace, docVector.VectorSpace);
                }
            });

            int   index    = 0;
            float maxValue = similarityMeasure[0];

            for (int i = 1; i < similarityMeasure.Length; i++)
            {
                if (similarityMeasure[i] > maxValue)
                {
                    maxValue = similarityMeasure[i];
                    index    = i;
                }
            }

            return(index);
        }
Example No. 35
        private List <CentroidVector> randomCentroidVectorList(List <DocumentVector> vectors)
        {
            if (vectors.Count < k)
            {
                Console.WriteLine("Sorry, no more vector to random.");
                return(null);
            }

            List <CentroidVector> centroids = new List <CentroidVector>();
            List <int>            indexs    = CommonUtils.randomSetByFloyd(0, vectors.Count, k);

            foreach (int index in indexs)
            {
                DocumentVector documentVector = vectors[index];
                CentroidVector centroid       = new CentroidVector();
                foreach (double weight in documentVector.getWeightVector())
                {
                    centroid.addWeight(weight);
                }
                centroids.Add(centroid);
            }

            return(centroids);
        }
Example No. 36
 public string Decision(DocumentVector vector)
 {
     return Classification;
 }
        public static List <Centroid> AverageMeansAssigned(List <Centroid> fillCentroidCollection, List <DocumentVector> vectorSpace)
        {
            List <Centroid>       result;
            List <DocumentVector> newVectorSpace = vectorSpace;
            int length = vectorSpace[0].VectorSpace.Length;

            float[] newVectorSpaceArray   = new float[length];
            float[] minDistancesToCluster = new float[0];

            for (int i = 0; i < length; i++)
            {
                newVectorSpaceArray[i] = 0.0F;
            }

            for (int c = 0; c < fillCentroidCollection.Count; c++)
            {
                for (int gd = 0; gd < fillCentroidCollection[c].GroupedDocument.Count; gd++)
                {
                    for (int k = 0; k < fillCentroidCollection[c].GroupedDocument[gd].VectorSpace.Length; k++)
                    {
                        newVectorSpaceArray[k] += fillCentroidCollection[c].GroupedDocument[gd].VectorSpace[k];
                    }
                }
            }

            for (int c1 = 0; c1 < fillCentroidCollection.Count; c1++)
            {
                for (int gd1 = 0; gd1 < fillCentroidCollection[c1].GroupedDocument.Count; gd1++)
                {
                    for (int k1 = 0; k1 < fillCentroidCollection[c1].GroupedDocument[gd1].VectorSpace.Length; k1++)
                    {
                        newVectorSpaceArray[k1] = newVectorSpaceArray[k1] / fillCentroidCollection[c1].GroupedDocument.Count;
                    }
                }
            }

            float minDist      = float.MaxValue;
            float currentValue = 0.0F;
            int   index        = 0;

            for (int i = 0; i < fillCentroidCollection.Count; i++)
            {
                // Reset the running minimum for each centroid; the original fixed 0.1F start value,
                // never reset between clusters, meant most real distances could never be selected.
                minDist = float.MaxValue;
                minDistancesToCluster = new float[fillCentroidCollection[i].GroupedDocument.Count];
                for (int j = 0; j < fillCentroidCollection[i].GroupedDocument.Count; j++)
                {
                    //minDistancesToCluster = new float[fillCentroidCollection[i].GroupedDocument.Count];
                    minDistancesToCluster[j] = SimilarityMatrixCalculations.FindEuclideanDistance(fillCentroidCollection[i].GroupedDocument.First().VectorSpace, fillCentroidCollection[i].GroupedDocument[j].VectorSpace);
                    //}

                    for (int z = 0; z < minDistancesToCluster.Length; z++)
                    {
                        currentValue = minDistancesToCluster[z];
                        if (currentValue <= minDist && currentValue != 0)
                        {
                            minDist = currentValue;
                            index   = z;
                        }
                        //here we must to find the closest document to new vectorSpace;
                        //for all docs in cluster create the vectorSpace
                    }

                    /*
                     * DocumentVector newClusterCenter = fillCentroidCollection[i].GroupedDocument[index];
                     * fillCentroidCollection[i].GroupedDocument.Clear();
                     * fillCentroidCollection[i].GroupedDocument.Add(newClusterCenter);
                     */
                }
                DocumentVector newClusterCenter = fillCentroidCollection[i].GroupedDocument[index];
                index = 0;
                fillCentroidCollection[i].GroupedDocument.Clear();
                fillCentroidCollection[i].GroupedDocument.Add(newClusterCenter);
            }

            minDistancesToCluster = new float[0];
            result = new List <Centroid>(fillCentroidCollection);
            return(result);
        }
Example No. 38
 public void SetDocument(DocumentVector doc)
 {
     document = doc;
 }
        /// <summary>
        /// Here the description of Gravitational clustering algorithm.
        /// </summary>
        /// <param name="docCollection">List of entry elements.</param>
        /// <param name="G">Gravitational parameter value, for the test = 7*10^(-6).</param>
        /// <param name="deltaG">Gravitational forse loss = 0.01F.</param>
        /// <param name="M">Count of iteration, for test = 500.</param>
        /// <param name="epsilon">Minimum distance, for test = 10^(-4).</param>
        /// <returns>List<Centroid> result = sets stored in disjoint set union-find strukture.</returns>
        public static List <Centroid> Gravitational(List <DocumentVector> docCollection, float G, float deltaG, int M, float epsilon)
        {
            List <Centroid>       result        = new List <Centroid>();
            List <DocumentVector> docVectorCopy = new List <DocumentVector>(docCollection);
            int    docVectorCopy_Count          = docVectorCopy.Count;
            int    index      = 0;
            Random rand       = new Random();
            var    set_result = DisjointSet.Set(docVectorCopy);

            float[]         documentVectorOriginalFirst  = new float[docVectorCopy[0].VectorSpace.Length];
            float[]         documentVectorOriginalSecond = new float[docVectorCopy[0].VectorSpace.Length];
            int[]           parent       = set_result.Item1;
            int[]           rank         = set_result.Item2;
            List <Centroid> centroidSet  = set_result.Item3;
            List <Centroid> unionChanged = new List <Centroid>(centroidSet);

            for (int i = 0; i < M; i++)
            {
                for (int j = 0; j < unionChanged.Count; j++)
                {
                    if (j == 0)
                    {
                        index = rand.Next(0, docVectorCopy.Count - 1);
                    }
                    else
                    {
                        index = rand.Next(0, unionChanged.Count - 1);
                    }

                    if (index != j)
                    {
                        DocumentVector document = new DocumentVector();
                        //document.SaveOriginal(docVectorCopy[j]);
                        //documentVectorOriginalFirst = document.OriginalVectorSpace;
                        //float[,] distanceMatrix = Move(docVectorCopy[j], docVectorCopy[index], docVectorCopy_Count);
                        var distance = Move(docVectorCopy[j], docVectorCopy[index], G);

                        //if(distanceMatrix[j, index]<= epsilon)
                        if (distance <= epsilon)
                        {
                            if (j == 0)
                            {
                                var unionChangedResultTuple = DisjointSet.Union(j, index, centroidSet);
                                unionChanged = unionChangedResultTuple.Item3;
                                parent       = unionChangedResultTuple.Item1;
                            }
                            // otherwise merge the two clusters within the already-updated set
                            else
                            {
                                var unionChangedResultTuple = DisjointSet.Union(j, index, unionChanged);
                                unionChanged = unionChangedResultTuple.Item3;
                                parent       = unionChangedResultTuple.Item1;
                            }
                            //List<Centroid> unionChanged = DisjointSet.Union(docVectorCopy[j], docVectorCopy[index]);
                        }
                    }

                    G = (1 - deltaG) * G;
                    for (int z = 0; z < result.Count; z++)
                    {
                        for (int k = 0; k < result[z].GroupedDocument.Count; k++)
                        {
                            DocumentVector element = docVectorCopy[DisjointSet.Find(parent, k)];
                        }
                    }
                }
            }
            result = unionChanged;
            return(result);
        }
Example No. 40
 public override string Classify(DocumentVector Vector)
 {
     return RootNode.Decision(Vector);
 }
Example No. 41
        private void btnAdd_Click(object sender, EventArgs e)
        {
            int newDoc = 0;

            if (!string.IsNullOrEmpty(txtDoc1.Text))
            {
                docCollection.DocumentList.Add(txtDoc1.Text);
                newDoc++;
            }
            if (!string.IsNullOrEmpty(txtDoc2.Text))
            {
                newDoc++;
                docCollection.DocumentList.Add(txtDoc2.Text);
            }
            if (!string.IsNullOrEmpty(txtDoc3.Text))
            {
                docCollection.DocumentList.Add(txtDoc3.Text);
                newDoc++;
            }
            if (!string.IsNullOrEmpty(txtDoc4.Text))
            {
                newDoc++;
                docCollection.DocumentList.Add(txtDoc4.Text);
            }


            // DocumentList.Count is already an int; no string round-trip through TryParse is needed.
            lblTotalDoc.Text = docCollection.DocumentList.Count.ToString();

            txtDoc1.Clear();
            txtDoc2.Clear();
            txtDoc3.Clear();
            txtDoc4.Clear();

            if (ddlType.Text == "Incremental" && DocumnetClustering.mainCentroids.Count > 0)
            {
                switch (ddlIncAlg.Text)
                {
                case "KMeans":
                    List <DocumentVector> vSpace = VectorSpaceModel.ProcessDocumentCollection(docCollection);
                    for (int i = 1; i <= newDoc; i++)
                    {
                        DocumentVector obj   = vSpace[vSpace.Count - i];
                        int            index = DocumnetClustering.FindClosestClusterCenter(DocumnetClustering.mainCentroids, obj, ddl_sim.Text);
                        DocumnetClustering.mainCentroids[index].GroupedDocument.Add(obj);
                    }
                    break;

                case "CMeans":
                    List <DocumentVector> vSpace2 = VectorSpaceModel.ProcessDocumentCollection(docCollection);

                    string         outFilepath = @"E:\Dropbox\Masters\myMSc\PracticalPart\Sematic_K-MEANSClustering\FCM\HM_data_Out_centers.dat";
                    var            reader      = new StreamReader(File.OpenRead(outFilepath));
                    List <float[]> values      = new List <float[]>();
                    int            t           = 0;
                    while (!reader.EndOfStream)
                    {
                        var line = reader.ReadLine();
                        values.Add(Array.ConvertAll(line.Split(','), float.Parse));
                        t++;
                    }

                    for (int i = 0; i < newDoc; i++)
                    {
                        int            closeCenter = 0;
                        float          min         = 1000;
                        int            counter     = 1;
                        DocumentVector obj2        = vSpace2[vSpace2.Count - newDoc + i];
                        for (int l = 0; l < t; l++)
                        {
                            //                                float s = SimilarityMatrics.FindCosineSimilarity(values[l], obj2.VectorSpace);
                            float s = ArrayDistanceFunction(values[l], obj2.VectorSpace);
                            if (s < min)
                            {
                                min         = s;
                                closeCenter = counter;
                            }
                            counter++;
                        }

                        MessageBox.Show("Doc:" + (i + 1) + " Close is:" + closeCenter);
                        DocumnetClustering.mainCentroids[closeCenter - 1].GroupedDocument.Add(obj2);
                    }


                    break;
                }
                printAlll();
            }
        }
Example No. 42
 public abstract String Classify(DocumentVector Vector);
Example No. 43
 public String Add(DocumentVector Vector)
 {
     String Label = Classify(Vector);
     Documents.Add( new LabeledDocumentVector(Vector,Label) );
     return Label;
 }
Example No. 44
 public string Decision(DocumentVector vector)
 {
     if (vector.Vector[Index] < Value)
         return LeftChild.Decision(vector);
     else
         return RightChild.Decision(vector);
 }