public override string Classify(DocumentVector Vector)
{
    List<KeyValuePair<LabeledDocumentVector, double>> Sorted = Sort(Vector);
    Dictionary<string, double> LabelCount = new Dictionary<string, double>();

    // Compare against the K most similar labeled documents,
    // which Sort() leaves at the end of the list.
    int startIndex = Sorted.Count - 1;
    for (int i = 0; i < K; i++)
    {
        string Classification = Sorted[startIndex - i].Key.Classification;
        if (!LabelCount.ContainsKey(Classification))
            LabelCount.Add(Classification, 0);
        // Standard count model; a possible improvement is to add the
        // similarity measure instead of 1 (see the weighted sketch below).
        LabelCount[Classification]++;
    }

    double Max = double.MinValue;
    string Output = null;
    foreach (KeyValuePair<string, double> pair in LabelCount)
    {
        if (pair.Value > Max)
        {
            Max = pair.Value;
            Output = pair.Key;
        }
    }
    return Output;
}
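// The comment above suggests weighting votes by similarity instead of
// counting them. A minimal sketch of that variant (hypothetical, not the
// author's code; assumes System.Linq and that Sort() orders by ascending
// similarity as in Classify() above):
private string ClassifyWeighted(DocumentVector vector)
{
    List<KeyValuePair<LabeledDocumentVector, double>> sorted = Sort(vector);
    Dictionary<string, double> labelWeight = new Dictionary<string, double>();
    for (int i = 0; i < K; i++)
    {
        KeyValuePair<LabeledDocumentVector, double> pair = sorted[sorted.Count - 1 - i];
        string label = pair.Key.Classification;
        if (!labelWeight.ContainsKey(label))
            labelWeight.Add(label, 0);
        labelWeight[label] += pair.Value; // vote weighted by similarity
    }
    return labelWeight.OrderByDescending(p => p.Value).First().Key;
}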
private DocumentVector nextCentroidVector(List<DocumentVector> vectors, List<CentroidVector> centroids)
{
    // Farthest-point heuristic: pick the vector whose distance to its
    // nearest centroid is largest. cosSimilarity is treated as a distance
    // here (smaller value means closer).
    double maxClosedDistance = 0.0;
    DocumentVector maxClosedVector = null;
    foreach (DocumentVector vector in vectors)
    {
        double minDistance = double.MaxValue;
        foreach (CentroidVector centroid in centroids)
        {
            double currentDistance = cosSimilarity(vector, centroid);
            if (minDistance > currentDistance)
            {
                minDistance = currentDistance;
            }
        }
        if (maxClosedDistance < minDistance)
        {
            maxClosedDistance = minDistance;
            maxClosedVector = vector;
        }
    }
    return maxClosedVector;
}
private NormalizedVector Normalize(String id, DocumentVector doc)
{
    NormalizedVector vec = new NormalizedVector(id);

    // find max value
    int max = 0;
    foreach (KeyValuePair<string, int> term in doc)
    {
        if (term.Value > max)
        {
            max = term.Value;
        }
    }

    lengths.Add(id, 0);

    // add normalized frequencies
    foreach (KeyValuePair<string, int> term in doc)
    {
        double tf = term.Value / (double)max;
        vec.Add(term.Key, tf);
        lengths[id] += Math.Pow(tf, 2);
    }
    lengths[id] = Math.Sqrt(lengths[id]);

    return vec;
}
// Optimised bubble sort: only K passes are needed because Classify()
// reads just the K largest values from the end of the list, so the
// running time is O(Kn), i.e. linear in n for fixed K.
private List<KeyValuePair<LabeledDocumentVector, double>> Sort(DocumentVector item)
{
    // first calculate comparison values
    List<KeyValuePair<LabeledDocumentVector, double>> Values = new List<KeyValuePair<LabeledDocumentVector, double>>();
    for (int i = 0; i < TrainingData.Count; i++)
    {
        Values.Add(new KeyValuePair<LabeledDocumentVector, double>(
            TrainingData[i],
            Comparer.Compare(TrainingData[i].Document, item)));
    }

    // each pass bubbles the next-largest value to the end of the list
    int j = 0;
    bool flag = false;
    do
    {
        j++;
        flag = false;
        for (int i = 0; i < Values.Count - j; i++)
        {
            if (Values[i].Value > Values[i + 1].Value)
            {
                KeyValuePair<LabeledDocumentVector, double> dummy = Values[i];
                Values[i] = Values[i + 1];
                Values[i + 1] = dummy;
                flag = true;
            }
        }
    } while (flag && j < K);

    return Values;
}
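// For comparison, the same ordering can be expressed with LINQ (a sketch,
// assuming System.Linq is available). It sorts the full list in O(n log n)
// rather than the O(Kn) of the partial bubble sort above, so the original
// wins when K is small:
private List<KeyValuePair<LabeledDocumentVector, double>> SortLinq(DocumentVector item)
{
    return TrainingData
        .Select(t => new KeyValuePair<LabeledDocumentVector, double>(
            t, Comparer.Compare(t.Document, item)))
        .OrderBy(p => p.Value) // ascending similarity, top-K at the end
        .ToList();
}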
private static DocDetails GetAllDetails(List<DocumentVector> docCollection, DocumentVector seedPoint, DocDetails docDetails)
{
    float[] Weights = new float[docCollection.Count];
    float minD = float.MaxValue;
    float Sum = 0;
    int i = 0;
    foreach (DocumentVector point in docCollection)
    {
        if (point == seedPoint)
        {
            // delta is 0; keep the slot so Weights[i] stays aligned
            // with docCollection[i]
            Weights[i] = 0;
            i++;
            continue;
        }
        Weights[i] = KMeansPlus.GetEucliedeanDistance(point, seedPoint);
        Sum += Weights[i];
        if (Weights[i] < minD)
        {
            minD = Weights[i];
        }
        i++;
    }
    docDetails.SeedDocVect = seedPoint;
    docDetails.Weights = Weights;
    docDetails.Sum = Sum;
    docDetails.MinD = minD;
    return docDetails;
}
public SearchResultDocument(Uri searchUri, String originatingFoodName, Uri documentUri, DocumentVector documentVector)
{
    SearchUri = searchUri;
    OriginatingFoodName = originatingFoodName;
    DocumentUri = documentUri;
    DocumentVector = documentVector;
}
public IList<DocumentVector<T>> BuildVectorSpace(IList<Document<T>> documents)
{
    var distinctTerms = new HashSet<string>();
    var documentVectorSpace = new List<DocumentVector<T>>();
    DocumentVector<T> _documentVector;
    float[] space;

    documentValues = documents.Select(d => SplitExpression.Split(d.ToString().ToLower())).Where(x => x.Length >= 2).ToList();

    foreach (var documentContent in documents)
    {
        foreach (string term in SplitExpression.Split(documentContent.ToString()).Where(t => t.Length >= 2))
        {
            distinctTerms.Add(term);
        }
    }

    // Index the terms once so each parallel iteration writes only its own
    // slot; sharing a mutable counter across Parallel.ForEach iterations
    // is a data race.
    var termList = distinctTerms.ToList();
    foreach (var document in documents)
    {
        space = new float[termList.Count];
        var content = document.ToString();
        Parallel.For(0, termList.Count, i =>
        {
            space[i] = FindTFIDF(content, termList[i]);
        });
        _documentVector = new DocumentVector<T>();
        _documentVector.Content = document.GetData();
        _documentVector.VectorSpace = space;
        documentVectorSpace.Add(_documentVector);
    }
    return documentVectorSpace;
}
private static List<DocumentVector> transformTFIDFs2Vectors(List<string> documents, Dictionary<string, Dictionary<string, double>> tfidfs)
{
    ISet<string> wordSet = getWordSetFromTFIDFs(tfidfs);
    if (wordSet == null)
    {
        return null;
    }
    List<DocumentVector> vectors = new List<DocumentVector>(tfidfs.Count);
    int segmentIndex = 0;
    foreach (var tfidfItem in tfidfs)
    {
        DocumentVector vector = new DocumentVector();
        foreach (string word in wordSet)
        {
            if (tfidfItem.Value.ContainsKey(word))
            {
                vector.addWeight(tfidfItem.Value[word]);
            }
            else
            {
                vector.addWeight(0.0);
            }
        }
        vector.setLabel(documents[segmentIndex]);
        vectors.Add(vector);
        segmentIndex++;
    }
    return vectors;
}
public override void Compute()
{
    int numDocs = (int)Workspace.Load("NumberOfDocuments");
    DocumentVector df = (DocumentVector)Workspace.Load("DocumentFrequencies");
    NormalizedVector idf = Models.InverseDocumentFrequency.Compute(df, numDocs);
    Workspace.Store("InverseDocumentFrequencies", idf);
}
private CentroidVector transform2CentroidVector(DocumentVector documentVector)
{
    CentroidVector centroid = new CentroidVector();
    foreach (double weight in documentVector.getWeightVector())
    {
        centroid.addWeight(weight);
    }
    return centroid;
}
private static float GetDocumentDistance(DocumentVector doc1, DocumentVector doc2)
{
    var dist = 0.0f;
    for (var i = 0; i < doc1.VectorSpace.Length; i++)
    {
        dist += (float)Math.Pow((double)(doc1.VectorSpace[i] - doc2.VectorSpace[i]), 2.0);
    }
    dist = (float)Math.Sqrt((double)dist);
    return dist;
}
public static NormalizedVector Compute(DocumentVector df, int numDocs)
{
    NormalizedVector idf = new NormalizedVector("InverseDocumentFrequencies");
    foreach (KeyValuePair<string, int> kvp in df)
    {
        idf.Add(kvp.Key, Math.Log(numDocs / (double)kvp.Value, 2));
    }
    return idf;
}
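// A quick sanity check of the formula: with numDocs = 8 and a term that
// appears in 2 documents, idf = log2(8 / 2) = 2. A term that appears in
// every document gets log2(1) = 0, i.e. it carries no discriminating weight.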
/*
 * Instead of looking at every term across all documents,
 * only look at the terms in the query, because all other terms
 * will be 0, resulting in q*d=0.
 * Typically the number of terms in a query is less than
 * the number of terms in a document.
 */
private static double ComputeProduct(DocumentVector query, NormalizedVector doc)
{
    double val = 0;
    foreach (KeyValuePair<string, int> term in query)
    {
        double d;
        doc.TryGetValue(term.Key, out d);
        val += term.Value * d;
    }
    return val;
}
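// ComputeProduct gives the dot product q · d; dividing by the two vector
// lengths turns it into a cosine similarity. A sketch, assuming lengths[]
// holds the norms accumulated in Normalize() and queryLength is precomputed
// for the query (both hypothetical names, not confirmed by the code above):
//
//   double cosine = ComputeProduct(query, doc) / (queryLength * lengths[docId]);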
public Vectorizer(TLArtifactsCollection artifacts, String representation)
{
    vectors = new DocumentVectorCollection();
    freq = new DocumentVector("DocumentFrequencies");
    foreach (KeyValuePair<string, TLArtifact> kvp in artifacts)
    {
        String docID = kvp.Value.Id;
        String[] words = kvp.Value.Text.Split(' ');

        // create new document representation
        DocumentVector vec = new DocumentVector(docID);
        List<String> addedWords = new List<String>();

        // loop over each word and update its frequency
        foreach (String word in words)
        {
            // update term-document frequency only ONCE per document
            if (!freq.ContainsKey(word))
            {
                freq.Add(word, 1);
                addedWords.Add(word);
            }
            else if (!addedWords.Contains(word))
            {
                freq[word]++;
                addedWords.Add(word);
            }

            // update word frequency
            if (!vec.ContainsKey(word))
            {
                vec.Add(word, 1);
            }
            else if (representation == "Ordinal")
            {
                vec[word]++;
            }

            // update MaxFreq
            if (vec[word] > vec.MaxFreq.Value)
            {
                vec.MaxFreq = new KeyValuePair<string, int>(word, vec[word]);
            }
        }

        // add document to vector collection
        vectors.Add(vec);
    }
}
private double distance(DocumentVector vector1, DocumentVector vector2)
{
    // Euclidean distance between the two weight vectors
    var weights1 = vector1.getWeightVector();
    var weights2 = vector2.getWeightVector();
    double distance = 0.0;
    int dimensions = weights1.Count;
    for (int index = 0; index < dimensions; index++)
    {
        double delta = weights1[index] - weights2[index];
        distance += delta * delta;
    }
    return Math.Sqrt(distance);
}
private static Centroid chose_Random_Centroid(List<string> docCollection, List<DocumentVector> vSpace, int document_Collection_length)
{
    Centroid firstCentroid = new Centroid();
    firstCentroid.GroupedDocument = new List<DocumentVector>();
    Random rand = new Random();
    int index = rand.Next(0, document_Collection_length);
    DocumentVector firstvector = vSpace[index];
    firstCentroid.GroupedDocument.Add(firstvector);
    return firstCentroid;
}
/*
 * public double ComputeTDFDistance(DetailedDocumentVector doc2)
 * {
 *     double result = 0.0;
 *     if (this.GetTDFDimensions() != doc2.GetTDFDimensions())
 *         throw new ArgumentOutOfRangeException();
 *     for (var i = 0; i < doc2.GetTDFDimensions(); i++)
 *         result += Math.Pow(Math.Abs(tDF[i] - doc2.TDF[i]), 2.0);
 *     return result;
 * }
 *
 * public double ComputeIDFDistance(DetailedDocumentVector doc2)
 * {
 *     double result = 0.0;
 *     if (this.GetIDFDimensions() != doc2.GetIDFDimensions())
 *         throw new ArgumentOutOfRangeException();
 *     for (var i = 0; i < doc2.GetIDFDimensions(); i++)
 *         result += Math.Pow(Math.Abs(iDF[i] - doc2.IDF[i]), 2.0);
 *     return result;
 * }
 */
#endregion

// squared Euclidean distance between the stored TF-IDF vector and doc2
public float ComputeTFIDFDistance(DocumentVector doc2)
{
    float result = 0;
    if (this.GetTFIDFDimensions() != doc2.VectorSpace.Length)
    {
        throw new ArgumentOutOfRangeException();
    }
    for (int i = 0; i < doc2.VectorSpace.Length; i++)
    {
        result += (float)Math.Pow(Math.Abs(tfIDF[i] - doc2.VectorSpace[i]), 2);
    }
    return result;
}
protected CentroidsKMeansPPKP FindNearestClusterCenter(DocumentVector doc)
{
    // minDistance starts at the dimensionality, presumably as an upper
    // bound on the squared distance; double.MaxValue would be the more
    // conventional sentinel
    var minDistance = (double)dimensions;
    CentroidsKMeansPPKP bestClusterCenter = clusters.First();
    foreach (var cluster in clusters)
    {
        var distance = cluster.ComputeTFIDFDistance(doc);
        if (distance < minDistance)
        {
            bestClusterCenter = cluster;
            minDistance = distance;
        }
    }
    return bestClusterCenter;
}
public void FactMethodName()
{
    var sut = Sys.ActorOf(Props.Create(() => new DocumentScoringActor()));
    var searchUri = new Uri("http://google.com");
    var foodTerms = new FoodNameTerms("food stuff, mucho");
    var documentUri = new Uri("http://example.com");
    var documentVector = new DocumentVector(new[] { "foo", "bar" });
    var document = new SearchResultDocument(searchUri, foodTerms.FoodName, documentUri, documentVector);
    var compareTerms = foodTerms;
    var originatingTerms = foodTerms;

    sut.Tell(new ScoreDocumentRequestMessage(document, compareTerms, originatingTerms));
    var result = ExpectMsg<ScoreDocumentResultMessage>(duration: TimeSpan.FromMinutes(2));

    Assert.NotNull(result);
}
private CentroidVector getClosestCentroid(List<CentroidVector> centroids, DocumentVector documentVector)
{
    // cosSimilarity is treated as a distance here: smaller means closer
    double minDistance = Double.MaxValue;
    CentroidVector minCentroidVector = null;
    foreach (CentroidVector centroid in centroids)
    {
        double currentDistance = cosSimilarity(documentVector, centroid);
        if (minDistance > currentDistance)
        {
            minDistance = currentDistance;
            minCentroidVector = centroid;
        }
    }
    return minCentroidVector;
}
private static float Move(DocumentVector documentVector1, DocumentVector documentVector2, float G)
{
    int length = documentVector1.VectorSpace.Length;
    float[] d = new float[length];
    var distance = GetDocumentDistance(documentVector1, documentVector2);
    for (int i = 0; i < length; i++)
    {
        d[i] = documentVector2.VectorSpace[i] - documentVector1.VectorSpace[i];
    }
    // gravitational step: the displacement scales as G / distance^3, so
    // very small distances produce very large moves; callers merge such
    // pairs once distance <= epsilon
    for (var i = 0; i < length; i++)
    {
        documentVector1.VectorSpace[i] = documentVector1.VectorSpace[i] + d[i] * (G / (float)Math.Pow(distance, 3.0));
    }
    return distance;
}
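// Scale check with the test value G = 7e-6 quoted for the algorithm: for
// two documents at distance 0.1, each component of d is multiplied by
// G / 0.1^3 = 7e-6 / 1e-3 = 7e-3, so documentVector1 moves 0.7% of the way
// towards documentVector2; the step grows rapidly as the distance shrinks.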
/// <summary>
/// Takes an input document vector and returns the perceptron's
/// guess at its classification using the training data provided.
/// </summary>
/// <param name="Vector">Input document vector to classify</param>
/// <returns>String label classification</returns>
public override string Classify(DocumentVector Vector)
{
    double Max = double.MinValue;
    string classification = null;
    foreach (KeyValuePair<string, int> pair in LabelsDictionary)
    {
        VectorN w = W[pair.Value];
        double result = Mult(w, Vector.Vector);
        if (result > Max)
        {
            classification = pair.Key;
            Max = result;
        }
    }
    return classification;
}
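// Classify() is an argmax over the per-label weight vectors: the label
// whose weights give the largest dot product with the input wins. A usage
// sketch (classifier is assumed to be an already-trained instance):
//
//   string label = classifier.Classify(documentVector);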
private List<CentroidVector> randomCentroidVectorList2(List<DocumentVector> vectors)
{
    List<CentroidVector> centroids = new List<CentroidVector>();
    Random randomSeed = new Random(1);
    for (int index = 0; index < k; index++)
    {
        // note: the same vector can be drawn twice, yielding duplicate centroids
        int randomIndex = randomSeed.Next(vectors.Count);
        DocumentVector documentVector = vectors[randomIndex];
        CentroidVector centroid = new CentroidVector();
        foreach (double weight in documentVector.getWeightVector())
        {
            centroid.addWeight(weight);
        }
        centroids.Add(centroid);
    }
    return centroids;
}
private double cos(DocumentVector vector1, DocumentVector vector2)
{
    int dimensions = vector1.getWeightVector().Count;
    double numerator = 0.0;
    double denominatorLeft = 0.0;
    double denominatorRight = 0.0;
    for (int index = 0; index < dimensions; index++)
    {
        numerator += vector1.getWeightVector()[index] * vector2.getWeightVector()[index];
        denominatorLeft += vector1.getWeightVector()[index] * vector1.getWeightVector()[index];
        denominatorRight += vector2.getWeightVector()[index] * vector2.getWeightVector()[index];
    }
    double denominator = Math.Sqrt(denominatorLeft) * Math.Sqrt(denominatorRight);
    return numerator / denominator;
}
private int FindClosestClusterCenter(List<Centeroid<T>> clusterCenter, DocumentVector<T> obj)
{
    float[] similarityMeasure = new float[clusterCenter.Count];
    for (int i = 0; i < clusterCenter.Count; i++)
    {
        similarityMeasure[i] = SimilarityMatrics.FindCosineSimilarity(clusterCenter[i].GroupedDocument[0].VectorSpace, obj.VectorSpace);
    }
    int index = 0;
    float maxValue = similarityMeasure[0];
    for (int i = 0; i < similarityMeasure.Length; i++)
    {
        if (similarityMeasure[i] > maxValue)
        {
            maxValue = similarityMeasure[i];
            index = i;
        }
    }
    return index;
}
public static List<DocumentVector> GetSeedPoints2v(List<DocumentVector> docCollection, int k)
{
    List<DocumentVector> seedPoints = new List<DocumentVector>(k);
    DocDetails docDetails;
    List<DocDetails> docDetailsList = new List<DocDetails>();
    int index = 0;
    int firstIndex = KMeansPlus.GenerateRandomNumber(0, docCollection.Count);
    DocumentVector FirstPoint = docCollection[firstIndex];
    seedPoints.Add(FirstPoint);

    for (int i = 0; i < k - 1; i++)
    {
        if (seedPoints.Count >= 2)
        {
            // weight each candidate by its distance to the nearest chosen seed
            DocDetails minpd = GetMinimalPointDistance(docDetailsList);
            index = GetWeightedProbDist(minpd.Weights, minpd.Sum);
            DocumentVector SubsequentPoint = docCollection[index];
            seedPoints.Add(SubsequentPoint);
            docDetails = new DocDetails();
            docDetails = GetAllDetails(docCollection, SubsequentPoint, docDetails);
            docDetailsList.Add(docDetails);
        }
        else
        {
            docDetails = new DocDetails();
            docDetails = GetAllDetails(docCollection, FirstPoint, docDetails);
            docDetailsList.Add(docDetails);
            index = GetWeightedProbDist(docDetails.Weights, docDetails.Sum);
            DocumentVector SecondPoint = docCollection[index];
            seedPoints.Add(SecondPoint);
            docDetails = new DocDetails();
            docDetails = GetAllDetails(docCollection, SecondPoint, docDetails);
            docDetailsList.Add(docDetails);
        }
    }
    return seedPoints;
}
public IList<DocumentVector<T>> Build(IList<Document<T>> documents)
{
    distinctTerms = new HashSet<string>();
    documentCollection = documents;
    foreach (var documentContent in documents)
    {
        foreach (string term in r.Split(documentContent.ToString()))
        {
            distinctTerms.Add(term);
        }
    }

    List<string> removeList = new List<string>()
    {
        "\"", "\r", "\n", "(", ")", "[", "]", "{", "}", "", ".", " ", ","
    };
    foreach (string s in removeList)
    {
        distinctTerms.Remove(s);
    }

    List<DocumentVector<T>> documentVectorSpace = new List<DocumentVector<T>>();
    DocumentVector<T> _documentVector;
    float[] space;
    foreach (var document in documentCollection)
    {
        int count = 0;
        space = new float[distinctTerms.Count];
        foreach (string term in distinctTerms)
        {
            space[count] = FindTFIDF(document.ToString(), term);
            count++;
        }
        _documentVector = new DocumentVector<T>();
        _documentVector.Content = document.GetData();
        _documentVector.VectorSpace = space;
        documentVectorSpace.Add(_documentVector);
    }
    return documentVectorSpace;
}
/// <summary>
/// K-means++ seeding: the first centroid is chosen at random, and each
/// subsequent one is the vector farthest from the centroids chosen so far.
/// </summary>
private List<CentroidVector> randomCentroidVectorList3(List<DocumentVector> vectors)
{
    List<CentroidVector> centroids = new List<CentroidVector>();
    int randomIndex = new Random(1).Next(vectors.Count);
    for (int index = 0; index < k; index++)
    {
        DocumentVector documentVector = null;
        if (index == 0)
        {
            documentVector = vectors[randomIndex];
        }
        else
        {
            documentVector = nextCentroidVector(vectors, centroids);
        }
        centroids.Add(transform2CentroidVector(documentVector));
    }
    return centroids;
}
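// Strictly, k-means++ samples the next centroid with probability
// proportional to D(x)^2 rather than always taking the farthest point as
// nextCentroidVector does. A minimal sketch of that weighted draw, assuming
// System.Linq and a distance(DocumentVector, CentroidVector) helper
// (hypothetical; the code above does not define one with this signature):
private DocumentVector sampleNextCentroid(List<DocumentVector> vectors, List<CentroidVector> centroids, Random rng)
{
    // squared distance from each vector to its nearest chosen centroid
    double[] d2 = vectors.Select(v => Math.Pow(centroids.Min(c => distance(v, c)), 2)).ToArray();
    double total = d2.Sum();
    double draw = rng.NextDouble() * total;
    double cumulative = 0.0;
    for (int i = 0; i < vectors.Count; i++)
    {
        cumulative += d2[i];
        if (draw <= cumulative)
        {
            return vectors[i];
        }
    }
    return vectors[vectors.Count - 1];
}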
private int FindClosestClusterCenter(List<Centeroid<T>> clusterCenter, DocumentVector<T> docVector)
{
    float[] similarityMeasure = new float[clusterCenter.Count];
    // compute similarities in parallel; each iteration writes only its own
    // slot, so there is no shared mutable state
    Parallel.For(0, clusterCenter.Count, i =>
    {
        if (clusterCenter[i].GroupedDocument.Count > 0)
        {
            similarityMeasure[i] = SimilarityMatrics
                .FindCosineSimilarity(clusterCenter[i].GroupedDocument[0].VectorSpace, docVector.VectorSpace);
        }
    });
    // reduce sequentially: updating index/maxValue inside Parallel.For
    // would race between threads
    int index = 0;
    float maxValue = similarityMeasure[0];
    for (int i = 1; i < similarityMeasure.Length; i++)
    {
        if (similarityMeasure[i] > maxValue)
        {
            maxValue = similarityMeasure[i];
            index = i;
        }
    }
    return index;
}
private List<CentroidVector> randomCentroidVectorList(List<DocumentVector> vectors)
{
    if (vectors.Count < k)
    {
        Console.WriteLine("Not enough vectors to sample k distinct centroids.");
        return null;
    }
    List<CentroidVector> centroids = new List<CentroidVector>();
    // Floyd's algorithm yields k distinct random indices
    List<int> indices = CommonUtils.randomSetByFloyd(0, vectors.Count, k);
    foreach (int index in indices)
    {
        DocumentVector documentVector = vectors[index];
        CentroidVector centroid = new CentroidVector();
        foreach (double weight in documentVector.getWeightVector())
        {
            centroid.addWeight(weight);
        }
        centroids.Add(centroid);
    }
    return centroids;
}
// leaf node: always returns its stored classification
public string Decision(DocumentVector vector)
{
    return Classification;
}
public static List<Centroid> AverageMeansAssigned(List<Centroid> fillCentroidCollection, List<DocumentVector> vectorSpace)
{
    int length = vectorSpace[0].VectorSpace.Length;
    float[] newVectorSpaceArray = new float[length];

    // sum the vector spaces of every grouped document across all clusters
    for (int c = 0; c < fillCentroidCollection.Count; c++)
    {
        for (int gd = 0; gd < fillCentroidCollection[c].GroupedDocument.Count; gd++)
        {
            for (int k = 0; k < fillCentroidCollection[c].GroupedDocument[gd].VectorSpace.Length; k++)
            {
                newVectorSpaceArray[k] += fillCentroidCollection[c].GroupedDocument[gd].VectorSpace[k];
            }
        }
    }

    // divide each component once to get the averaged vector space
    int totalDocuments = 0;
    for (int c = 0; c < fillCentroidCollection.Count; c++)
    {
        totalDocuments += fillCentroidCollection[c].GroupedDocument.Count;
    }
    for (int k = 0; k < length; k++)
    {
        newVectorSpaceArray[k] /= totalDocuments;
    }

    // For each cluster, make the new center the grouped document with the
    // smallest non-zero Euclidean distance to the cluster's first document.
    // TODO: the closest document should arguably be measured against the
    // averaged vector space computed above instead.
    for (int i = 0; i < fillCentroidCollection.Count; i++)
    {
        float minDist = float.MaxValue;
        int index = 0;
        float[] minDistancesToCluster = new float[fillCentroidCollection[i].GroupedDocument.Count];
        for (int j = 0; j < fillCentroidCollection[i].GroupedDocument.Count; j++)
        {
            minDistancesToCluster[j] = SimilarityMatrixCalculations.FindEuclideanDistance(fillCentroidCollection[i].GroupedDocument.First().VectorSpace, fillCentroidCollection[i].GroupedDocument[j].VectorSpace);
            if (minDistancesToCluster[j] < minDist && minDistancesToCluster[j] != 0)
            {
                minDist = minDistancesToCluster[j];
                index = j;
            }
        }
        DocumentVector newClusterCenter = fillCentroidCollection[i].GroupedDocument[index];
        fillCentroidCollection[i].GroupedDocument.Clear();
        fillCentroidCollection[i].GroupedDocument.Add(newClusterCenter);
    }

    return new List<Centroid>(fillCentroidCollection);
}
public void SetDocument(DocumentVector doc)
{
    document = doc;
}
/// <summary>
/// Gravitational clustering algorithm.
/// </summary>
/// <param name="docCollection">List of entry elements.</param>
/// <param name="G">Gravitational parameter value, for the test = 7*10^(-6).</param>
/// <param name="deltaG">Gravitational force loss = 0.01F.</param>
/// <param name="M">Count of iterations, for the test = 500.</param>
/// <param name="epsilon">Minimum distance, for the test = 10^(-4).</param>
/// <returns>List of centroids: the sets stored in a disjoint-set (union-find) structure.</returns>
public static List<Centroid> Gravitational(List<DocumentVector> docCollection, float G, float deltaG, int M, float epsilon)
{
    List<Centroid> result = new List<Centroid>();
    List<DocumentVector> docVectorCopy = new List<DocumentVector>(docCollection);
    int index = 0;
    Random rand = new Random();
    var set_result = DisjointSet.Set(docVectorCopy);
    int[] parent = set_result.Item1;
    int[] rank = set_result.Item2;
    List<Centroid> centroidSet = set_result.Item3;
    List<Centroid> unionChanged = new List<Centroid>(centroidSet);

    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < unionChanged.Count; j++)
        {
            // pick a random partner for element j
            if (j == 0)
            {
                index = rand.Next(0, docVectorCopy.Count - 1);
            }
            else
            {
                index = rand.Next(0, unionChanged.Count - 1);
            }
            if (index != j)
            {
                // move document j towards document index under gravity
                var distance = Move(docVectorCopy[j], docVectorCopy[index], G);
                // merge the two clusters once they are close enough
                if (distance <= epsilon)
                {
                    if (j == 0)
                    {
                        var unionChangedResultTuple = DisjointSet.Union(j, index, centroidSet);
                        unionChanged = unionChangedResultTuple.Item3;
                        parent = unionChangedResultTuple.Item1;
                    }
                    else
                    {
                        var unionChangedResultTuple = DisjointSet.Union(j, index, unionChanged);
                        unionChanged = unionChangedResultTuple.Item3;
                        parent = unionChangedResultTuple.Item1;
                    }
                }
            }
            // gravity decays after every move
            G = (1 - deltaG) * G;
        }
    }
    result = unionChanged;
    return result;
}
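// A usage sketch with the test parameters quoted in the summary above
// (docs is assumed to be an already-built List<DocumentVector>):
//
//   List<Centroid> clusters = Gravitational(docs, 7e-6f, 0.01f, 500, 1e-4f);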
public override string Classify(DocumentVector Vector)
{
    return RootNode.Decision(Vector);
}
private void btnAdd_Click(object sender, EventArgs e)
{
    int newDoc = 0;
    if (!string.IsNullOrEmpty(txtDoc1.Text))
    {
        docCollection.DocumentList.Add(txtDoc1.Text);
        newDoc++;
    }
    if (!string.IsNullOrEmpty(txtDoc2.Text))
    {
        docCollection.DocumentList.Add(txtDoc2.Text);
        newDoc++;
    }
    if (!string.IsNullOrEmpty(txtDoc3.Text))
    {
        docCollection.DocumentList.Add(txtDoc3.Text);
        newDoc++;
    }
    if (!string.IsNullOrEmpty(txtDoc4.Text))
    {
        docCollection.DocumentList.Add(txtDoc4.Text);
        newDoc++;
    }

    int totalDoc = docCollection.DocumentList.Count;
    lblTotalDoc.Text = totalDoc.ToString();

    txtDoc1.Clear();
    txtDoc2.Clear();
    txtDoc3.Clear();
    txtDoc4.Clear();

    if (ddlType.Text == "Incremental" && DocumnetClustering.mainCentroids.Count > 0)
    {
        switch (ddlIncAlg.Text)
        {
            case "KMeans":
                // assign each new document to its closest existing cluster center
                List<DocumentVector> vSpace = VectorSpaceModel.ProcessDocumentCollection(docCollection);
                for (int i = 1; i <= newDoc; i++)
                {
                    DocumentVector obj = vSpace[vSpace.Count - i];
                    int index = DocumnetClustering.FindClosestClusterCenter(DocumnetClustering.mainCentroids, obj, ddl_sim.Text);
                    DocumnetClustering.mainCentroids[index].GroupedDocument.Add(obj);
                }
                break;
            case "CMeans":
                List<DocumentVector> vSpace2 = VectorSpaceModel.ProcessDocumentCollection(docCollection);
                // read the fuzzy C-means cluster centers produced earlier
                string outFilepath = @"E:\Dropbox\Masters\myMSc\PracticalPart\Sematic_K-MEANSClustering\FCM\HM_data_Out_centers.dat";
                var reader = new StreamReader(File.OpenRead(outFilepath));
                List<float[]> values = new List<float[]>();
                int t = 0;
                while (!reader.EndOfStream)
                {
                    var line = reader.ReadLine();
                    values.Add(Array.ConvertAll(line.Split(','), float.Parse));
                    t++;
                }
                for (int i = 0; i < newDoc; i++)
                {
                    int closeCenter = 0;
                    float min = 1000;
                    int counter = 1;
                    DocumentVector obj2 = vSpace2[vSpace2.Count - newDoc + i];
                    for (int l = 0; l < t; l++)
                    {
                        float s = ArrayDistanceFunction(values[l], obj2.VectorSpace);
                        if (s < min)
                        {
                            min = s;
                            closeCenter = counter;
                        }
                        counter++;
                    }
                    MessageBox.Show("Doc:" + (i + 1) + " Close is:" + closeCenter);
                    DocumnetClustering.mainCentroids[closeCenter - 1].GroupedDocument.Add(obj2);
                }
                break;
        }
        printAlll();
    }
}
public abstract String Classify(DocumentVector Vector);
public String Add(DocumentVector Vector)
{
    String Label = Classify(Vector);
    Documents.Add(new LabeledDocumentVector(Vector, Label));
    return Label;
}
// internal node: route by comparing the indexed feature to the threshold
public string Decision(DocumentVector vector)
{
    if (vector.Vector[Index] < Value)
        return LeftChild.Decision(vector);
    else
        return RightChild.Decision(vector);
}
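// The tree is evaluated by recursion: internal nodes (above) route on a
// single feature, while leaf nodes (the Decision overload that returns a
// fixed Classification) terminate. For example, an internal node with
// Index = 0 and Value = 0.5 sends any vector with Vector[0] < 0.5 to its
// left subtree and everything else to its right subtree.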