public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    mDataset = new UnlabeledDataset<SparseVector<double>>(dataset);
    ClusteringResult clustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<CentroidData> centroids = new ArrayList<CentroidData>(mK);
        ArrayList<int> bestSeeds = null;
        for (int i = 0; i < mK; i++)
        {
            centroids.Add(new CentroidData());
        }
        // select seed items
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(mDataset.Count);
        for (int i = 0; i < mDataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++) { seeds.Add(mDataset[tmp[i]]); }
            // assess the quality of the seed items: the less similar the
            // seeds are to each other, the better the seed set
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2)
                    {
                        simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                    }
                }
            }
            simAvg /= (double)(mK * mK - mK);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                bestSeeds = new ArrayList<int>(mK);
                for (int i = 0; i < mK; i++) { bestSeeds.Add(tmp[i]); }
            }
        }
        ArrayList<KeyDat<double, int>> medoids = new ArrayList<KeyDat<double, int>>(mK);
        for (int i = 0; i < mK; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(mDataset);
            centroids[i].UpdateCentroidLen();
            medoids.Add(new KeyDat<double, int>(-1, bestSeeds[i]));
        }
        double[,] dotProd = new double[mDataset.Count, mK];
        SparseMatrix<double> dsMat = ModelUtils.GetTransposedMatrix(mDataset);
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Info("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            int j = 0;
            foreach (CentroidData cen in centroids)
            {
                SparseVector<double> cenVec = cen.GetSparseVector();
                double[] dotProdSimVec = ModelUtils.GetDotProductSimilarity(dsMat, mDataset.Count, cenVec);
                for (int i = 0; i < dotProdSimVec.Length; i++)
                {
                    if (dotProdSimVec[i] > 0)
                    {
                        dotProd[i, j] = dotProdSimVec[i];
                    }
                }
                j++;
            }
            for (int dsInstIdx = 0; dsInstIdx < mDataset.Count; dsInstIdx++)
            {
                double maxSim = double.MinValue;
                ArrayList<int> candidates = new ArrayList<int>();
                for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                {
                    double sim = dotProd[dsInstIdx, cenIdx];
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(cenIdx);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(cenIdx);
                    }
                }
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(mRnd);
                }
                if (candidates.Count > 0) // *** is this always true?
                {
                    centroids[candidates[0]].Items.Add(dsInstIdx);
                    clustQual += maxSim;
                    if (medoids[candidates[0]].Key < maxSim)
                    {
                        medoids[candidates[0]] = new KeyDat<double, int>(maxSim, dsInstIdx);
                    }
                }
            }
            clustQual /= (double)mDataset.Count;
            mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i].Update(mDataset);
                centroids[i].UpdateCentroidLen();
            }
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps)
            {
                break;
            }
            bestClustQual = clustQual;
            for (int i = 0; i < medoids.Count; i++)
            {
                medoids[i] = new KeyDat<double, int>(-1, medoids[i].Dat);
            }
        }
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            mCentroids = centroids;
            mMedoids = medoids;
            // save the result
            clustering = new ClusteringResult();
            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(centroids[i].Items);
            }
        }
    }
    return clustering;
}
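// The seed-selection step above is worth isolating: instead of keeping the
// first random sample, three candidate seed sets are drawn and the one whose
// members are on average least similar to each other wins, since spread-out
// seeds tend to yield better partitions. Below is a minimal standalone
// sketch of that heuristic using plain dense vectors; the helper names
// (CosineSim, PickSeeds) are illustrative, not part of the library.
using System;
using System.Linq;

static class SeedSelection
{
    // Cosine similarity of two dense vectors (illustrative stand-in for
    // DotProductSimilarity over normalized sparse vectors).
    static double CosineSim(double[] a, double[] b)
    {
        double dot = 0, na = 0, nb = 0;
        for (int i = 0; i < a.Length; i++)
        {
            dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i];
        }
        return dot / (Math.Sqrt(na) * Math.Sqrt(nb) + 1e-12);
    }

    // Draws 'rounds' random k-subsets of the data (data.Length >= k assumed)
    // and keeps the subset with the lowest average pairwise similarity,
    // mirroring the seed-selection loop in Cluster above.
    public static int[] PickSeeds(double[][] data, int k, int rounds, Random rnd)
    {
        int[] best = null;
        double minSim = double.MaxValue;
        for (int r = 0; r < rounds; r++)
        {
            int[] sample = Enumerable.Range(0, data.Length)
                .OrderBy(_ => rnd.Next()).Take(k).ToArray();
            double simAvg = 0;
            for (int i = 0; i < k; i++)
            {
                for (int j = 0; j < k; j++)
                {
                    if (i != j) { simAvg += CosineSim(data[sample[i]], data[sample[j]]); }
                }
            }
            simAvg /= (double)(k * k - k); // average over ordered pairs
            if (simAvg < minSim) { minSim = simAvg; best = sample; }
        }
        return best;
    }
}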
public override void Run(object[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
    // Create a tokenizer.
    UnicodeTokenizer tokenizer = new UnicodeTokenizer();
    tokenizer.MinTokenLen = 2; // each token must be at least 2 characters long
    tokenizer.Filter = TokenizerFilter.AlphaStrict; // tokens can consist of alphabetic characters only
    // Load a document corpus from a file. Each line in the file
    // represents one document.
    string[] docs = File.ReadAllLines(@"Data\YahooFinance.txt");
    // Create a bag-of-words space.
    BowSpace bowSpc = new BowSpace();
    bowSpc.Tokenizer = tokenizer; // assign the tokenizer
    bowSpc.StopWords = stopWords; // assign the stop words
    bowSpc.Stemmer = stemmer; // assign the stemmer
    bowSpc.MinWordFreq = 3; // a term must appear at least 3 times in the corpus to enter the vocabulary
    bowSpc.MaxNGramLen = 3; // consider terms of at most 3 consecutive words
    bowSpc.WordWeightType = WordWeightType.TfIdf; // weight the bag-of-words vectors with TF-IDF
    bowSpc.NormalizeVectors = true; // normalize the TF-IDF vectors
    bowSpc.CutLowWeightsPerc = 0.2; // from each vector, remove the lowest-weighted terms summing up to 20% of the overall weight
    ArrayList<SparseVector<double>> sparseVectors = bowSpc.Initialize(docs); // initialize the BOW space
    UnlabeledDataset<SparseVector<double>> ud = new UnlabeledDataset<SparseVector<double>>(sparseVectors);
    // Compute 100 clusters of documents.
    KMeansClusteringFast kMeans = new KMeansClusteringFast(100); // set k to 100
    kMeans.Trials = 3; // perform 3 repetitions and take the best result
    kMeans.Eps = 0.001; // stop iterating when the partition quality improves by less than 0.001
    ClusteringResult cr = kMeans.Cluster(ud); // execute
    // Extract the top 5 terms with the highest TF-IDF weights from each
    // cluster's centroid and output the number of documents (companies)
    // in each cluster.
    foreach (Cluster cl in cr.Roots)
    {
        SparseVector<double>.ReadOnly centroid = cl.ComputeCentroid(ud, CentroidType.NrmL2);
        Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
        Output.WriteLine(" ({0} companies)", cl.Items.Count);
    }
    // Output the documents contained in the first cluster.
    foreach (int docIdx in cr.Roots[0].Items)
    {
        Output.WriteLine(docs[docIdx]);
    }
}
public Vector2D[] ComputeLayout(LayoutSettings settings)
{
    UnlabeledDataset<SparseVector<double>> dataset = new UnlabeledDataset<SparseVector<double>>(mDataset);
    // clustering
    mLogger.Info("ComputeLayout", "Clustering ...");
    KMeansFast kMeans = new KMeansFast(mKClust);
    kMeans.Eps = mKMeansEps;
    kMeans.Random = mRandom;
    kMeans.Trials = 1;
    ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException
    // determine reference instances
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    foreach (Cluster cluster in clustering.Roots)
    {
        SparseVector<double> centroid = cluster.Items.Count > 0
            ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2)
            : new SparseVector<double>();
        dsRefInst.Add(centroid); // dataset of reference instances
        dataset.Add(centroid); // add centroids to the main dataset
    }
    // position reference instances
    mLogger.Info("ComputeLayout", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    Vector2D[] centrPos = sm.ComputeLayout();
    // k-NN
    mLogger.Info("ComputeLayout", "Computing similarities ...");
    simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/true);
    mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
    {
        if (simMtxRow.Dat.Count <= 1)
        {
            mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
        }
        ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
        foreach (IdxDat<double> item in simMtxRow.Dat)
        {
            if (item.Idx != simMtxRow.Idx)
            {
                knn.Add(new KeyDat<double, int>(item.Dat, item.Idx));
            }
        }
        knn.Sort(DescSort<KeyDat<double, int>>.Instance);
        int count = Math.Min(knn.Count, mKNN);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(knn[i].Dat);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[simMtxRow.Idx] = 1;
        lsqrDs.Add(0, eq);
    }
    Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
    for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(centrPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].X = lsqr.Solution[i];
    }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
    {
        lsqrDs[i].Label = centrPos[j].Y;
    }
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].Y = lsqr.Solution[i];
    }
    return settings == null ? layout : settings.AdjustLayout(layout);
}
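// The system assembled above has one row per instance and one per centroid.
// Each instance row constrains the point to lie at the unweighted mean of
// its (at most mKNN) nearest neighbors N_i:
//
//     x_i - (1 / |N_i|) * sum_{j in N_i} x_j = 0
//
// while each centroid row pins a reference instance to its
// stress-majorization position:
//
//     x_c = centrPos_c
//
// The coefficient matrix is identical for both coordinates, so the code
// solves the system twice with LSQR, relabeling only the right-hand side
// (centrPos[j].X, then centrPos[j].Y) between the two Train calls.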
public IncrementalSemanticSpaceLayout(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    mDataset = new UnlabeledDataset<SparseVector<double>>(dataset);
}
// TODO: exceptions
public Vector2D[] Update(int numDequeue, IEnumerable<SparseVector<double>> newInst, bool test,
    LayoutSettings settings, ref PtInfo[] ptInfo, int _count)
{
    // clustering
    mLogger.Info("Update", "Clustering ...");
    /*prof*/ StopWatch sw = new StopWatch();
    mKMeans.Eps = mKMeansEps;
    int iter = 0;
    mKMeans.Update(numDequeue, newInst, ref iter);
    /*prof*/ sw.Save("cl.txt", _count, iter.ToString());
    // determine reference instances
    /*prof*/ sw.Reset();
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    UnlabeledDataset<SparseVector<double>> dsNewInst = new UnlabeledDataset<SparseVector<double>>(newInst);
    foreach (SparseVector<double> centroid in mKMeans.GetCentroids())
    {
        dsRefInst.Add(centroid); // dataset of reference instances
        dsNewInst.Add(centroid); // dataset of new instances
    }
    // position reference instances
    mLogger.Info("Update", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    sm.MaxSteps = int.MaxValue;
    sm.MinDiff = 1E-3;
    mRefPos = sm.ComputeLayout(/*settings=*/null, mRefPos /*make this a property!!!*/);
    /*prof*/ sw.Save("sm.txt", _count);
    // k-NN
    /*prof*/ sw.Reset();
    DateTime t = DateTime.Now;
    mLogger.Info("Update", "Computing similarities ...");
    // update the list of neighborhoods
    mPatches.RemoveRange(mDataset.Count - mKClust, mKClust);
    mPatches.RemoveRange(0, numDequeue);
    // remove instances from [dataset and] neighborhoods
    foreach (Patch patch in mPatches)
    {
        if (patch.Min != null && (patch.Min.Idx < numDequeue || patch.Max.Idx >= mDataset.Count - mKClust))
        {
            int oldCount = patch.List.Count;
            ArrayList<KeyDat<double, Patch>> tmp = new ArrayList<KeyDat<double, Patch>>();
            foreach (KeyDat<double, Patch> item in patch.List)
            {
                if (item.Dat.Idx >= numDequeue && item.Dat.Idx < mDataset.Count - mKClust)
                {
                    tmp.Add(item);
                }
            }
            patch.List = tmp;
            patch.ProcessList();
            patch.NeedUpdate = patch.List.Count < mKNn && oldCount >= mKNn;
        }
    }
    // update dataset
    mDataset.RemoveRange(mDataset.Count - mKClust, mKClust);
    mDataset.RemoveRange(0, numDequeue);
    // add new instances to the dataset
    int preAddCount = mDataset.Count;
    mDataset.AddRange(dsNewInst);
    // precompute transposed matrices
    SparseMatrix<double> trNewInst = ModelUtils.GetTransposedMatrix(dsNewInst);
    SparseMatrix<double> trDataset = ModelUtils.GetTransposedMatrix(mDataset);
    // add new instances to neighborhoods
    for (int i = 0; i < dsNewInst.Count; i++)
    {
        mPatches.Add(new Patch(-1));
        mPatches.Last.NeedUpdate = true;
    }
    for (int i = 0; i < mPatches.Count; i++)
    {
        mPatches[i].Idx = i;
    }
    for (int i = 0; i < mPatches.Count; i++)
    {
        Patch patch = mPatches[i];
        SparseVector<double> vec = mDataset[i];
        if (vec != null)
        {
            if (patch.NeedUpdate) // full update required
            {
                SparseVector<double> simVec = ModelUtils.GetDotProductSimilarity(trDataset, mDataset.Count, vec, mSimThresh);
                ArrayList<KeyDat<double, int>> tmp = new ArrayList<KeyDat<double, int>>();
                foreach (IdxDat<double> item in simVec)
                {
                    if (item.Idx != i)
                    {
                        tmp.Add(new KeyDat<double, int>(item.Dat, item.Idx));
                    }
                }
                tmp.Sort(new Comparer2());
                int count = Math.Min(tmp.Count, mKNnExt);
                patch.List.Clear();
                for (int j = 0; j < count; j++)
                {
                    patch.List.Add(new KeyDat<double, Patch>(tmp[j].Key, mPatches[tmp[j].Dat]));
                }
                patch.ProcessList();
                patch.NeedUpdate = false;
            }
            else // only the new instances need to be considered
            {
                SparseVector<double> simVec = ModelUtils.GetDotProductSimilarity(trNewInst, dsNewInst.Count, vec, mSimThresh);
                // check if further processing is needed
                bool needMerge = false;
                if (test)
                {
                    foreach (IdxDat<double> item in simVec)
                    {
                        if (item.Dat >= patch.MinSim)
                        {
                            needMerge = true;
                            break;
                        }
                    }
                }
                else
                {
                    foreach (IdxDat<double> item in simVec)
                    {
                        if (item.Dat > patch.MinSim)
                        {
                            needMerge = true;
                            break;
                        }
                    }
                }
                if (needMerge || patch.List.Count < mKNn)
                {
                    int oldCount = patch.List.Count;
                    ArrayList<KeyDat<double, Patch>> tmp = new ArrayList<KeyDat<double, Patch>>();
                    foreach (IdxDat<double> item in simVec)
                    {
                        tmp.Add(new KeyDat<double, Patch>(item.Dat, mPatches[item.Idx + preAddCount]));
                    }
                    // merge the two lists
                    // TODO: speed this up
                    patch.List.AddRange(tmp);
                    patch.List.Sort(new Comparer());
                    // trim the list to size
                    if (oldCount >= mKNn)
                    {
                        patch.List.RemoveRange(oldCount, patch.List.Count - oldCount);
                    }
                    patch.ProcessList();
                }
            }
        }
    }
    /*prof*/ sw.Save("knn.txt", _count);
    // *** Test: compare the incrementally updated neighborhoods with
    // neighborhoods computed from scratch ***
    sw.Reset();
    ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/true);
    sw.Save("selfSim.txt", _count, mDataset.Count.ToString());
    if (test)
    {
        simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/true);
        ArrayList<Patch> patches = new ArrayList<Patch>();
        for (int i = 0; i < mDataset.Count; i++)
        {
            patches.Add(new Patch(i));
        }
        foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
        {
            if (simMtxRow.Dat.Count <= 1)
            {
                mLogger.Warn("Update", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
            }
            ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
            foreach (IdxDat<double> item in simMtxRow.Dat)
            {
                if (item.Idx != simMtxRow.Idx)
                {
                    knn.Add(new KeyDat<double, int>(item.Dat, item.Idx));
                }
            }
            knn.Sort(new Comparer2());
            int count = Math.Min(knn.Count, mKNnExt);
            for (int i = 0; i < count; i++)
            {
                patches[simMtxRow.Idx].List.Add(new KeyDat<double, Patch>(knn[i].Key, patches[knn[i].Dat]));
            }
            patches[simMtxRow.Idx].ProcessList();
        }
        // compare
        if (patches.Count != mPatches.Count) { throw new Exception("Count mismatch."); }
        for (int i = 0; i < mPatches.Count; i++)
        {
            if (patches[i].List.Count < mKNn && patches[i].List.Count != mPatches[i].List.Count)
            {
                Console.WriteLine(mPatches[i].List.Count);
                Console.WriteLine(patches[i].List.Count);
                Output(mPatches[i].List);
                Output(patches[i].List);
                Console.WriteLine(i);
                throw new Exception("List count mismatch.");
            }
            int count = Math.Min(mPatches[i].List.Count, mKNn);
            for (int j = 0; j < count; j++)
            {
                if (mPatches[i].List[j].Key != patches[i].List[j].Key ||
                    mPatches[i].List[j].Dat.Idx != patches[i].List[j].Dat.Idx)
                {
                    Console.WriteLine("i:{4} fast:{0}-{1} slow:{2}-{3}", mPatches[i].List[j].Key,
                        mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                    int idxFast = mPatches[i].List[j].Dat.Idx;
                    int idxSlow = patches[i].List[j].Dat.Idx;
                    Console.WriteLine("slow @ fast idx: {0}", GetKey(patches[i].List, idxFast));
                    Console.WriteLine("fast @ slow idx: {0}", GetKey(mPatches[i].List, idxSlow));
                    throw new Exception("Patch item mismatch.");
                }
            }
        }
    }
    // *** End of test ***
    Console.WriteLine((DateTime.Now - t).TotalMilliseconds);
    /*prof*/ sw.Reset();
    mLogger.Info("Update", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
    foreach (Patch patch in mPatches)
    {
        int count = Math.Min(patch.List.Count, mKNn);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(patch.List[i].Dat.Idx);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[patch.Idx] = 1;
        lsqrDs.Add(0, eq);
    }
    for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(mRefPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    // reuse the previous X solution as the initial guess: drop the dequeued
    // instances, insert zeros for the new ones, and keep the centroids last
    mSolX.RemoveRange(0, numDequeue);
    double[] aux = new double[mKClust];
    mSolX.CopyTo(mSolX.Count - mKClust, aux, 0, mKClust);
    mSolX.RemoveRange(mSolX.Count - mKClust, mKClust);
    foreach (SparseVector<double> newVec in newInst)
    {
        mSolX.Add(0);
    }
    mSolX.AddRange(aux);
    lsqr.InitialSolution = mSolX.ToArray();
    lsqr.Train(lsqrDs);
    mSolX = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].X = lsqr.Solution[i];
    }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
    {
        lsqrDs[i].Label = mRefPos[j].Y;
    }
    // repeat for the Y coordinates
    mSolY.RemoveRange(0, numDequeue);
    aux = new double[mKClust];
    mSolY.CopyTo(mSolY.Count - mKClust, aux, 0, mKClust);
    mSolY.RemoveRange(mSolY.Count - mKClust, mKClust);
    foreach (SparseVector<double> newVec in newInst)
    {
        mSolY.Add(0);
    }
    mSolY.AddRange(aux);
    lsqr.InitialSolution = mSolY.ToArray();
    lsqr.Train(lsqrDs);
    mSolY = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].Y = lsqr.Solution[i];
    }
    /*prof*/ sw.Save("lsqr.txt", _count);
    // -----------------------------------------------------------------
    // make ptInfo
    // -----------------------------------------------------------------
    ptInfo = new PtInfo[layout.Length];
    int ii = 0;
    foreach (Vector2D pt in layout)
    {
        ptInfo[ii] = new PtInfo();
        ptInfo[ii].X = pt.X;
        ptInfo[ii].Y = pt.Y;
        ptInfo[ii].Vec = mDataset[ii];
        ii++;
    }
    // -----------------------------------------------------------------
    return settings == null ? layout : settings.AdjustLayout(layout);
}
public Vector2D[] ComputeLayout(LayoutSettings settings)
{
    // clustering
    mLogger.Info("ComputeLayout", "Clustering ...");
    mKMeans = new IncrementalKMeans(mKClust);
    mKMeans.Eps = mKMeansEps;
    mKMeans.Random = mRandom;
    mKMeans.Trials = 3;
    ClusteringResult clustering = mKMeans.Cluster(mDataset); // throws ArgumentValueException
    // determine reference instances
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    foreach (SparseVector<double> centroid in mKMeans.GetCentroids())
    {
        dsRefInst.Add(centroid); // dataset of reference instances
        mDataset.Add(centroid); // add centroids to the main dataset
    }
    // position reference instances
    mLogger.Info("ComputeLayout", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    sm.MaxSteps = int.MaxValue;
    sm.MinDiff = 0.00001;
    mRefPos = sm.ComputeLayout();
    // k-NN
    mLogger.Info("ComputeLayout", "Computing similarities ...");
    simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/true);
    mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    mPatches = new ArrayList<Patch>(mDataset.Count);
    for (int i = 0; i < mDataset.Count; i++)
    {
        mPatches.Add(new Patch(i));
    }
    foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
    {
        if (simMtxRow.Dat.Count <= 1)
        {
            mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
        }
        ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
        foreach (IdxDat<double> item in simMtxRow.Dat)
        {
            if (item.Idx != simMtxRow.Idx)
            {
                knn.Add(new KeyDat<double, int>(item.Dat, item.Idx));
            }
        }
        knn.Sort(DescSort<KeyDat<double, int>>.Instance);
        int count = Math.Min(knn.Count, mKNnExt);
        for (int i = 0; i < count; i++)
        {
            mPatches[simMtxRow.Idx].List.Add(new KeyDat<double, Patch>(knn[i].Key, mPatches[knn[i].Dat]));
        }
        mPatches[simMtxRow.Idx].ProcessList();
        count = Math.Min(knn.Count, mKNn);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(knn[i].Dat);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[simMtxRow.Idx] = 1;
        lsqrDs.Add(0, eq);
    }
    Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
    for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(mRefPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    lsqr.Train(lsqrDs);
    mSolX = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].X = lsqr.Solution[i];
    }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
    {
        lsqrDs[i].Label = mRefPos[j].Y;
    }
    lsqr.Train(lsqrDs);
    mSolY = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].Y = lsqr.Solution[i];
    }
    return settings == null ? layout : settings.AdjustLayout(layout);
}
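// Note: besides returning the layout, ComputeLayout initializes all the
// state that the incremental Update method shown earlier depends on:
// mPatches (the kNN neighborhoods), mRefPos (the reference-instance
// positions), and mSolX/mSolY (the LSQR solutions, which Update reuses
// as initial guesses for warm-starting the solver).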
public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection<SparseVector<double>> batch)
{
    Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
    Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
    if (mDataset == null)
    {
        // initialize
        mLogger.Info("Cluster", "Initializing ...");
        Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
        if (batch.Count == 0) { return new ClusteringResult(); }
        kMeans(batch, Math.Min(mK, batch.Count));
        mDataset = new UnlabeledDataset<SparseVector<double>>(batch);
        foreach (CentroidData centroid in mCentroids)
        {
            centroid.Tag = mTopicId++;
        }
    }
    else
    {
        // update clusters
        Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
        if (numOutdated == 0 && batch.Count == 0) { return GetClusteringResult(); }
        mLogger.Info("Cluster", "Updating clusters ...");
        // assign new instances
        double dummy;
        Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/mDataset.Count, out dummy);
        mDataset.AddRange(batch);
        // remove outdated instances
        foreach (CentroidData centroid in mCentroids)
        {
            foreach (int item in centroid.CurrentItems)
            {
                if (item >= numOutdated) { centroid.Items.Add(item); }
            }
            centroid.Update(mDataset);
            centroid.UpdateCentroidLen();
        }
        mDataset.RemoveRange(0, numOutdated);
        // drop empty clusters and shift the remaining item indices
        ArrayList<CentroidData> centroidsNew = new ArrayList<CentroidData>(mCentroids.Count);
        foreach (CentroidData centroid in mCentroids)
        {
            if (centroid.CurrentItems.Count > 0)
            {
                centroidsNew.Add(centroid);
                Set<int> tmp = new Set<int>();
                foreach (int idx in centroid.CurrentItems) { tmp.Add(idx - numOutdated); }
                centroid.CurrentItems.Inner.SetItems(tmp);
            }
        }
        if (centroidsNew.Count == 0) // reset
        {
            mCentroids = null;
            mDataset = null;
            return new ClusteringResult();
        }
        mCentroids = centroidsNew;
        // execute the main loop
        kMeansMainLoop(mDataset, mCentroids);
    }
    // adjust k
    double minQual; // *** not used at the moment
    int minQualIdx;
    double qual = GetClustQual(out minQual, out minQualIdx);
    if (qual < mQualThresh)
    {
        while (qual < mQualThresh) // split the cluster at minQualIdx
        {
            mLogger.Info("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
            mCentroids.Add(mCentroids[minQualIdx].Clone());
            mCentroids.Last.Tag = mTopicId++;
            kMeansMainLoop(mDataset, mCentroids);
            if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
            {
                // swap topic identifiers so that the larger cluster keeps the old topic
                object tmp = mCentroids.Last.Tag;
                mCentroids.Last.Tag = mCentroids[minQualIdx].Tag;
                mCentroids[minQualIdx].Tag = tmp;
            }
            qual = GetClustQual(out minQual, out minQualIdx);
        }
    }
    else if (numOutdated > 0)
    {
        while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
        {
            mLogger.Info("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
            ArrayList<CentroidData> centroidsCopy = mCentroids.DeepClone();
            if (mCentroids.Count == 2) // create a single cluster
            {
                object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count
                    ? mCentroids[0].Tag : mCentroids[1].Tag;
                mCentroids = new ArrayList<CentroidData>();
                mCentroids.Add(new CentroidData());
                for (int i = 0; i < mDataset.Count; i++) { mCentroids.Last.Items.Add(i); }
                mCentroids.Last.Tag = topicId;
                mCentroids.Last.Update(mDataset);
                mCentroids.Last.UpdateCentroidLen();
            }
            else
            {
                int idx1, idx2;
                GetMostSimilarClusters(out idx1, out idx2);
                CentroidData c1 = mCentroids[idx1];
                CentroidData c2 = mCentroids[idx2];
                object topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                mCentroids.RemoveAt(idx2);
                c1.Items.AddRange(c1.CurrentItems);
                c1.Items.AddRange(c2.CurrentItems);
                c1.Tag = topicId;
                c1.Update(mDataset);
                c1.UpdateCentroidLen();
                kMeansMainLoop(mDataset, mCentroids);
            }
            qual = GetClustQual();
            if (qual >= mQualThresh)
            {
                mLogger.Info("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
            }
            else
            {
                mCentroids = centroidsCopy; // revert to the saved copy
            }
        }
    }
    OutputState();
    return GetClusteringResult();
}
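// Usage sketch for the incremental clusterer above: maintain a sliding
// window over a document stream, dequeuing the oldest instances as new
// ones arrive. 'stream' and 'windowSize' are assumed names introduced for
// illustration; only the Cluster(numOutdated, batch) call is real.
public ClusteringResult ClusterSlidingWindow(
    IEnumerable<UnlabeledDataset<SparseVector<double>>> stream, int windowSize)
{
    ClusteringResult result = null;
    int windowCount = 0;
    foreach (UnlabeledDataset<SparseVector<double>> batch in stream)
    {
        // drop just enough of the oldest instances to keep the window size
        int numOutdated = Math.Max(0, windowCount + batch.Count - windowSize);
        windowCount += batch.Count - numOutdated;
        result = Cluster(numOutdated, batch);
    }
    return result;
}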
public void Load(BinarySerializer reader)
{
    Utils.ThrowException(reader == null ? new ArgumentNullException("reader") : null);
    // the following statements throw serialization-related exceptions
    mRnd = (Random)reader.ReadDotNetObject();
    mEps = reader.ReadDouble();
    mTrials = reader.ReadInt();
    mK = reader.ReadInt();
    mCentroids = reader.ReadObject<ArrayList<CentroidData>>();
    mDataset = reader.ReadObject<UnlabeledDataset<SparseVector<double>>>();
    mQualThresh = reader.ReadDouble();
    mTopicId = reader.ReadLong();
}
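// Load reads the fields in a fixed order, so the matching Save must write
// them in exactly that order. The actual Save is not shown here; this is a
// sketch of what it would look like, assuming BinarySerializer exposes the
// usual Write* counterparts to the Read* calls used in Load.
public void Save(BinarySerializer writer)
{
    Utils.ThrowException(writer == null ? new ArgumentNullException("writer") : null);
    // the following statements throw serialization-related exceptions;
    // the field order mirrors Load exactly
    writer.WriteDotNetObject(mRnd);
    writer.WriteDouble(mEps);
    writer.WriteInt(mTrials);
    writer.WriteInt(mK);
    writer.WriteObject(mCentroids);
    writer.WriteObject(mDataset);
    writer.WriteDouble(mQualThresh);
    writer.WriteLong(mTopicId);
}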