public SparseVector<double> Trim(SparseVector<double> vec, int size, double cutPerc)
{
    SparseVector<double> trimmed = vec;
    if (vec.Count > size)
    {
        // sort (weight, index) pairs by weight, descending, and keep the top 'size' entries
        ArrayList<KeyDat<double, int>> tmp = new ArrayList<KeyDat<double, int>>(vec.Count);
        foreach (IdxDat<double> item in vec) { tmp.Add(new KeyDat<double, int>(item.Dat, item.Idx)); }
        tmp.Sort(DescSort<KeyDat<double, int>>.Instance);
        tmp.RemoveRange(size, tmp.Count - size);
        // restore ascending index order and rebuild the sparse vector
        ArrayList<IdxDat<double>> tmp2 = new ArrayList<IdxDat<double>>();
        foreach (KeyDat<double, int> item in tmp) { tmp2.Add(new IdxDat<double>(item.Dat, item.Key)); }
        tmp2.Sort();
        trimmed = new SparseVector<double>(tmp2);
    }
    ModelUtils.CutLowWeights(ref trimmed, cutPerc);
    ModelUtils.TryNrmVecL2(trimmed);
    return trimmed;
}
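// Added illustration (not from the library): the same "keep the largest k weights, then
// restore index order" step as Trim above, written against plain BCL collections with
// hypothetical names; a sparse vector stores its entries sorted by index, hence the
// final OrderBy.
using System.Collections.Generic;
using System.Linq;

static class TrimSketch
{
    // entries are (index, weight) pairs; returns at most k entries, index order restored
    public static List<(int Idx, double Dat)> TopK(List<(int Idx, double Dat)> entries, int k)
    {
        return entries.OrderByDescending(e => e.Dat) // largest weights first
            .Take(k)                                 // keep at most k entries
            .OrderBy(e => e.Idx)                     // sparse vectors need ascending indices
            .ToList();
    }
}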
private static SparseMatrix<double> CreateObservationMatrix<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, ref LblT[] idx_to_lbl)
{
    SparseMatrix<double> mtx = new SparseMatrix<double>();
    ArrayList<LblT> tmp = new ArrayList<LblT>();
    Dictionary<LblT, int> lbl_to_idx = new Dictionary<LblT, int>();
    // assign a row index to each distinct label
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> labeled_example in dataset)
    {
        if (!lbl_to_idx.ContainsKey(labeled_example.Label))
        {
            lbl_to_idx.Add(labeled_example.Label, lbl_to_idx.Count);
            tmp.Add(labeled_example.Label);
        }
    }
    // sum the examples of each label into that label's row
    int i = 0;
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> labeled_example in dataset)
    {
        Utils.Verbose("{0} / {1}\r", ++i, dataset.Count);
        int lbl_idx = lbl_to_idx[labeled_example.Label];
        if (!mtx.ContainsRowAt(lbl_idx))
        {
            mtx[lbl_idx] = ModelUtils.ConvertExample<SparseVector<double>>(labeled_example.Example);
        }
        else
        {
            SparseVector<double> new_vec = ModelUtils.ConvertExample<SparseVector<double>>(labeled_example.Example);
            new_vec.Merge(mtx[lbl_idx], new SumOperator());
            mtx[lbl_idx] = new_vec;
        }
    }
    Utils.VerboseLine("");
    idx_to_lbl = tmp.ToArray();
    return mtx;
}
// *** ISimilarity<SparseVector<double>.ReadOnly> interface implementation ***
public double GetSimilarity(SparseVector<double>.ReadOnly a, SparseVector<double>.ReadOnly b)
{
    Utils.ThrowException(a == null ? new ArgumentNullException("a") : null);
    Utils.ThrowException(b == null ? new ArgumentNullException("b") : null);
    double dotProd = 0;
    int i = 0, j = 0;
    int aCount = a.Count;
    Utils.ThrowException(aCount == 0 ? new ArgumentValueException("a") : null);
    int bCount = b.Count;
    Utils.ThrowException(bCount == 0 ? new ArgumentValueException("b") : null);
    ArrayList<int> aIdx = a.Inner.InnerIdx;
    ArrayList<double> aDat = a.Inner.InnerDat;
    ArrayList<int> bIdx = b.Inner.InnerIdx;
    ArrayList<double> bDat = b.Inner.InnerDat;
    int aIdx_i = aCount == 0 ? 0 : aIdx[0];
    int bIdx_j = bCount == 0 ? 0 : bIdx[0];
    // two-pointer merge over the sorted index lists; accumulate products at shared indices
    while (true)
    {
        if (aIdx_i < bIdx_j)
        {
            if (++i == aCount) { break; }
            aIdx_i = aIdx[i];
        }
        else if (aIdx_i > bIdx_j)
        {
            if (++j == bCount) { break; }
            bIdx_j = bIdx[j];
        }
        else
        {
            dotProd += aDat[i] * bDat[j];
            if (++i == aCount || ++j == bCount) { break; }
            aIdx_i = aIdx[i];
            bIdx_j = bIdx[j];
        }
    }
    // cosine similarity = dot product divided by the product of the L2 lengths
    double aLen = ModelUtils.GetVecLenL2(a);
    Utils.ThrowException(aLen == 0 ? new ArgumentValueException("a") : null);
    double bLen = ModelUtils.GetVecLenL2(b);
    Utils.ThrowException(bLen == 0 ? new ArgumentValueException("b") : null);
    double lenMult = aLen * bLen;
    return dotProd / lenMult;
}
// *** ISimilarity<SparseVector<double>.ReadOnly> interface implementation ***
public double GetSimilarity(SparseVector<double>.ReadOnly a, SparseVector<double>.ReadOnly b)
{
    Utils.ThrowException(a == null ? new ArgumentNullException("a") : null);
    Utils.ThrowException(b == null ? new ArgumentNullException("b") : null);
    double dot_prod = 0;
    int i = 0, j = 0;
    int a_count = a.Count;
    Utils.ThrowException(a_count == 0 ? new ArgumentValueException("a") : null);
    int b_count = b.Count;
    Utils.ThrowException(b_count == 0 ? new ArgumentValueException("b") : null);
    ArrayList<int> a_idx = a.Inner.InnerIdx;
    ArrayList<double> a_dat = a.Inner.InnerDat;
    ArrayList<int> b_idx = b.Inner.InnerIdx;
    ArrayList<double> b_dat = b.Inner.InnerDat;
    int a_idx_i = a_count == 0 ? 0 : a_idx[0];
    int b_idx_j = b_count == 0 ? 0 : b_idx[0];
    while (true)
    {
        if (a_idx_i < b_idx_j)
        {
            if (++i == a_count) { break; }
            a_idx_i = a_idx[i];
        }
        else if (a_idx_i > b_idx_j)
        {
            if (++j == b_count) { break; }
            b_idx_j = b_idx[j];
        }
        else
        {
            dot_prod += a_dat[i] * b_dat[j];
            if (++i == a_count || ++j == b_count) { break; }
            a_idx_i = a_idx[i];
            b_idx_j = b_idx[j];
        }
    }
    double len_a = ModelUtils.GetVecLenL2(a);
    Utils.ThrowException(len_a == 0 ? new ArgumentValueException("a") : null);
    double len_b = ModelUtils.GetVecLenL2(b);
    Utils.ThrowException(len_b == 0 ? new ArgumentValueException("b") : null);
    double len_mult = len_a * len_b;
    return dot_prod / len_mult;
}
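// Added illustration (not from the library): the merge-join at the heart of both
// GetSimilarity variants above, reduced to plain arrays with hypothetical names. Both
// index lists are sorted ascending, so one forward pass over each computes the dot
// product in O(|a| + |b|).
static class SparseDotSketch
{
    public static double Dot(int[] aIdx, double[] aDat, int[] bIdx, double[] bDat)
    {
        double dot = 0;
        int i = 0, j = 0;
        while (i < aIdx.Length && j < bIdx.Length)
        {
            if (aIdx[i] < bIdx[j]) { i++; }        // a is behind: advance a
            else if (aIdx[i] > bIdx[j]) { j++; }   // b is behind: advance b
            else { dot += aDat[i++] * bDat[j++]; } // shared index: multiply, advance both
        }
        return dot;
    }
}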
private void Update(SparseMatrix<double> simMtx, SparseMatrix<double> clustMtxTr, int numClusters, int idx1, int idx2, ArrayList<Cluster> clusters, IUnlabeledExampleCollection<SparseVector<double>> dataset, double damping)
{
    Debug.Assert(idx1 < idx2);
    // create new parent
    Cluster c1 = clusters[idx1];
    Cluster c2 = clusters[idx2];
    Cluster parent = new Cluster();
    parent.Items.AddRange(c1.Items);
    parent.Items.AddRange(c2.Items);
    parent.ClusterInfo = Math.Max((int)c1.ClusterInfo, (int)c2.ClusterInfo) + 1;
    c1.Parent = parent;
    c2.Parent = parent;
    parent.AddChild(c1);
    parent.AddChild(c2);
    SparseVector<double> centroid = ModelUtils.ComputeCentroid(parent.Items, dataset, CentroidType.NrmL2);
    centroid = Trim(centroid, 1000, 0.8);
    // remove clusters
    clusters.RemoveAt(idx2);
    clusters.RemoveAt(idx1);
    // add new parent
    clusters.Add(parent);
    // remove rows at idx1 and idx2
    simMtx.PurgeRowAt(idx2);
    simMtx.PurgeRowAt(idx1);
    // remove cols at idx1 and idx2
    simMtx.PurgeColAt(idx2);
    simMtx.PurgeColAt(idx1);
    clustMtxTr.PurgeColAt(idx2);
    clustMtxTr.PurgeColAt(idx1);
    // update matrices
    numClusters -= 2;
    foreach (IdxDat<double> item in centroid)
    {
        if (clustMtxTr[item.Idx] == null)
        {
            clustMtxTr[item.Idx] = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(numClusters, item.Dat) });
        }
        else
        {
            clustMtxTr[item.Idx].InnerIdx.Add(numClusters);
            clustMtxTr[item.Idx].InnerDat.Add(item.Dat);
        }
    }
    double[] simVec = ModelUtils.GetDotProductSimilarity(clustMtxTr, numClusters + 1, centroid);
    for (int i = 0; i < simVec.Length; i++)
    {
        simVec[i] *= Math.Pow(damping, (double)((int)parent.ClusterInfo + (int)clusters[i].ClusterInfo) / 2.0);
    }
    SparseMatrix<double> col = new SparseMatrix<double>();
    col[0] = new SparseVector<double>(simVec);
    simMtx.AppendCols(col.GetTransposedCopy(), numClusters);
}
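// Added sketch (not from the library, names hypothetical): the Math.Pow factor in Update
// above scales the similarity of a cluster pair by damping^((depth_a + depth_b) / 2),
// where depth is the merge-tree height stored in ClusterInfo. Deeper (already large)
// clusters thus look less similar, which keeps the agglomerative hierarchy from growing
// lopsided.
using System;

static class DampingSketch
{
    public static double DampedSimilarity(double sim, int depthA, int depthB, double damping)
    {
        return sim * Math.Pow(damping, (depthA + depthB) / 2.0); // damping in (0, 1]
    }
}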
public Prediction<LblT> Predict(SparseVector<double> example)
{
    Utils.ThrowException(mDatasetMtx == null ? new InvalidOperationException() : null);
    Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);
    // rank all training examples by dot-product similarity to the query
    ArrayList<KeyDat<double, LblT>> tmp = new ArrayList<KeyDat<double, LblT>>(mLabels.Count);
    double[] dotProdSimVec = ModelUtils.GetDotProductSimilarity(mDatasetMtx, mLabels.Count, example);
    for (int i = 0; i < mLabels.Count; i++)
    {
        tmp.Add(new KeyDat<double, LblT>(dotProdSimVec[i], mLabels[i]));
    }
    tmp.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    // let the k nearest neighbors vote
    Dictionary<LblT, double> voting = new Dictionary<LblT, double>(mLblCmp);
    int n = Math.Min(mK, tmp.Count);
    double value;
    if (mSoftVoting) // "soft" voting: each neighbor votes with its similarity score
    {
        for (int i = 0; i < n; i++)
        {
            KeyDat<double, LblT> item = tmp[i];
            if (!voting.TryGetValue(item.Dat, out value)) { voting.Add(item.Dat, item.Key); }
            else { voting[item.Dat] = value + item.Key; }
        }
    }
    else // normal voting: each neighbor casts one vote
    {
        for (int i = 0; i < n; i++)
        {
            KeyDat<double, LblT> item = tmp[i];
            if (!voting.TryGetValue(item.Dat, out value)) { voting.Add(item.Dat, 1); }
            else { voting[item.Dat] = value + 1.0; }
        }
    }
    Prediction<LblT> classifierResult = new Prediction<LblT>();
    foreach (KeyValuePair<LblT, double> item in voting)
    {
        classifierResult.Inner.Add(new KeyDat<double, LblT>(item.Value, item.Key));
    }
    classifierResult.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    return classifierResult;
}
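// Added illustration (not from the library): the two k-NN voting schemes used by Predict
// above, on BCL types with hypothetical names; hard voting counts neighbors per label,
// soft voting sums their similarity scores instead.
using System.Collections.Generic;

static class VotingSketch
{
    public static Dictionary<string, double> Vote(IList<(double Sim, string Label)> kNearest, bool soft)
    {
        Dictionary<string, double> votes = new Dictionary<string, double>();
        foreach ((double sim, string label) in kNearest)
        {
            votes.TryGetValue(label, out double v);
            votes[label] = v + (soft ? sim : 1.0); // similarity-weighted vote vs. one vote each
        }
        return votes;
    }
}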
public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    // store the training set transposed so Predict can compute all similarities in one pass
    mDatasetMtx = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
    mLabels = new ArrayList<LblT>();
    foreach (LabeledExample<LblT, SparseVector<double>> labeledExample in dataset)
    {
        mLabels.Add(labeledExample.Label);
    }
}
public static ClassifierResult<LblT> Classify<LblT>(BinaryVector<int>.ReadOnly bin_vec, SparseMatrix<double>.ReadOnly lambdas, LblT[] idx_to_lbl)
{
    DotProductSimilarity dot_prod = new DotProductSimilarity();
    SparseVector<double> vec = ModelUtils.ConvertExample<SparseVector<double>>(bin_vec);
    ArrayList<KeyDat<double, LblT>> scores = new ArrayList<KeyDat<double, LblT>>();
    foreach (IdxDat<SparseVector<double>.ReadOnly> row in lambdas)
    {
        double score = Math.Exp(dot_prod.GetSimilarity(row.Dat, vec));
        scores.Add(new KeyDat<double, LblT>(score, idx_to_lbl[row.Idx]));
    }
    return new ClassifierResult<LblT>(scores);
    // *** for some reason, the code below is slower than the one currently in use
    /*ClassifierResult<LblT> classifier_result = new ClassifierResult<LblT>();
    foreach (IdxDat<SparseVector<double>.ReadOnly> row in lambdas)
    {
        int i = 0, j = 0;
        int a_count = bin_vec.Count;
        int b_count = row.Dat.Count;
        double dot_prod = 0;
        List<int> a_idx = bin_vec.Inner.Inner;
        ArrayList<int> b_idx = row.Dat.Inner.InnerIdx;
        ArrayList<double> b_dat = row.Dat.Inner.InnerDat;
        int a_idx_i = a_idx[0];
        int b_idx_j = b_idx[0];
        while (true)
        {
            if (a_idx_i < b_idx_j)
            {
                if (++i == a_count) { break; }
                a_idx_i = a_idx[i];
            }
            else if (a_idx_i > b_idx_j)
            {
                if (++j == b_count) { break; }
                b_idx_j = b_idx[j];
            }
            else
            {
                dot_prod += b_dat[j];
                if (++i == a_count || ++j == b_count) { break; }
                a_idx_i = a_idx[i];
                b_idx_j = b_idx[j];
            }
        }
        double score = Math.Exp(dot_prod);
        classifier_result.Inner.Add(new KeyDat<double, LblT>(score, idx_to_lbl[row.Idx]));
    }
    classifier_result.Inner.Sort(new DescSort<KeyDat<double, LblT>>());
    return classifier_result;*/
}
public Prediction<LblT> Predict(SparseVector<double> example)
{
    Utils.ThrowException(mCentroidMtxTr == null ? new InvalidOperationException() : null);
    Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);
    Prediction<LblT> result = new Prediction<LblT>();
    // one dot product per centroid, computed against the transposed centroid matrix
    double[] dotProdSimVec = ModelUtils.GetDotProductSimilarity(mCentroidMtxTr, mLabels.Count, example);
    for (int i = 0; i < dotProdSimVec.Length; i++)
    {
        result.Inner.Add(new KeyDat<double, LblT>(dotProdSimVec[i], mLabels[i]));
    }
    result.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    return result;
}
private double GetClusterQuality(IUnlabeledExampleCollection<SparseVector<double>> dataset, out SparseVector<double> centroid)
{
    // compute centroid
    centroid = ModelUtils.ComputeCentroid(dataset, CentroidType.NrmL2);
    // compute intra-cluster similarities
    double[] simData = ModelUtils.GetDotProductSimilarity(dataset, centroid);
    // compute cluster quality as the average example-to-centroid similarity
    double quality = 0;
    for (int i = 0; i < simData.Length; i++) { quality += simData[i]; }
    quality /= (double)simData.Length;
    return quality;
}
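// Added illustration (not from the library): what GetClusterQuality above measures, shown
// on dense arrays with hypothetical names. With unit-length examples and an L2-normalized
// centroid, the average dot product is the average cosine of members to their centroid:
// 1.0 for a perfectly tight cluster, near 0 for unrelated members.
static class ClusterQualitySketch
{
    public static double AvgCentroidSimilarity(double[][] examples, double[] centroid)
    {
        double sum = 0;
        foreach (double[] ex in examples)
        {
            double dot = 0;
            for (int i = 0; i < ex.Length; i++) { dot += ex[i] * centroid[i]; }
            sum += dot;
        }
        return sum / examples.Length;
    }
}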
public IUnlabeledDataset ConvertDataset(Type newExType, bool move)
{
    Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);
    Utils.ThrowException(move && typeof(ExT).IsValueType ? new ArgumentValueException("newExType") : null);
    IUnlabeledDataset newDataset = null;
    ArrayList<object> tmp = new ArrayList<object>(mItems.Count);
    for (int i = 0; i < mItems.Count; i++)
    {
        tmp.Add(ModelUtils.ConvertExample(mItems[i], newExType)); // throws ArgumentValueException
        if (move) { mItems[i] = default(ExT); } // *** this is guaranteed to be null by the second assertion
    }
    if (move) { mItems.Clear(); }
    if (newExType == typeof(SparseVector<double>))
    {
        newDataset = new UnlabeledDataset<SparseVector<double>>(tmp);
    }
    else if (newExType == typeof(SparseVector<double>.ReadOnly))
    {
        newDataset = new UnlabeledDataset<SparseVector<double>.ReadOnly>(tmp);
    }
    else if (newExType == typeof(BinaryVector))
    {
        newDataset = new UnlabeledDataset<BinaryVector>(tmp);
    }
    else if (newExType == typeof(BinaryVector.ReadOnly))
    {
        newDataset = new UnlabeledDataset<BinaryVector.ReadOnly>(tmp);
    }
    else
    {
        throw new ArgumentNotSupportedException("newExType");
    }
    return newDataset;
}
internal void Assign(ArrayList<CentroidData> centroids, SparseMatrix<double> dataMtx, int instCount, int offs, out double clustQual)
{
    int k = centroids.Count;
    double[][] dotProd = new double[k][];
    clustQual = 0;
    // one pass per centroid: similarity of the centroid to every instance
    int i = 0;
    foreach (CentroidData cen in centroids)
    {
        SparseVector<double> cenVec = cen.GetSparseVector();
        dotProd[i++] = ModelUtils.GetDotProductSimilarity(dataMtx, instCount, cenVec);
    }
    // assign each instance to its most similar centroid (ties broken at random)
    for (int instIdx = 0; instIdx < instCount; instIdx++)
    {
        double maxSim = double.MinValue;
        ArrayList<int> candidates = new ArrayList<int>();
        for (int cenIdx = 0; cenIdx < k; cenIdx++)
        {
            double sim = dotProd[cenIdx][instIdx];
            if (sim > maxSim)
            {
                maxSim = sim;
                candidates.Clear();
                candidates.Add(cenIdx);
            }
            else if (sim == maxSim)
            {
                candidates.Add(cenIdx);
            }
        }
        if (candidates.Count > 1) { candidates.Shuffle(mRnd); }
        centroids[candidates[0]].Items.Add(instIdx + offs);
        clustQual += maxSim;
    }
    clustQual /= (double)instCount;
}
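// Added illustration (not from the library): the random tie-breaking used by Assign
// above, with hypothetical names; all centroids that reach the maximum similarity are
// collected and one is drawn at random, so ties never systematically favor index 0.
using System;
using System.Collections.Generic;

static class ArgMaxSketch
{
    public static int ArgMaxRandomTies(double[] sims, Random rnd)
    {
        double max = double.MinValue;
        List<int> best = new List<int>();
        for (int i = 0; i < sims.Length; i++)
        {
            if (sims[i] > max) { max = sims[i]; best.Clear(); best.Add(i); }
            else if (sims[i] == max) { best.Add(i); }
        }
        return best[rnd.Next(best.Count)];
    }
}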
public ILabeledDataset<LblT> ConvertDataset(Type newExType, bool move)
{
    Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);
    ILabeledDataset<LblT> newDataset = null;
    ArrayList<LabeledExample<LblT, object>> tmp = new ArrayList<LabeledExample<LblT, object>>(mItems.Count);
    for (int i = 0; i < mItems.Count; i++)
    {
        tmp.Add(new LabeledExample<LblT, object>(mItems[i].Label, ModelUtils.ConvertExample(mItems[i].Example, newExType))); // throws ArgumentValueException
        if (move) { mItems[i] = null; }
    }
    if (move) { mItems.Clear(); }
    if (newExType == typeof(SparseVector<double>))
    {
        newDataset = new LabeledDataset<LblT, SparseVector<double>>(tmp);
    }
    else if (newExType == typeof(SparseVector<double>.ReadOnly))
    {
        newDataset = new LabeledDataset<LblT, SparseVector<double>.ReadOnly>(tmp);
    }
    else if (newExType == typeof(BinaryVector))
    {
        newDataset = new LabeledDataset<LblT, BinaryVector>(tmp);
    }
    else if (newExType == typeof(BinaryVector.ReadOnly))
    {
        newDataset = new LabeledDataset<LblT, BinaryVector.ReadOnly>(tmp);
    }
    else
    {
        throw new ArgumentNotSupportedException("newExType");
    }
    return newDataset;
}
private static SparseMatrix<double> TransposeDataset<LblT>(ILabeledExampleCollection<LblT, BinaryVector> dataset, bool clearDataset)
{
    SparseMatrix<double> aux = new SparseMatrix<double>();
    int i = 0;
    if (clearDataset)
    {
        foreach (LabeledExample<LblT, BinaryVector> item in dataset)
        {
            aux[i++] = ModelUtils.ConvertExample<SparseVector<double>>(item.Example);
            item.Example.Clear();
        }
    }
    else
    {
        foreach (LabeledExample<LblT, BinaryVector> item in dataset)
        {
            aux[i++] = ModelUtils.ConvertExample<SparseVector<double>>(item.Example);
        }
    }
    return aux.GetTransposedCopy();
}
private static SparseMatrix<double> TransposeDataset<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, bool clear_dataset)
{
    SparseMatrix<double> aux = new SparseMatrix<double>();
    int i = 0;
    if (clear_dataset)
    {
        foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> item in dataset)
        {
            aux[i++] = ModelUtils.ConvertExample<SparseVector<double>>(item.Example);
            item.Example.Inner.Clear(); // *** clear read-only vectors to save space
        }
    }
    else
    {
        foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> item in dataset)
        {
            aux[i++] = ModelUtils.ConvertExample<SparseVector<double>>(item.Example);
        }
    }
    return aux.GetTransposedCopy();
}
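// Added illustration (not from the library): why both TransposeDataset variants above
// store examples column-wise. With features as rows, a query's dot product against every
// example only touches the rows where the query is nonzero (the access pattern behind
// ModelUtils.GetDotProductSimilarity). Representation and names below are hypothetical.
using System.Collections.Generic;

static class TransposedDotSketch
{
    // tr[f] lists the (exampleIdx, weight) pairs of feature f across the whole dataset
    public static double[] DotWithAll(List<(int Ex, double W)>[] tr, int exampleCount, (int F, double W)[] query)
    {
        double[] dots = new double[exampleCount];
        foreach ((int f, double w) in query)
        {
            foreach ((int ex, double exW) in tr[f]) { dots[ex] += w * exW; } // scatter-accumulate
        }
        return dots;
    }
}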
public static Prediction<LblT> Classify<LblT>(BinaryVector binVec, SparseMatrix<double>.ReadOnly lambdas, LblT[] idxToLbl, bool normalize)
{
    SparseVector<double> vec = ModelUtils.ConvertExample<SparseVector<double>>(binVec);
    Prediction<LblT> scores = new Prediction<LblT>();
    double sum = 0;
    foreach (IdxDat<SparseVector<double>.ReadOnly> row in lambdas)
    {
        double score = Math.Exp(DotProductSimilarity.Instance.GetSimilarity(row.Dat, vec));
        scores.Inner.Add(new KeyDat<double, LblT>(score, idxToLbl[row.Idx]));
        sum += score;
    }
    if (normalize && sum > 0)
    {
        for (int i = 0; i < scores.Count; i++)
        {
            KeyDat<double, LblT> score = scores[i];
            scores.Inner[i] = new KeyDat<double, LblT>(score.Key / sum, score.Dat);
        }
    }
    scores.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
    return scores;
}
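// Added illustration (not from the library): with normalize == true, Classify above
// returns the standard maximum-entropy posterior softmax(lambda_y . x). Standalone
// version with hypothetical names; subtracting the maximum before exponentiating is a
// common overflow guard that the library code sidesteps by normalizing the
// already-exponentiated scores.
using System;
using System.Linq;

static class SoftmaxSketch
{
    public static double[] Softmax(double[] dotProducts)
    {
        double max = dotProducts.Max(); // guard against overflow in Exp
        double[] exp = dotProducts.Select(d => Math.Exp(d - max)).ToArray();
        double sum = exp.Sum();
        return exp.Select(e => e / sum).ToArray();
    }
}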
public void Train(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    m_centroids = new ArrayList<Pair<LblT, SparseVector<double>.ReadOnly>>();
    // group examples by label
    Dictionary<LblT, ArrayList<SparseVector<double>.ReadOnly>> tmp = new Dictionary<LblT, ArrayList<SparseVector<double>.ReadOnly>>(m_lbl_cmp);
    foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
    {
        if (!tmp.ContainsKey(labeled_example.Label))
        {
            tmp.Add(labeled_example.Label, new ArrayList<SparseVector<double>.ReadOnly>(new SparseVector<double>.ReadOnly[] { labeled_example.Example }));
        }
        else
        {
            tmp[labeled_example.Label].Add(labeled_example.Example);
        }
    }
    // compute one centroid per label
    foreach (KeyValuePair<LblT, ArrayList<SparseVector<double>.ReadOnly>> centroid_data in tmp)
    {
        SparseVector<double> centroid = ModelUtils.ComputeCentroid(centroid_data.Value, m_normalize ? CentroidType.NrmL2 : CentroidType.Avg);
        m_centroids.Add(new Pair<LblT, SparseVector<double>.ReadOnly>(centroid_data.Key, centroid));
    }
}
internal void kMeansMainLoop(IUnlabeledExampleCollection<SparseVector<double>> dataset, ArrayList<CentroidData> centroids, out double clustQual)
{
    double[][] dotProd = new double[centroids.Count][];
    SparseMatrix<double> dataMtx = ModelUtils.GetTransposedMatrix(dataset);
    int iter = 0;
    double bestClustQual = 0;
    while (true)
    {
        iter++;
        mLogger.Trace("Cluster", "Iteration {0} ...", iter);
        // assign items to clusters
        Assign(centroids, dataMtx, dataset.Count, /*offs=*/0, out clustQual);
        mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
        // update centroids
        Update(dataset, centroids);
        // check if done: stop once the quality gain drops to mEps or below
        if (iter > 1 && clustQual - bestClustQual <= mEps) { break; }
        bestClustQual = clustQual;
    }
}
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult bestClustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<SparseVector<double>> centroids = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < mK; i++) { clustering.AddRoot(new Cluster()); }
        // select seed items: try three random draws, keep the most spread-out one
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector<double>[] { dataset[tmp[i]] }, mCentroidType));
            }
            // assess quality of seed items via average pairwise similarity (lower is better)
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2) { simAvg += mSimilarity.GetSimilarity(seed1, seed2); }
                }
            }
            simAvg /= (double)(mK * mK - mK);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                centroids = seeds;
            }
        }
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Trace("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots) { cluster.Items.Clear(); }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector<double> example = dataset[i];
                double maxSim = double.MinValue;
                ArrayList<int> candidates = new ArrayList<int>();
                for (int j = 0; j < mK; j++)
                {
                    SparseVector<double> centroid = centroids[j];
                    double sim = mSimilarity.GetSimilarity(example, centroid);
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(j);
                    }
                }
                if (candidates.Count > 1) { candidates.Shuffle(mRnd); }
                clustering.Roots[candidates[0]].Items.Add(i);
                clustQual += maxSim;
            }
            clustQual /= (double)dataset.Count;
            mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps) { break; }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
            }
        }
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            bestClustering = clustering;
        }
    }
    return bestClustering;
}
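// Added illustration (not from the library): the seeding heuristic used by Cluster above,
// isolated with hypothetical names. Among a few random candidate seed sets, the one whose
// members are least similar to each other (most spread out) is kept.
using System;
using System.Collections.Generic;

static class SeedingSketch
{
    public static List<int> PickSeeds(int n, int k, int restarts, Func<int, int, double> sim, Random rnd)
    {
        List<int> best = null;
        double minAvgSim = double.MaxValue;
        List<int> idx = new List<int>();
        for (int i = 0; i < n; i++) { idx.Add(i); }
        for (int r = 0; r < restarts; r++)
        {
            for (int i = 0; i < k; i++) // partial Fisher-Yates: draw k distinct items
            {
                int j = rnd.Next(i, n);
                (idx[i], idx[j]) = (idx[j], idx[i]);
            }
            double avg = 0;
            for (int a = 0; a < k; a++)
            {
                for (int b = 0; b < k; b++)
                {
                    if (a != b) { avg += sim(idx[a], idx[b]); }
                }
            }
            avg /= k * k - k; // average over ordered pairs, diagonal excluded
            if (avg < minAvgSim) { minAvgSim = avg; best = idx.GetRange(0, k); }
        }
        return best;
    }
}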
public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection<SparseVector<double>> batch)
{
    Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
    Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
    if (mDataset == null)
    {
        // initialize
        mLogger.Trace("Cluster", "Initializing ...");
        Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
        //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
        if (batch.Count == 0) { return new ClusteringResult(); }
        kMeans(batch, Math.Min(mK, batch.Count));
        mDataset = new UnlabeledDataset<SparseVector<double>>(batch);
        foreach (CentroidData centroid in mCentroids) { centroid.Tag = mTopicId++; }
        //OutputState();
    }
    else
    {
        // update clusters
        Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
        if (numOutdated == 0 && batch.Count == 0) { return GetClusteringResult(); }
        mLogger.Trace("Cluster", "Updating clusters ...");
        // assign new instances
        double dummy;
        Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/mDataset.Count, out dummy);
        mDataset.AddRange(batch);
        // remove outdated instances
        foreach (CentroidData centroid in mCentroids)
        {
            foreach (int item in centroid.CurrentItems)
            {
                if (item >= numOutdated) { centroid.Items.Add(item); }
            }
            centroid.Update(mDataset);
            centroid.UpdateCentroidLen();
        }
        mDataset.RemoveRange(0, numOutdated);
        ArrayList<CentroidData> centroidsNew = new ArrayList<CentroidData>(mCentroids.Count);
        foreach (CentroidData centroid in mCentroids)
        {
            if (centroid.CurrentItems.Count > 0)
            {
                centroidsNew.Add(centroid);
                Set<int> tmp = new Set<int>();
                foreach (int idx in centroid.CurrentItems) { tmp.Add(idx - numOutdated); }
                centroid.CurrentItems.Inner.SetItems(tmp);
            }
        }
        if (centroidsNew.Count == 0) // reset
        {
            mCentroids = null;
            mDataset = null;
            return new ClusteringResult();
        }
        mCentroids = centroidsNew;
        // execute main loop
        kMeansMainLoop(mDataset, mCentroids);
        //OutputState();
    }
    // adjust k
    double minQual; // *** not used at the moment
    int minQualIdx;
    double qual = GetClustQual(out minQual, out minQualIdx);
    if (qual < mQualThresh)
    {
        while (qual < mQualThresh) // split cluster at minQualIdx
        {
            mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
            mCentroids.Add(mCentroids[minQualIdx].Clone());
            mCentroids.Last.Tag = mTopicId++;
            kMeansMainLoop(mDataset, mCentroids);
            if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
            {
                // swap topic identifiers
                object tmp = mCentroids.Last.Tag;
                mCentroids.Last.Tag = mCentroids[minQualIdx].Tag;
                mCentroids[minQualIdx].Tag = tmp;
            }
            qual = GetClustQual(out minQual, out minQualIdx);
            //OutputState();
        }
    }
    else if (numOutdated > 0)
    {
        while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
        {
            mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
            ArrayList<CentroidData> centroidsCopy = mCentroids.DeepClone();
            if (mCentroids.Count == 2) // create single cluster
            {
                object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                mCentroids = new ArrayList<CentroidData>();
                mCentroids.Add(new CentroidData());
                for (int i = 0; i < mDataset.Count; i++) { mCentroids.Last.Items.Add(i); }
                mCentroids.Last.Tag = topicId;
                mCentroids.Last.Update(mDataset);
                mCentroids.Last.UpdateCentroidLen();
            }
            else
            {
                int idx1, idx2;
                GetMostSimilarClusters(out idx1, out idx2);
                CentroidData c1 = mCentroids[idx1];
                CentroidData c2 = mCentroids[idx2];
                object topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                mCentroids.RemoveAt(idx2);
                c1.Items.AddRange(c1.CurrentItems);
                c1.Items.AddRange(c2.CurrentItems);
                c1.Tag = topicId;
                c1.Update(mDataset);
                c1.UpdateCentroidLen();
                kMeansMainLoop(mDataset, mCentroids);
            }
            qual = GetClustQual();
            if (qual >= mQualThresh)
            {
                mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
            }
            else
            {
                mCentroids = centroidsCopy;
            }
            //OutputState();
        }
    }
    OutputState();
    return GetClusteringResult();
}
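// Added sketch (not from the library, names hypothetical): the adjust-k policy of Cluster
// above, compressed to its control flow. The real code re-runs the k-means main loop
// after every split or join and keeps topic tags stable; it also guarantees progress by
// always splitting the worst cluster, which this sketch simply assumes of qualityWith.
static class AdaptKSketch
{
    public static int AdaptK(int k, System.Func<int, double> qualityWith, double thresh)
    {
        while (qualityWith(k) < thresh) { k++; }               // too loose: split a cluster
        while (k > 1 && qualityWith(k - 1) >= thresh) { k--; } // still good with fewer: join
        return k;
    }
}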
public UnlabeledDataset<ExT> ToUnlabeledDataset()
{
    return ModelUtils.ConvertToUnlabeledDataset<LblT, ExT>(this);
}
public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    // initialize centroids: one per label, summing that label's examples
    Dictionary<LblT, CentroidData> centroids = new Dictionary<LblT, CentroidData>(mLblCmp);
    foreach (LabeledExample<LblT, SparseVector<double>> labeledExample in dataset)
    {
        if (!centroids.ContainsKey(labeledExample.Label))
        {
            CentroidData centroidData = new CentroidData();
            centroidData.AddToSum(labeledExample.Example);
            centroids.Add(labeledExample.Label, centroidData);
        }
        else
        {
            CentroidData centroidData = centroids[labeledExample.Label];
            centroidData.AddToSum(labeledExample.Example);
        }
    }
    foreach (CentroidData cenData in centroids.Values) { cenData.UpdateCentroidLen(); }
    double learnRate = 1;
    double[][] dotProd = null;
    SparseMatrix<double> dsMtx = null;
    if (mIterations > 0)
    {
        dotProd = new double[centroids.Count][];
        dsMtx = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
    }
    for (int iter = 1; iter <= mIterations; iter++)
    {
        mLogger.Info("Train", "Iteration {0} / {1} ...", iter, mIterations);
        // compute dot products
        mLogger.Info("Train", "Computing dot products ...");
        int j = 0;
        foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", j + 1, centroids.Count);
            SparseVector<double> cenVec = labeledCentroid.Value.GetSparseVector();
            dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
            j++;
        }
        // classify training examples
        mLogger.Info("Train", "Classifying training examples ...");
        int errCount = 0;
        for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
        {
            mLogger.ProgressFast(Logger.Level.Info, /*sender=*/this, "Train", "Example {0} / {1} ...", instIdx + 1, dataset.Count);
            double maxSim = double.MinValue;
            CentroidData assignedCentroid = null;
            CentroidData actualCentroid = null;
            LabeledExample<LblT, SparseVector<double>> labeledExample = dataset[instIdx];
            SparseVector<double> vec = labeledExample.Example;
            int cenIdx = 0;
            foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
            {
                double sim = dotProd[cenIdx][instIdx];
                if (sim > maxSim)
                {
                    maxSim = sim;
                    assignedCentroid = labeledCentroid.Value;
                }
                if (labeledCentroid.Key.Equals(labeledExample.Label)) { actualCentroid = labeledCentroid.Value; }
                cenIdx++;
            }
            if (assignedCentroid != actualCentroid)
            {
                // error-driven update: move the example's weight from the wrongly assigned
                // centroid to the true one
                assignedCentroid.AddToDiff(-learnRate, vec);
                actualCentroid.AddToDiff(learnRate, vec);
                errCount++;
            }
        }
        mLogger.Info("Train", "Training set error rate: {0:0.00}%", (double)errCount / (double)dataset.Count * 100.0);
        // update centroids
        int k = 0;
        foreach (CentroidData centroidData in centroids.Values)
        {
            mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/this, "Train", "Centroid {0} / {1} ...", ++k, centroids.Count);
            centroidData.Update(mPositiveValuesOnly);
            centroidData.UpdateCentroidLen();
        }
        learnRate *= mDamping;
    }
    // store centroids as a transposed matrix for fast prediction
    mCentroidMtxTr = new SparseMatrix<double>();
    mLabels = new ArrayList<LblT>();
    int rowIdx = 0;
    foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
    {
        mCentroidMtxTr[rowIdx++] = labeledCentroid.Value.GetSparseVector();
        mLabels.Add(labeledCentroid.Key);
    }
    mCentroidMtxTr = mCentroidMtxTr.GetTransposedCopy();
}
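// Added illustration (not from the library): the error-driven step inside Train above,
// written for dense vectors with hypothetical names. On a misclassification the example
// is subtracted from the wrongly chosen centroid and added to the true one (cf.
// AddToDiff), with a learning rate that decays by mDamping every epoch.
static class CentroidUpdateSketch
{
    public static void OnError(double[] assigned, double[] actual, double[] example, double learnRate)
    {
        for (int i = 0; i < example.Length; i++)
        {
            assigned[i] -= learnRate * example[i]; // push the wrong centroid away
            actual[i] += learnRate * example[i];   // pull the true centroid closer
        }
    }
}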
public ClusteringResult Cluster(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult best_clustering = null;
    double global_best_clust_qual = 0;
    for (int trial = 1; trial <= m_trials; trial++)
    {
        Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
        ArrayList<SparseVector<double>.ReadOnly> centroids = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < m_k; i++) { clustering.Roots.Add(new Cluster()); }
        // select seed items
        double min_sim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>.ReadOnly> seeds = new ArrayList<SparseVector<double>.ReadOnly>(m_k);
            tmp.Shuffle(m_rnd);
            for (int i = 0; i < m_k; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector<double>.ReadOnly[] { dataset[tmp[i]].Example }, m_centroid_type));
            }
            // assess quality of seed items
            double sim_avg = 0;
            foreach (SparseVector<double>.ReadOnly seed_1 in seeds)
            {
                foreach (SparseVector<double>.ReadOnly seed_2 in seeds)
                {
                    if (seed_1 != seed_2) { sim_avg += m_similarity.GetSimilarity(seed_1, seed_2); }
                }
            }
            sim_avg /= (double)(m_k * m_k - m_k);
            //Console.WriteLine(sim_avg);
            if (sim_avg < min_sim)
            {
                min_sim = sim_avg;
                centroids = seeds;
            }
        }
        // main loop
        int iter = 0;
        double best_clust_qual = 0;
        double clust_qual;
        while (true)
        {
            iter++;
            clust_qual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots) { cluster.Items.Clear(); }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector<double>.ReadOnly example = dataset[i].Example;
                double max_sim = double.MinValue;
                ArrayList<int> candidates = new ArrayList<int>();
                for (int j = 0; j < m_k; j++)
                {
                    SparseVector<double>.ReadOnly centroid = centroids[j];
                    double sim = m_similarity.GetSimilarity(example, centroid);
                    if (sim > max_sim)
                    {
                        max_sim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == max_sim)
                    {
                        candidates.Add(j);
                    }
                }
                if (candidates.Count > 1) { candidates.Shuffle(m_rnd); }
                if (candidates.Count > 0) // *** is this always true?
                {
                    clustering.Roots[candidates[0]].Items.Add(new Pair<double, int>(1, i));
                    clust_qual += max_sim;
                }
            }
            clust_qual /= (double)dataset.Count;
            Utils.VerboseLine("*** Iteration {0} ***", iter);
            Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
            // check if done
            if (iter > 1 && clust_qual - best_clust_qual <= m_eps) { break; }
            best_clust_qual = clust_qual;
            // compute new centroids
            for (int i = 0; i < m_k; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type);
            }
        }
        if (trial == 1 || clust_qual > global_best_clust_qual)
        {
            global_best_clust_qual = clust_qual;
            best_clustering = clustering;
        }
    }
    return best_clustering;
}
private double[][] GetKernel(int rmvFeatIdx)
{
    int numSv = SvmLightLib.GetSupportVectorCount(mModelId);
    // initialize matrix
    double[][] kernel = new double[numSv][];
    // compute linear kernel
    SparseMatrix<double> m = new SparseMatrix<double>();
    for (int i = 0; i < numSv; i++)
    {
        SparseVector<double> sv = GetSupportVector(i);
        m[i] = sv;
    }
    if (rmvFeatIdx >= 0) { m.RemoveColAt(rmvFeatIdx); }
    SparseMatrix<double> mTr = m.GetTransposedCopy();
    for (int i = 0; i < numSv; i++)
    {
        double[] innerProd = ModelUtils.GetDotProductSimilarity(mTr, numSv, m[i]);
        kernel[i] = innerProd;
    }
    // compute non-linear kernel
    switch (mKernelType)
    {
        case SvmLightKernelType.Polynomial:
            for (int row = 0; row < kernel.Length; row++)
            {
                for (int col = 0; col < kernel.Length; col++)
                {
                    kernel[row][col] = Math.Pow(mKernelParamS * kernel[row][col] + mKernelParamC, mKernelParamD);
                }
            }
            break;
        case SvmLightKernelType.RadialBasisFunction:
            double[] diag = new double[kernel.Length];
            for (int i = 0; i < kernel.Length; i++) { diag[i] = kernel[i][i]; } // save diagonal
            for (int row = 0; row < kernel.Length; row++)
            {
                for (int col = 0; col < kernel.Length; col++)
                {
                    kernel[row][col] = Math.Exp(-mKernelParamGamma * (diag[row] + diag[col] - 2.0 * kernel[row][col]));
                }
            }
            break;
        case SvmLightKernelType.Sigmoid:
            for (int row = 0; row < kernel.Length; row++)
            {
                for (int col = 0; col < kernel.Length; col++)
                {
                    kernel[row][col] = Math.Tanh(mKernelParamS * kernel[row][col] + mKernelParamC);
                }
            }
            break;
    }
    return kernel;
}
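// Added illustration (not from the library): the identity exploited by the RBF branch of
// GetKernel above. Squared Euclidean distance can be read off the linear Gram matrix,
// ||x - y||^2 = <x,x> + <y,y> - 2<x,y>, so the Gram matrix plus its diagonal is all the
// RBF kernel needs. Standalone version with hypothetical names.
using System;

static class RbfSketch
{
    public static double[][] FromLinearGram(double[][] gram, double gamma)
    {
        int n = gram.Length;
        double[] diag = new double[n];
        for (int i = 0; i < n; i++) { diag[i] = gram[i][i]; }
        double[][] rbf = new double[n][];
        for (int r = 0; r < n; r++)
        {
            rbf[r] = new double[n];
            for (int c = 0; c < n; c++)
            {
                rbf[r][c] = Math.Exp(-gamma * (diag[r] + diag[c] - 2.0 * gram[r][c]));
            }
        }
        return rbf;
    }
}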
public IDataset<LblT> ConvertDataset(Type new_ex_type, bool move)
{
    Utils.ThrowException(new_ex_type == null ? new ArgumentNullException("new_ex_type") : null);
    if (new_ex_type == typeof(SparseVector<double>))
    {
        Dataset<LblT, SparseVector<double>> new_dataset = new Dataset<LblT, SparseVector<double>>();
        for (int i = 0; i < m_items.Count; i++)
        {
            LabeledExample<LblT, ExT> example = m_items[i];
            new_dataset.Add(example.Label, ModelUtils.ConvertExample<SparseVector<double>>(example.Example));
            if (move) { m_items[i] = new LabeledExample<LblT, ExT>(); }
        }
        if (move) { m_items.Clear(); }
        return new_dataset;
    }
    else if (new_ex_type == typeof(SparseVector<double>.ReadOnly))
    {
        Dataset<LblT, SparseVector<double>.ReadOnly> new_dataset = new Dataset<LblT, SparseVector<double>.ReadOnly>();
        for (int i = 0; i < m_items.Count; i++)
        {
            LabeledExample<LblT, ExT> example = m_items[i];
            new_dataset.Add(example.Label, ModelUtils.ConvertExample<SparseVector<double>.ReadOnly>(example.Example));
            if (move) { m_items[i] = new LabeledExample<LblT, ExT>(); }
        }
        if (move) { m_items.Clear(); }
        return new_dataset;
    }
    else if (new_ex_type == typeof(BinaryVector<int>))
    {
        Dataset<LblT, BinaryVector<int>> new_dataset = new Dataset<LblT, BinaryVector<int>>();
        for (int i = 0; i < m_items.Count; i++)
        {
            LabeledExample<LblT, ExT> example = m_items[i];
            new_dataset.Add(example.Label, ModelUtils.ConvertExample<BinaryVector<int>>(example.Example));
            if (move) { m_items[i] = new LabeledExample<LblT, ExT>(); }
        }
        if (move) { m_items.Clear(); }
        return new_dataset;
    }
    else if (new_ex_type == typeof(BinaryVector<int>.ReadOnly))
    {
        Dataset<LblT, BinaryVector<int>.ReadOnly> new_dataset = new Dataset<LblT, BinaryVector<int>.ReadOnly>();
        for (int i = 0; i < m_items.Count; i++)
        {
            LabeledExample<LblT, ExT> example = m_items[i];
            new_dataset.Add(example.Label, ModelUtils.ConvertExample<BinaryVector<int>.ReadOnly>(example.Example));
            if (move) { m_items[i] = new LabeledExample<LblT, ExT>(); }
        }
        if (move) { m_items.Clear(); }
        return new_dataset;
    }
    //else if (new_ex_type == typeof(SvmFeatureVector))
    //{
    //    Dataset<LblT, SvmFeatureVector> new_dataset = new Dataset<LblT, SvmFeatureVector>();
    //    for (int i = 0; i < m_items.Count; i++)
    //    {
    //        LabeledExample<LblT, ExT> example = m_items[i];
    //        new_dataset.Add(example.Label, ModelUtils.ConvertVector<SvmFeatureVector>(example.Example));
    //        if (move) { m_items[i] = new LabeledExample<LblT, ExT>(); }
    //    }
    //    if (move) { m_items.Clear(); }
    //    return new_dataset;
    //}
    else
    {
        throw new ArgumentNotSupportedException("new_ex_type");
    }
}
public SparseVector<double> ComputeCentroid(IUnlabeledExampleCollection<SparseVector<double>> dataset, CentroidType type)
{
    return ModelUtils.ComputeCentroid(mItems, dataset, type); // throws ArgumentValueException
}
public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    double globalBestClustQual = 0;
    for (int trial = 1; trial <= mTrials; trial++)
    {
        mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
        ArrayList<CentroidData> centroids = new ArrayList<CentroidData>(mK);
        ArrayList<int> bestSeeds = null;
        for (int i = 0; i < mK; i++) { centroids.Add(new CentroidData()); }
        // select seed items
        double minSim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++) { tmp.Add(i); }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
            tmp.Shuffle(mRnd);
            for (int i = 0; i < mK; i++) { seeds.Add(dataset[tmp[i]]); }
            // assess quality of seed items
            double simAvg = 0;
            foreach (SparseVector<double> seed1 in seeds)
            {
                foreach (SparseVector<double> seed2 in seeds)
                {
                    if (seed1 != seed2) { simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2); }
                }
            }
            simAvg /= (double)(mK * mK - mK);
            //Console.WriteLine(simAvg);
            if (simAvg < minSim)
            {
                minSim = simAvg;
                bestSeeds = new ArrayList<int>(mK);
                for (int i = 0; i < mK; i++) { bestSeeds.Add(tmp[i]); }
            }
        }
        for (int i = 0; i < mK; i++)
        {
            centroids[i].Items.Add(bestSeeds[i]);
            centroids[i].Update(dataset);
            centroids[i].UpdateCentroidLen();
        }
        double[][] dotProd = new double[mK][];
        SparseMatrix<double> dsMtx = ModelUtils.GetTransposedMatrix(dataset);
        // main loop
        int iter = 0;
        double bestClustQual = 0;
        double clustQual;
        while (true)
        {
            iter++;
            mLogger.Info("Cluster", "Iteration {0} ...", iter);
            clustQual = 0;
            // assign items to clusters
            int j = 0;
            foreach (CentroidData cen in centroids)
            {
                SparseVector<double> cenVec = cen.GetSparseVector();
                dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                j++;
            }
            for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
            {
                double maxSim = double.MinValue;
                ArrayList<int> candidates = new ArrayList<int>();
                for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                {
                    double sim = dotProd[cenIdx][instIdx];
                    if (sim > maxSim)
                    {
                        maxSim = sim;
                        candidates.Clear();
                        candidates.Add(cenIdx);
                    }
                    else if (sim == maxSim)
                    {
                        candidates.Add(cenIdx);
                    }
                }
                if (candidates.Count > 1) { candidates.Shuffle(mRnd); }
                if (candidates.Count > 0) // *** is this always true?
                {
                    centroids[candidates[0]].Items.Add(instIdx);
                    clustQual += maxSim;
                }
            }
            clustQual /= (double)dataset.Count;
            mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
            // check if done
            if (iter > 1 && clustQual - bestClustQual <= mEps) { break; }
            bestClustQual = clustQual;
            // compute new centroids
            for (int i = 0; i < mK; i++)
            {
                centroids[i].Update(dataset);
                centroids[i].UpdateCentroidLen();
            }
        }
        if (trial == 1 || clustQual > globalBestClustQual)
        {
            globalBestClustQual = clustQual;
            // save the result
            clustering = new ClusteringResult();
            for (int i = 0; i < mK; i++)
            {
                clustering.AddRoot(new Cluster());
                clustering.Roots.Last.Items.AddRange(centroids[i].Items);
            }
        }
    }
    return clustering;
}