/// <summary>
/// Keeps at most <paramref name="size"/> components of a sparse vector, chosen by weight, and then passes
/// the result through <c>ModelUtils.CutLowWeights</c> and <c>ModelUtils.TryNrmVecL2</c>.
/// </summary>
/// <param name="vec">Vector to trim. When it holds no more than <paramref name="size"/> items it is not
/// copied, so the two post-processing calls receive this very instance.</param>
/// <param name="size">Maximum number of components to keep.</param>
/// <param name="cutPerc">Forwarded unchanged to <c>ModelUtils.CutLowWeights</c>.</param>
/// <returns>The trimmed vector.</returns>
public SparseVector<double> Trim(SparseVector<double> vec, int size, double cutPerc)
        {
            SparseVector<double> result = vec;

            if (vec.Count > size)
            {
                // re-key every component by its weight (key = weight, dat = index) and order with DescSort
                ArrayList<KeyDat<double, int>> byWeight = new ArrayList<KeyDat<double, int>>(vec.Count);
                foreach (IdxDat<double> component in vec)
                {
                    byWeight.Add(new KeyDat<double, int>(component.Dat, component.Idx));
                }
                byWeight.Sort(DescSort<KeyDat<double, int>>.Instance);
                // only the first `size` entries of the ordered list survive
                byWeight.RemoveRange(size, byWeight.Count - size);
                // turn the survivors back into (index, weight) pairs and sort them again before
                // handing them to the SparseVector constructor
                ArrayList<IdxDat<double>> survivors = new ArrayList<IdxDat<double>>();
                for (int n = 0; n < byWeight.Count; n++)
                {
                    KeyDat<double, int> entry = byWeight[n];
                    survivors.Add(new IdxDat<double>(entry.Dat, entry.Key));
                }
                survivors.Sort();
                result = new SparseVector<double>(survivors);
            }
            ModelUtils.CutLowWeights(ref result, cutPerc);
            ModelUtils.TryNrmVecL2(result);
            return result;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a matrix with one row per distinct label. Each row is the first converted example seen for
        /// that label, merged (with a <c>SumOperator</c>) with every later example carrying the same label.
        /// </summary>
        /// <param name="dataset">Labeled binary examples; the collection is enumerated twice.</param>
        /// <param name="idx_to_lbl">Receives the distinct labels in order of first appearance, so element
        /// <c>i</c> is the label that owns matrix row <c>i</c>.</param>
        /// <returns>The per-label matrix.</returns>
        private static SparseMatrix<double> CreateObservationMatrix<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, ref LblT[] idx_to_lbl)
        {
            SparseMatrix<double> observations = new SparseMatrix<double>();
            ArrayList<LblT> labels = new ArrayList<LblT>();
            Dictionary<LblT, int> rowOfLabel = new Dictionary<LblT, int>();

            // pass 1: give every label a row number in order of first appearance
            foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> entry in dataset)
            {
                if (rowOfLabel.ContainsKey(entry.Label))
                {
                    continue;
                }
                rowOfLabel.Add(entry.Label, rowOfLabel.Count);
                labels.Add(entry.Label);
            }

            // pass 2: accumulate the examples into their label's row
            int processed = 0;
            foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> entry in dataset)
            {
                processed++;
                Utils.Verbose("{0} / {1}\r", processed, dataset.Count);
                int row = rowOfLabel[entry.Label];
                bool rowExists = observations.ContainsRowAt(row);
                SparseVector<double> converted = ModelUtils.ConvertExample<SparseVector<double>>(entry.Example);
                if (rowExists)
                {
                    // fold the row accumulated so far into the freshly converted vector
                    converted.Merge(observations[row], new SumOperator());
                }
                observations[row] = converted;
            }
            Utils.VerboseLine("");
            idx_to_lbl = labels.ToArray();
            return observations;
        }
Exemplo n.º 3
0
        // *** ISimilarity<SparseVector<double>.ReadOnly> interface implementation ***

        /// <summary>
        /// Returns the dot product of two sparse vectors divided by the product of their lengths as reported
        /// by <c>ModelUtils.GetVecLenL2</c>.
        /// </summary>
        /// <param name="a">First vector; null, empty and zero-length values are reported through <c>Utils.ThrowException</c>.</param>
        /// <param name="b">Second vector; validated the same way as <paramref name="a"/>.</param>
        /// <returns><c>dot(a, b) / (len(a) * len(b))</c>.</returns>
        public double GetSimilarity(SparseVector<double>.ReadOnly a, SparseVector<double>.ReadOnly b)
        {
            Utils.ThrowException(a == null ? new ArgumentNullException("a") : null);
            Utils.ThrowException(b == null ? new ArgumentNullException("b") : null);
            int countA = a.Count;
            Utils.ThrowException(countA == 0 ? new ArgumentValueException("a") : null);
            int countB = b.Count;
            Utils.ThrowException(countB == 0 ? new ArgumentValueException("b") : null);

            ArrayList<int> indicesA = a.Inner.InnerIdx;
            ArrayList<double> valuesA = a.Inner.InnerDat;
            ArrayList<int> indicesB = b.Inner.InnerIdx;
            ArrayList<double> valuesB = b.Inner.InnerDat;

            int posA = 0;
            int posB = 0;
            // the count checks are repeated here exactly as in the original code
            int currentA = countA == 0 ? 0 : indicesA[0];
            int currentB = countB == 0 ? 0 : indicesB[0];
            double sum = 0;

            // merge walk over both index lists; assumes each list is sorted ascending -- TODO confirm
            while (true)
            {
                if (currentA == currentB)
                {
                    // index present in both vectors: contributes to the dot product
                    sum += valuesA[posA] * valuesB[posB];
                    posA++;
                    if (posA == countA)
                    {
                        break;
                    }
                    posB++;
                    if (posB == countB)
                    {
                        break;
                    }
                    currentA = indicesA[posA];
                    currentB = indicesB[posB];
                }
                else if (currentA < currentB)
                {
                    posA++;
                    if (posA == countA)
                    {
                        break;
                    }
                    currentA = indicesA[posA];
                }
                else
                {
                    posB++;
                    if (posB == countB)
                    {
                        break;
                    }
                    currentB = indicesB[posB];
                }
            }

            double lengthA = ModelUtils.GetVecLenL2(a);
            Utils.ThrowException(lengthA == 0 ? new ArgumentValueException("a") : null);
            double lengthB = ModelUtils.GetVecLenL2(b);
            Utils.ThrowException(lengthB == 0 ? new ArgumentValueException("b") : null);
            double denominator = lengthA * lengthB;
            return sum / denominator;
        }
Exemplo n.º 4
0
        // *** ISimilarity<SparseVector<double>.ReadOnly> interface implementation ***

        /// <summary>
        /// Computes <c>dot(a, b) / (len(a) * len(b))</c> for two sparse vectors, where the lengths come from
        /// <c>ModelUtils.GetVecLenL2</c>.
        /// </summary>
        /// <param name="a">Left operand; a null, empty or zero-length vector is handed to <c>Utils.ThrowException</c>.</param>
        /// <param name="b">Right operand; checked in the same manner.</param>
        /// <returns>The normalized dot product of the two vectors.</returns>
        public double GetSimilarity(SparseVector<double>.ReadOnly a, SparseVector<double>.ReadOnly b)
        {
            Utils.ThrowException(a == null ? new ArgumentNullException("a") : null);
            Utils.ThrowException(b == null ? new ArgumentNullException("b") : null);
            int sizeA = a.Count;
            Utils.ThrowException(sizeA == 0 ? new ArgumentValueException("a") : null);
            int sizeB = b.Count;
            Utils.ThrowException(sizeB == 0 ? new ArgumentValueException("b") : null);

            ArrayList<int> idxListA = a.Inner.InnerIdx;
            ArrayList<double> datListA = a.Inner.InnerDat;
            ArrayList<int> idxListB = b.Inner.InnerIdx;
            ArrayList<double> datListB = b.Inner.InnerDat;

            int cursorA = 0, cursorB = 0;
            int headA = sizeA == 0 ? 0 : idxListA[0];
            int headB = sizeB == 0 ? 0 : idxListB[0];
            double accumulated = 0;

            // advance whichever cursor points at the smaller index; matching indices add a product term.
            // Relies on both index lists being in ascending order -- TODO confirm.
            for (;;)
            {
                if (headA > headB)
                {
                    cursorB++;
                    if (cursorB == sizeB)
                    {
                        break;
                    }
                    headB = idxListB[cursorB];
                }
                else if (headA < headB)
                {
                    cursorA++;
                    if (cursorA == sizeA)
                    {
                        break;
                    }
                    headA = idxListA[cursorA];
                }
                else
                {
                    accumulated += datListA[cursorA] * datListB[cursorB];
                    // a's cursor moves first; b's cursor only moves if a still has items left
                    cursorA++;
                    if (cursorA == sizeA)
                    {
                        break;
                    }
                    cursorB++;
                    if (cursorB == sizeB)
                    {
                        break;
                    }
                    headA = idxListA[cursorA];
                    headB = idxListB[cursorB];
                }
            }

            double normA = ModelUtils.GetVecLenL2(a);
            Utils.ThrowException(normA == 0 ? new ArgumentValueException("a") : null);
            double normB = ModelUtils.GetVecLenL2(b);
            Utils.ThrowException(normB == 0 ? new ArgumentValueException("b") : null);
            double normProduct = normA * normB;
            return accumulated / normProduct;
        }
        /// <summary>
        /// Merges the clusters at positions <paramref name="idx1"/> and <paramref name="idx2"/> into a new parent
        /// cluster, replaces them with the parent in <paramref name="clusters"/>, and updates the similarity matrix
        /// and the transposed cluster matrix accordingly.
        /// </summary>
        /// <param name="simMtx">Cluster-to-cluster similarity matrix; the rows and columns of the merged clusters are purged and a column for the parent is appended.</param>
        /// <param name="clustMtxTr">Matrix indexed by the centroid's component index, with one entry per cluster; modified in place.</param>
        /// <param name="numClusters">Passed by value; presumably the number of clusters before the merge -- TODO confirm against the caller.</param>
        /// <param name="idx1">Position of the first cluster; must be smaller than <paramref name="idx2"/> (asserted in debug builds).</param>
        /// <param name="idx2">Position of the second cluster.</param>
        /// <param name="clusters">Current cluster list; modified in place.</param>
        /// <param name="dataset">Examples from which the parent's centroid is computed.</param>
        /// <param name="damping">Base of the power by which the new similarities are scaled.</param>
        private void Update(SparseMatrix <double> simMtx, SparseMatrix <double> clustMtxTr, int numClusters, int idx1, int idx2, ArrayList <Cluster> clusters,
                            IUnlabeledExampleCollection <SparseVector <double> > dataset, double damping)
        {
            Debug.Assert(idx1 < idx2);
            // create new parent
            Cluster c1     = clusters[idx1];
            Cluster c2     = clusters[idx2];
            Cluster parent = new Cluster();

            parent.Items.AddRange(c1.Items);
            parent.Items.AddRange(c2.Items);
            // ClusterInfo is read as an int: the parent gets one more than the larger of its children's values
            parent.ClusterInfo = Math.Max((int)c1.ClusterInfo, (int)c2.ClusterInfo) + 1;
            c1.Parent          = parent;
            c2.Parent          = parent;
            parent.AddChild(c1);
            parent.AddChild(c2);
            SparseVector <double> centroid = ModelUtils.ComputeCentroid(parent.Items, dataset, CentroidType.NrmL2);

            // hard-coded trimming parameters: size = 1000, cutPerc = 0.8
            centroid = Trim(centroid, 1000, 0.8);
            // remove clusters
            // (the higher index goes first so that idx1 still refers to the same element afterwards)
            clusters.RemoveAt(idx2);
            clusters.RemoveAt(idx1);
            // add new parent
            clusters.Add(parent);
            // remove rows at idx1 and idx2
            simMtx.PurgeRowAt(idx2);
            simMtx.PurgeRowAt(idx1);
            // remove cols at idx1 and idx2
            simMtx.PurgeColAt(idx2);
            simMtx.PurgeColAt(idx1);
            clustMtxTr.PurgeColAt(idx2);
            clustMtxTr.PurgeColAt(idx1);
            // update matrices
            // numClusters is a by-value parameter, so this adjusts the local copy only; from here on it is
            // used as the column index of the new parent
            numClusters -= 2;
            foreach (IdxDat <double> item in centroid)
            {
                if (clustMtxTr[item.Idx] == null)
                {
                    clustMtxTr[item.Idx] = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(numClusters, item.Dat) });
                }
                else
                {
                    // appended straight to the inner lists; assumes numClusters is larger than every index
                    // already stored in this row -- TODO confirm
                    clustMtxTr[item.Idx].InnerIdx.Add(numClusters);
                    clustMtxTr[item.Idx].InnerDat.Add(item.Dat);
                }
            }
            double[] simVec = ModelUtils.GetDotProductSimilarity(clustMtxTr, numClusters + 1, centroid);
            // scale each similarity by damping^((parent info + other cluster info) / 2);
            // indexing clusters[i] assumes simVec is no longer than the cluster list -- TODO confirm
            for (int i = 0; i < simVec.Length; i++)
            {
                simVec[i] *= Math.Pow(damping, (double)((int)parent.ClusterInfo + (int)clusters[i].ClusterInfo) / 2.0);
            }
            // wrap the similarities as a one-row matrix, transpose it and append it to simMtx at numClusters
            SparseMatrix <double> col = new SparseMatrix <double>();

            col[0] = new SparseVector <double>(simVec);
            simMtx.AppendCols(col.GetTransposedCopy(), numClusters);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Scores <paramref name="example"/> against every stored training example, lets the top <c>mK</c>
        /// entries (ordered with <c>DescSort</c>) vote for their labels, and returns the labels ordered by
        /// accumulated vote.
        /// </summary>
        /// <param name="example">Example to classify; a null value is reported through <c>Utils.ThrowException</c>.</param>
        /// <returns>One entry per label that received a vote. With <c>mSoftVoting</c> the vote is the
        /// similarity score; otherwise every vote counts 1.</returns>
        public Prediction<LblT> Predict(SparseVector<double> example)
        {
            Utils.ThrowException(mDatasetMtx == null ? new InvalidOperationException() : null);
            Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);
            ArrayList<KeyDat<double, LblT>> neighbors = new ArrayList<KeyDat<double, LblT>>(mLabels.Count);

            // pair every training example's similarity with its label (key = similarity, dat = label)
            double[] similarities = ModelUtils.GetDotProductSimilarity(mDatasetMtx, mLabels.Count, example);
            for (int idx = 0; idx < mLabels.Count; idx++)
            {
                neighbors.Add(new KeyDat<double, LblT>(similarities[idx], mLabels[idx]));
            }
            neighbors.Sort(DescSort<KeyDat<double, LblT>>.Instance);

            Dictionary<LblT, double> tally = new Dictionary<LblT, double>(mLblCmp);
            int voters = Math.Min(mK, neighbors.Count);
            double soFar;

            for (int rank = 0; rank < voters; rank++)
            {
                KeyDat<double, LblT> neighbor = neighbors[rank];
                // "soft" voting weighs a vote by similarity; normal voting counts heads
                double vote = mSoftVoting ? neighbor.Key : 1;
                if (tally.TryGetValue(neighbor.Dat, out soFar))
                {
                    tally[neighbor.Dat] = soFar + vote;
                }
                else
                {
                    tally.Add(neighbor.Dat, vote);
                }
            }

            Prediction<LblT> outcome = new Prediction<LblT>();
            foreach (KeyValuePair<LblT, double> labelVotes in tally)
            {
                outcome.Inner.Add(new KeyDat<double, LblT>(labelVotes.Value, labelVotes.Key));
            }
            outcome.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
            return outcome;
        }
Exemplo n.º 7
0
 /// <summary>
 /// Stores the training data: the examples as a transposed matrix in <c>mDatasetMtx</c> and their labels,
 /// in enumeration order, in <c>mLabels</c>.
 /// </summary>
 /// <param name="dataset">Labeled training examples; null and empty collections are reported through
 /// <c>Utils.ThrowException</c>.</param>
 public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);

     // the matrix is replaced first; the label list is rebuilt only after that succeeded
     mDatasetMtx = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));

     mLabels = new ArrayList<LblT>();
     foreach (LabeledExample<LblT, SparseVector<double>> trainingItem in dataset)
     {
         mLabels.Add(trainingItem.Label);
     }
 }
Exemplo n.º 8
0
        /// <summary>
        /// Scores a binary example against every row of <paramref name="lambdas"/>: the score of a row is
        /// <c>exp</c> of the value returned by <c>DotProductSimilarity.GetSimilarity</c> for that row and the
        /// converted example.
        /// </summary>
        /// <param name="bin_vec">Example to classify; converted to a <c>SparseVector&lt;double&gt;</c> first.</param>
        /// <param name="lambdas">Matrix whose row index selects the label in <paramref name="idx_to_lbl"/>.</param>
        /// <param name="idx_to_lbl">Maps a row index of <paramref name="lambdas"/> to its label.</param>
        /// <returns>A <c>ClassifierResult</c> built from the (score, label) pairs.</returns>
        public static ClassifierResult<LblT> Classify<LblT>(BinaryVector<int>.ReadOnly bin_vec, SparseMatrix<double>.ReadOnly lambdas, LblT[] idx_to_lbl)
        {
            DotProductSimilarity similarity = new DotProductSimilarity();
            SparseVector<double> features = ModelUtils.ConvertExample<SparseVector<double>>(bin_vec);
            ArrayList<KeyDat<double, LblT>> labelScores = new ArrayList<KeyDat<double, LblT>>();

            foreach (IdxDat<SparseVector<double>.ReadOnly> lambdaRow in lambdas)
            {
                double activation = similarity.GetSimilarity(lambdaRow.Dat, features);
                double rowScore = Math.Exp(activation);
                LblT label = idx_to_lbl[lambdaRow.Idx];
                labelScores.Add(new KeyDat<double, LblT>(rowScore, label));
            }
            // NOTE: an earlier variant inlined the dot product as a merge walk over bin_vec and each row
            // (summing the row values at matching indices) and sorted the result with DescSort; the original
            // author recorded that it was, for some reason, slower than the code above.
            return new ClassifierResult<LblT>(labelScores);
        }
Exemplo n.º 9
0
        /// <summary>
        /// Scores <paramref name="example"/> against the stored centroid matrix and returns every label paired
        /// with its score, ordered with <c>DescSort</c>.
        /// </summary>
        /// <param name="example">Example to classify; a null value is reported through <c>Utils.ThrowException</c>.</param>
        /// <returns>One (score, label) entry per element of the similarity vector.</returns>
        public Prediction<LblT> Predict(SparseVector<double> example)
        {
            Utils.ThrowException(mCentroidMtxTr == null ? new InvalidOperationException() : null);
            Utils.ThrowException(example == null ? new ArgumentNullException("example") : null);
            Prediction<LblT> ranking = new Prediction<LblT>();

            double[] centroidScores = ModelUtils.GetDotProductSimilarity(mCentroidMtxTr, mLabels.Count, example);
            int labelIdx = 0;
            foreach (double centroidScore in centroidScores)
            {
                // the n-th score belongs to the n-th label
                ranking.Inner.Add(new KeyDat<double, LblT>(centroidScore, mLabels[labelIdx]));
                labelIdx++;
            }
            ranking.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
            return ranking;
        }
Exemplo n.º 10
0
        /// <summary>
        /// Computes the centroid of <paramref name="dataset"/> and returns the arithmetic mean of the values
        /// that <c>ModelUtils.GetDotProductSimilarity</c> reports for the dataset and that centroid.
        /// </summary>
        /// <param name="dataset">Examples forming the cluster.</param>
        /// <param name="centroid">Receives the centroid computed with <c>CentroidType.NrmL2</c>.</param>
        /// <returns>The mean similarity; the division is not guarded against an empty similarity array.</returns>
        private double GetClusterQuality(IUnlabeledExampleCollection<SparseVector<double>> dataset, out SparseVector<double> centroid)
        {
            centroid = ModelUtils.ComputeCentroid(dataset, CentroidType.NrmL2);
            double[] similarities = ModelUtils.GetDotProductSimilarity(dataset, centroid);

            double total = 0;
            foreach (double similarity in similarities)
            {
                total += similarity;
            }
            return total / (double)similarities.Length;
        }
Exemplo n.º 11
0
        /// <summary>
        /// Converts every example to <paramref name="newExType"/> and wraps the results in a new unlabeled
        /// dataset of the matching element type.
        /// </summary>
        /// <param name="newExType">Target example type. Supported wrappers: <c>SparseVector&lt;double&gt;</c>,
        /// <c>SparseVector&lt;double&gt;.ReadOnly</c>, <c>BinaryVector</c> and <c>BinaryVector.ReadOnly</c>.</param>
        /// <param name="move">When true, each source slot is reset right after its example was converted and
        /// the source list is cleared once all conversions are done. Not allowed when <c>ExT</c> is a value type.</param>
        /// <returns>The converted dataset.</returns>
        /// <exception cref="ArgumentNotSupportedException">No dataset wrapper exists for <paramref name="newExType"/>;
        /// note that this is detected only after the examples were converted (and moved).</exception>
        public IUnlabeledDataset ConvertDataset(Type newExType, bool move)
        {
            Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);
            Utils.ThrowException(move && typeof(ExT).IsValueType ? new ArgumentValueException("newExType") : null);
            ArrayList<object> converted = new ArrayList<object>(mItems.Count);

            for (int pos = 0; pos < mItems.Count; pos++)
            {
                converted.Add(ModelUtils.ConvertExample(mItems[pos], newExType)); // throws ArgumentValueException
                if (move)
                {
                    // *** the second check above guarantees ExT is a reference type here, so default(ExT) is null
                    mItems[pos] = default(ExT);
                }
            }
            if (move)
            {
                mItems.Clear();
            }

            if (newExType == typeof(SparseVector<double>))
            {
                return new UnlabeledDataset<SparseVector<double>>(converted);
            }
            if (newExType == typeof(SparseVector<double>.ReadOnly))
            {
                return new UnlabeledDataset<SparseVector<double>.ReadOnly>(converted);
            }
            if (newExType == typeof(BinaryVector))
            {
                return new UnlabeledDataset<BinaryVector>(converted);
            }
            if (newExType == typeof(BinaryVector.ReadOnly))
            {
                return new UnlabeledDataset<BinaryVector.ReadOnly>(converted);
            }
            throw new ArgumentNotSupportedException("newExType");
        }
Exemplo n.º 12
0
        /// <summary>
        /// Assigns each of the <paramref name="instCount"/> instances to the centroid with the highest
        /// similarity and reports the mean of those highest similarities.
        /// </summary>
        /// <param name="centroids">Centroids; the chosen centroid's <c>Items</c> list receives the instance index.</param>
        /// <param name="dataMtx">Data matrix passed to <c>ModelUtils.GetDotProductSimilarity</c>.</param>
        /// <param name="instCount">Number of instances to assign.</param>
        /// <param name="offs">Offset added to every instance index before it is stored.</param>
        /// <param name="clustQual">Receives the sum of the winning similarities divided by <paramref name="instCount"/>.</param>
        internal void Assign(ArrayList<CentroidData> centroids, SparseMatrix<double> dataMtx, int instCount, int offs, out double clustQual)
        {
            int centroidCount = centroids.Count;
            double[][] simByCentroid = new double[centroidCount][];

            clustQual = 0;

            // one similarity vector per centroid, covering all instances
            int slot = 0;
            foreach (CentroidData centroid in centroids)
            {
                SparseVector<double> centroidVec = centroid.GetSparseVector();
                simByCentroid[slot] = ModelUtils.GetDotProductSimilarity(dataMtx, instCount, centroidVec);
                slot++;
            }

            for (int inst = 0; inst < instCount; inst++)
            {
                double best = double.MinValue;
                ArrayList<int> winners = new ArrayList<int>();
                for (int c = 0; c < centroidCount; c++)
                {
                    double sim = simByCentroid[c][inst];
                    if (sim == best)
                    {
                        // tie with the current best: remember this centroid as well
                        winners.Add(c);
                    }
                    else if (sim > best)
                    {
                        best = sim;
                        winners.Clear();
                        winners.Add(c);
                    }
                }
                // ties are broken by shuffling the tied centroids with mRnd
                if (winners.Count > 1)
                {
                    winners.Shuffle(mRnd);
                }
                centroids[winners[0]].Items.Add(inst + offs);
                clustQual += best;
            }
            clustQual /= (double)instCount;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Converts the example of every labeled item to <paramref name="newExType"/>, keeping its label, and
        /// wraps the results in a new labeled dataset of the matching example type.
        /// </summary>
        /// <param name="newExType">Target example type. Supported wrappers: <c>SparseVector&lt;double&gt;</c>,
        /// <c>SparseVector&lt;double&gt;.ReadOnly</c>, <c>BinaryVector</c> and <c>BinaryVector.ReadOnly</c>.</param>
        /// <param name="move">When true, each source slot is set to null right after its example was converted
        /// and the source list is cleared once all conversions are done.</param>
        /// <returns>The converted dataset.</returns>
        /// <exception cref="ArgumentNotSupportedException">No dataset wrapper exists for <paramref name="newExType"/>;
        /// note that this is detected only after the examples were converted (and moved).</exception>
        public ILabeledDataset<LblT> ConvertDataset(Type newExType, bool move)
        {
            Utils.ThrowException(newExType == null ? new ArgumentNullException("newExType") : null);
            ArrayList<LabeledExample<LblT, object>> converted = new ArrayList<LabeledExample<LblT, object>>(mItems.Count);

            for (int pos = 0; pos < mItems.Count; pos++)
            {
                converted.Add(new LabeledExample<LblT, object>(mItems[pos].Label, ModelUtils.ConvertExample(mItems[pos].Example, newExType))); // throws ArgumentValueException
                if (move)
                {
                    mItems[pos] = null;
                }
            }
            if (move)
            {
                mItems.Clear();
            }

            if (newExType == typeof(SparseVector<double>))
            {
                return new LabeledDataset<LblT, SparseVector<double>>(converted);
            }
            if (newExType == typeof(SparseVector<double>.ReadOnly))
            {
                return new LabeledDataset<LblT, SparseVector<double>.ReadOnly>(converted);
            }
            if (newExType == typeof(BinaryVector))
            {
                return new LabeledDataset<LblT, BinaryVector>(converted);
            }
            if (newExType == typeof(BinaryVector.ReadOnly))
            {
                return new LabeledDataset<LblT, BinaryVector.ReadOnly>(converted);
            }
            throw new ArgumentNotSupportedException("newExType");
        }
Exemplo n.º 14
0
        /// <summary>
        /// Converts every example of <paramref name="dataset"/> into a row of a sparse matrix (in enumeration
        /// order) and returns the transposed copy of that matrix.
        /// </summary>
        /// <param name="dataset">Labeled binary examples.</param>
        /// <param name="clearDataset">When true, each source example is cleared right after it was converted.</param>
        /// <returns>The transposed matrix.</returns>
        private static SparseMatrix<double> TransposeDataset<LblT>(ILabeledExampleCollection<LblT, BinaryVector> dataset, bool clearDataset)
        {
            SparseMatrix<double> rows = new SparseMatrix<double>();
            int nextRow = 0;

            foreach (LabeledExample<LblT, BinaryVector> entry in dataset)
            {
                rows[nextRow] = ModelUtils.ConvertExample<SparseVector<double>>(entry.Example);
                nextRow++;
                if (clearDataset)
                {
                    entry.Example.Clear();
                }
            }
            return rows.GetTransposedCopy();
        }
Exemplo n.º 15
0
        /// <summary>
        /// Converts every example of <paramref name="dataset"/> into a row of a sparse matrix (in enumeration
        /// order) and returns the transposed copy of that matrix.
        /// </summary>
        /// <param name="dataset">Labeled read-only binary examples.</param>
        /// <param name="clear_dataset">When true, the inner vector of each read-only example is cleared right
        /// after it was converted.</param>
        /// <returns>The transposed matrix.</returns>
        private static SparseMatrix<double> TransposeDataset<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, bool clear_dataset)
        {
            SparseMatrix<double> rows = new SparseMatrix<double>();
            int nextRow = 0;

            foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> entry in dataset)
            {
                rows[nextRow] = ModelUtils.ConvertExample<SparseVector<double>>(entry.Example);
                nextRow++;
                if (clear_dataset)
                {
                    // *** the read-only wrapper is bypassed on purpose: clearing the inner vector saves space
                    entry.Example.Inner.Clear();
                }
            }
            return rows.GetTransposedCopy();
        }
Exemplo n.º 16
0
        /// <summary>
        /// Scores a binary example against every row of <paramref name="lambdas"/> (score = <c>exp</c> of the
        /// value returned by <c>DotProductSimilarity.Instance.GetSimilarity</c>), optionally divides the scores
        /// by their sum, and returns them ordered with <c>DescSort</c>.
        /// </summary>
        /// <param name="binVec">Example to classify; converted to a <c>SparseVector&lt;double&gt;</c> first.</param>
        /// <param name="lambdas">Matrix whose row index selects the label in <paramref name="idxToLbl"/>.</param>
        /// <param name="idxToLbl">Maps a row index of <paramref name="lambdas"/> to its label.</param>
        /// <param name="normalize">When true and the score sum is positive, every score is divided by the sum.</param>
        /// <returns>The (score, label) pairs.</returns>
        public static Prediction<LblT> Classify<LblT>(BinaryVector binVec, SparseMatrix<double>.ReadOnly lambdas, LblT[] idxToLbl, bool normalize)
        {
            SparseVector<double> features = ModelUtils.ConvertExample<SparseVector<double>>(binVec);
            Prediction<LblT> prediction = new Prediction<LblT>();
            double total = 0;

            foreach (IdxDat<SparseVector<double>.ReadOnly> lambdaRow in lambdas)
            {
                double activation = DotProductSimilarity.Instance.GetSimilarity(lambdaRow.Dat, features);
                double rowScore = Math.Exp(activation);
                prediction.Inner.Add(new KeyDat<double, LblT>(rowScore, idxToLbl[lambdaRow.Idx]));
                total += rowScore;
            }

            if (normalize && total > 0)
            {
                // replace each entry with one whose key is the score's share of the total
                for (int pos = 0; pos < prediction.Count; pos++)
                {
                    KeyDat<double, LblT> raw = prediction[pos];
                    prediction.Inner[pos] = new KeyDat<double, LblT>(raw.Key / total, raw.Dat);
                }
            }
            prediction.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
            return prediction;
        }
Exemplo n.º 17
0
        /// <summary>
        /// Groups the training examples by label (using <c>m_lbl_cmp</c> to compare labels) and stores one
        /// centroid per label in <c>m_centroids</c>.
        /// </summary>
        /// <param name="dataset">Labeled training examples; null and empty collections are reported through
        /// <c>Utils.ThrowException</c>.</param>
        public void Train(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
            m_centroids = new ArrayList<Pair<LblT, SparseVector<double>.ReadOnly>>();
            Dictionary<LblT, ArrayList<SparseVector<double>.ReadOnly>> examplesByLabel = new Dictionary<LblT, ArrayList<SparseVector<double>.ReadOnly>>(m_lbl_cmp);

            // bucket the examples by label
            foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> entry in dataset)
            {
                ArrayList<SparseVector<double>.ReadOnly> bucket;
                if (!examplesByLabel.TryGetValue(entry.Label, out bucket))
                {
                    bucket = new ArrayList<SparseVector<double>.ReadOnly>();
                    examplesByLabel.Add(entry.Label, bucket);
                }
                bucket.Add(entry.Example);
            }

            // one centroid per bucket; m_normalize selects the L2-normalized variant over the plain average
            CentroidType centroidType = m_normalize ? CentroidType.NrmL2 : CentroidType.Avg;
            foreach (KeyValuePair<LblT, ArrayList<SparseVector<double>.ReadOnly>> labelBucket in examplesByLabel)
            {
                SparseVector<double> centroid = ModelUtils.ComputeCentroid(labelBucket.Value, centroidType);
                m_centroids.Add(new Pair<LblT, SparseVector<double>.ReadOnly>(labelBucket.Key, centroid));
            }
        }
Exemplo n.º 18
0
        /// <summary>
        /// Alternates between assigning the instances to centroids (<c>Assign</c>) and updating the centroids
        /// (<c>Update</c>) until the clustering quality improves by no more than <c>mEps</c> over the previous
        /// iteration. At least two iterations are always performed.
        /// </summary>
        /// <param name="dataset">Instances to cluster; transposed once up front.</param>
        /// <param name="centroids">Centroids, handed to <c>Assign</c> and <c>Update</c> on every iteration.</param>
        /// <param name="clustQual">Receives the quality reported by <c>Assign</c> in the last iteration.</param>
        internal void kMeansMainLoop(IUnlabeledExampleCollection<SparseVector<double>> dataset, ArrayList<CentroidData> centroids, out double clustQual)
        {
            // NOTE: the original also allocated a double[centroids.Count][] that was never read; it is dropped.
            SparseMatrix<double> dataMtx = ModelUtils.GetTransposedMatrix(dataset);
            int iter = 0;
            // quality of the previous iteration (not the best seen so far)
            double prevClustQual = 0;

            while (true)
            {
                iter++;
                mLogger.Trace("Cluster", "Iteration {0} ...", iter);
                // assign items to clusters
                Assign(centroids, dataMtx, dataset.Count, /*offs=*/ 0, out clustQual);
                mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
                // update centroids
                Update(dataset, centroids);
                // check if done: the first iteration has nothing to compare against, so it never stops the loop
                if (iter > 1 && clustQual - prevClustQual <= mEps)
                {
                    break;
                }
                prevClustQual = clustQual;
            }
        }
Exemplo n.º 19
0
        /// <summary>
        /// Runs k-means <c>mTrials</c> times on <paramref name="dataset"/> and returns the clustering with the
        /// highest final quality (mean similarity of the instances to their assigned centroid).
        /// </summary>
        /// <remarks>
        /// Each trial draws three random seed sets and keeps the one with the lowest average pairwise
        /// similarity, then alternates assignment and centroid recomputation until the quality improves by no
        /// more than <c>mEps</c> over the previous iteration.
        /// </remarks>
        /// <param name="dataset">Instances to cluster; null collections and collections with fewer than
        /// <c>mK</c> instances are reported through <c>Utils.ThrowException</c>.</param>
        /// <returns>The best clustering found, or null when <c>mTrials</c> is less than 1.</returns>
        public ClusteringResult Cluster(IUnlabeledExampleCollection<SparseVector<double>> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering = null;
            ClusteringResult bestClustering = null;
            double globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Trace("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList<SparseVector<double>> centroids = null;
                clustering = new ClusteringResult();
                for (int i = 0; i < mK; i++)
                {
                    clustering.AddRoot(new Cluster());
                }
                // select seed items
                double minSim = double.MaxValue;
                ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(ModelUtils.ComputeCentroid(new SparseVector<double>[] { dataset[tmp[i]] }, mCentroidType));
                    }
                    // assess quality of seed items: average similarity over all ordered pairs of distinct seeds
                    double simAvg = 0;
                    foreach (SparseVector<double> seed1 in seeds)
                    {
                        foreach (SparseVector<double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += mSimilarity.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    // The first seed set is always accepted. Without the null check a NaN average (e.g. mK == 1
                    // divides 0 by 0) never compares below minSim, and centroids would stay null and crash
                    // the main loop below.
                    if (centroids == null || simAvg < minSim)
                    {
                        minSim    = simAvg;
                        centroids = seeds;
                    }
                }
                // main loop
                int    iter          = 0;
                double bestClustQual = 0; // quality of the previous iteration
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Trace("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters
                    foreach (Cluster cluster in clustering.Roots)
                    {
                        cluster.Items.Clear();
                    }
                    for (int i = 0; i < dataset.Count; i++)
                    {
                        SparseVector<double> example = dataset[i];
                        double maxSim = double.MinValue;
                        ArrayList<int> candidates = new ArrayList<int>();
                        for (int j = 0; j < mK; j++)
                        {
                            SparseVector<double> centroid = centroids[j];
                            double sim = mSimilarity.GetSimilarity(example, centroid);
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(j);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(j);
                            }
                        }
                        // ties between equally similar centroids are broken at random
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        clustering.Roots[candidates[0]].Items.Add(i);
                        clustQual += maxSim;
                    }
                    clustQual /= (double)dataset.Count;
                    mLogger.Trace("Cluster", "Quality: {0:0.0000}", clustQual);
                    // check if done: the first iteration has nothing to compare against
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, mCentroidType);
                    }
                }
                // keep the trial with the highest final quality; the first trial always qualifies
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    bestClustering      = clustering;
                }
            }
            return bestClustering;
        }
Exemplo n.º 20
0
        /// <summary>
        /// Incrementally updates the clustering held by this object: adds the instances in
        /// <paramref name="batch"/>, discards the first <paramref name="numOutdated"/> instances of the
        /// stored dataset, re-runs the k-means main loop and then adapts the number of clusters
        /// so that the clustering quality is kept around mQualThresh.
        /// </summary>
        /// <param name="numOutdated">
        /// Number of instances to drop from the front of the stored dataset. Must be non-negative,
        /// must be 0 on the very first call (nothing is stored yet) and must not exceed the stored dataset size.
        /// </param>
        /// <param name="batch">New instances to add. Must not be null; may be empty.</param>
        /// <returns>
        /// The result of GetClusteringResult(), or an empty ClusteringResult when there is nothing to cluster
        /// (empty first batch, or every cluster became empty after removing outdated instances).
        /// </returns>
        public ClusteringResult Cluster(int numOutdated, IUnlabeledExampleCollection <SparseVector <double> > batch)
        {
            // argument checks (Utils.ThrowException is handed null when the argument is valid)
            Utils.ThrowException(batch == null ? new ArgumentNullException("batch") : null);
            Utils.ThrowException(numOutdated < 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
            if (mDataset == null)
            {
                // initialize
                mLogger.Trace("Cluster", "Initializing ...");
                // nothing has been stored yet, so nothing can be outdated
                Utils.ThrowException(numOutdated > 0 ? new ArgumentOutOfRangeException("numOutdated") : null);
                //Utils.ThrowException(batch.Count == 0 ? new ArgumentValueException("batch") : null);
                if (batch.Count == 0)
                {
                    // stay uninitialized (mDataset remains null) and report an empty result
                    return(new ClusteringResult());
                }
                // k is capped by the number of available instances
                kMeans(batch, Math.Min(mK, batch.Count));
                mDataset = new UnlabeledDataset <SparseVector <double> >(batch);
                // give every initial cluster a fresh topic identifier
                foreach (CentroidData centroid in mCentroids)
                {
                    centroid.Tag = mTopicId++;
                }
                //OutputState();
            }
            else
            {
                // update clusters
                Utils.ThrowException(numOutdated > mDataset.Count ? new ArgumentOutOfRangeException("numOutdated") : null);
                if (numOutdated == 0 && batch.Count == 0)
                {
                    // nothing changed; report the current state
                    return(GetClusteringResult());
                }
                mLogger.Trace("Cluster", "Updating clusters ...");
                // assign new instances
                // NOTE(review): offs = mDataset.Count presumably makes the new instances' indices follow
                // the ones already stored; the quality value returned through the out parameter is ignored
                double dummy;
                Assign(mCentroids, ModelUtils.GetTransposedMatrix(batch), batch.Count, /*offs=*/ mDataset.Count, out dummy);
                mDataset.AddRange(batch);
                // remove outdated instances
                foreach (CentroidData centroid in mCentroids)
                {
                    // re-queue only the instances that survive the cut (index >= numOutdated)
                    foreach (int item in centroid.CurrentItems)
                    {
                        if (item >= numOutdated)
                        {
                            centroid.Items.Add(item);
                        }
                    }
                    // indices still refer to the dataset before RemoveRange, so update the centroid first
                    centroid.Update(mDataset);
                    centroid.UpdateCentroidLen();
                }
                mDataset.RemoveRange(0, numOutdated);
                // keep only non-empty clusters and shift their item indices to the trimmed dataset
                ArrayList <CentroidData> centroidsNew = new ArrayList <CentroidData>(mCentroids.Count);
                foreach (CentroidData centroid in mCentroids)
                {
                    if (centroid.CurrentItems.Count > 0)
                    {
                        centroidsNew.Add(centroid);
                        Set <int> tmp = new Set <int>();
                        foreach (int idx in centroid.CurrentItems)
                        {
                            tmp.Add(idx - numOutdated);
                        }
                        centroid.CurrentItems.Inner.SetItems(tmp);
                    }
                }
                if (centroidsNew.Count == 0) // reset
                {
                    // back to the uninitialized state; the next call takes the "initialize" branch
                    mCentroids = null;
                    mDataset   = null;
                    return(new ClusteringResult());
                }
                mCentroids = centroidsNew;
                // execute main loop
                kMeansMainLoop(mDataset, mCentroids);
                //OutputState();
            }
            // adjust k
            double minQual; // *** not used at the moment
            int    minQualIdx;
            double qual = GetClustQual(out minQual, out minQualIdx);

            if (qual < mQualThresh)
            {
                // quality too low: keep adding clusters until the threshold is reached
                while (qual < mQualThresh) // split cluster at minQualIdx
                {
                    mLogger.Trace("Cluster", "Increasing k to {0} ...", mCentroids.Count + 1);
                    // the new cluster starts as a clone of the cluster at minQualIdx and gets a new topic id
                    mCentroids.Add(mCentroids[minQualIdx].Clone());
                    mCentroids.Last.Tag = mTopicId++;
                    kMeansMainLoop(mDataset, mCentroids);
                    // the larger of the two parts keeps the original topic identifier
                    if (mCentroids.Last.CurrentItems.Count > mCentroids[minQualIdx].CurrentItems.Count)
                    {
                        // swap topic identifiers
                        object tmp = mCentroids.Last.Tag;
                        mCentroids.Last.Tag        = mCentroids[minQualIdx].Tag;
                        mCentroids[minQualIdx].Tag = tmp;
                    }
                    qual = GetClustQual(out minQual, out minQualIdx);
                    //OutputState();
                }
            }
            else if (numOutdated > 0)
            {
                // instances were removed and quality is sufficient: try to get by with fewer clusters
                while (qual > mQualThresh && mCentroids.Count > 1) // join clusters
                {
                    mLogger.Trace("Cluster", "Decreasing k to {0} ...", mCentroids.Count - 1);
                    // snapshot used to roll back if the merge drops quality below the threshold
                    ArrayList <CentroidData> centroidsCopy = mCentroids.DeepClone();
                    if (mCentroids.Count == 2) // create single cluster
                    {
                        // the single cluster inherits the topic id of the larger cluster (second one on ties)
                        object topicId = mCentroids[0].CurrentItems.Count > mCentroids[1].CurrentItems.Count ? mCentroids[0].Tag : mCentroids[1].Tag;
                        mCentroids = new ArrayList <CentroidData>();
                        mCentroids.Add(new CentroidData());
                        for (int i = 0; i < mDataset.Count; i++)
                        {
                            mCentroids.Last.Items.Add(i);
                        }
                        mCentroids.Last.Tag = topicId;
                        mCentroids.Last.Update(mDataset);
                        mCentroids.Last.UpdateCentroidLen();
                    }
                    else
                    {
                        // merge the two most similar clusters into c1 and drop c2
                        int idx1, idx2;
                        GetMostSimilarClusters(out idx1, out idx2);
                        CentroidData c1      = mCentroids[idx1];
                        CentroidData c2      = mCentroids[idx2];
                        // the merged cluster inherits the topic id of the larger cluster (c2 on ties)
                        object       topicId = c1.CurrentItems.Count > c2.CurrentItems.Count ? c1.Tag : c2.Tag;
                        mCentroids.RemoveAt(idx2);
                        c1.Items.AddRange(c1.CurrentItems);
                        c1.Items.AddRange(c2.CurrentItems);
                        c1.Tag = topicId;
                        c1.Update(mDataset);
                        c1.UpdateCentroidLen();
                        kMeansMainLoop(mDataset, mCentroids);
                    }
                    qual = GetClustQual();
                    if (qual >= mQualThresh)
                    {
                        mLogger.Trace("Cluster", "Accepted solution at k = {0}.", mCentroids.Count);
                    }
                    else
                    {
                        // merge rejected: restore the snapshot; qual still holds the rejected (too low)
                        // value, so the loop condition fails and the loop ends here
                        mCentroids = centroidsCopy;
                    }
                    //OutputState();
                }
            }
            OutputState();
            return(GetClusteringResult());
        }
Exemplo n.º 21
0
 /// <summary>
 /// Produces an unlabeled view of this dataset by delegating to
 /// ModelUtils.ConvertToUnlabeledDataset with this instance as the source.
 /// </summary>
 /// <returns>The dataset returned by ModelUtils.ConvertToUnlabeledDataset.</returns>
 public UnlabeledDataset <ExT> ToUnlabeledDataset()
 {
     UnlabeledDataset<ExT> unlabeled = ModelUtils.ConvertToUnlabeledDataset<LblT, ExT>(this);
     return unlabeled;
 }
Exemplo n.º 22
0
        /// <summary>
        /// Trains the centroid-based classifier. One centroid is built per label by summing the
        /// label's examples; then, for mIterations rounds, every training example is classified by
        /// its highest dot product with a centroid and, on a mistake, the wrongly assigned centroid
        /// and the correct centroid are adjusted through AddToDiff with opposite signs. The adjustment
        /// weight starts at 1 and is multiplied by mDamping after every round.
        /// On return, mLabels holds the labels and mCentroidMtxTr the transposed matrix whose rows
        /// (before transposition) were the centroid vectors in the same order as mLabels.
        /// </summary>
        /// <param name="dataset">Labeled training examples; must be non-null and non-empty.</param>
        public void Train(ILabeledExampleCollection <LblT, SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
            Dictionary<LblT, CentroidData> centroids = new Dictionary<LblT, CentroidData>(mLblCmp);

            // accumulate one centroid per label (single dictionary lookup per example)
            foreach (LabeledExample<LblT, SparseVector<double>> labeledExample in dataset)
            {
                CentroidData centroidData;
                if (!centroids.TryGetValue(labeledExample.Label, out centroidData))
                {
                    centroidData = new CentroidData();
                    centroids.Add(labeledExample.Label, centroidData);
                }
                centroidData.AddToSum(labeledExample.Example);
            }
            foreach (CentroidData cenData in centroids.Values)
            {
                cenData.UpdateCentroidLen();
            }
            double learnRate = 1;

            double[][]           dotProd = null;
            SparseMatrix<double> dsMtx   = null;

            // the transposed dataset matrix is only needed when at least one refinement round runs
            if (mIterations > 0)
            {
                dotProd = new double[centroids.Count][];
                dsMtx   = ModelUtils.GetTransposedMatrix(ModelUtils.ConvertToUnlabeledDataset(dataset));
            }
            for (int iter = 1; iter <= mIterations; iter++)
            {
                mLogger.Info("Train", "Iteration {0} / {1} ...", iter, mIterations);
                // compute dot products
                // dotProd[c][i] = similarity of centroid c (in dictionary enumeration order) to example i
                mLogger.Info("Train", "Computing dot products ...");
                int j = 0;
                foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
                {
                    mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/ this, "Train", "Centroid {0} / {1} ...", j + 1, centroids.Count);
                    SparseVector<double> cenVec = labeledCentroid.Value.GetSparseVector();
                    dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                    j++;
                }
                // classify training examples
                mLogger.Info("Train", "Classifying training examples ...");
                int errCount = 0;
                for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
                {
                    mLogger.ProgressFast(Logger.Level.Info, /*sender=*/ this, "Train", "Example {0} / {1} ...", instIdx + 1, dataset.Count);
                    LabeledExample<LblT, SparseVector<double>> labeledExample = dataset[instIdx];
                    SparseVector<double> vec = labeledExample.Example;
                    // Resolve the true centroid through the dictionary so that label equality is decided
                    // by mLblCmp, exactly as when the centroids were created. Comparing keys with
                    // Equals() could disagree with a custom comparer and leave the centroid unresolved.
                    CentroidData actualCentroid = centroids[labeledExample.Label];
                    // find the most similar centroid (first one wins on ties)
                    double       maxSim           = double.MinValue;
                    CentroidData assignedCentroid = null;
                    int cenIdx = 0;
                    foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
                    {
                        double sim = dotProd[cenIdx][instIdx];
                        if (sim > maxSim)
                        {
                            maxSim           = sim;
                            assignedCentroid = labeledCentroid.Value;
                        }
                        cenIdx++;
                    }
                    if (assignedCentroid != actualCentroid)
                    {
                        // misclassified: adjust the wrong and the right centroid in opposite directions
                        assignedCentroid.AddToDiff(-learnRate, vec);
                        actualCentroid.AddToDiff(learnRate, vec);
                        errCount++;
                    }
                }
                mLogger.Info("Train", "Training set error rate: {0:0.00}%", (double)errCount / (double)dataset.Count * 100.0);
                // update centroids
                int k = 0;
                foreach (CentroidData centroidData in centroids.Values)
                {
                    mLogger.ProgressNormal(Logger.Level.Info, /*sender=*/ this, "Train", "Centroid {0} / {1} ...", ++k, centroids.Count);
                    centroidData.Update(mPositiveValuesOnly);
                    centroidData.UpdateCentroidLen();
                }
                learnRate *= mDamping;
            }
            // store the model: centroid vectors as matrix rows, then transpose
            mCentroidMtxTr = new SparseMatrix<double>();
            mLabels        = new ArrayList<LblT>();
            int rowIdx = 0;

            foreach (KeyValuePair<LblT, CentroidData> labeledCentroid in centroids)
            {
                mCentroidMtxTr[rowIdx++] = labeledCentroid.Value.GetSparseVector();
                mLabels.Add(labeledCentroid.Key);
            }
            mCentroidMtxTr = mCentroidMtxTr.GetTransposedCopy();
        }
Exemplo n.º 23
0
        /// <summary>
        /// Runs k-means (m_k clusters) m_trials times on <paramref name="dataset"/> and returns the
        /// clustering of the trial with the highest quality, where quality is the average similarity
        /// between each assigned example and the centroid it was assigned to.
        /// </summary>
        /// <param name="dataset">Examples to cluster; must be non-null and contain at least m_k examples.</param>
        /// <returns>The best clustering found, or null if m_trials is less than 1 (no trial runs).</returns>
        public ClusteringResult Cluster(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering             = null;
            ClusteringResult best_clustering        = null;
            double           global_best_clust_qual = 0;

            for (int trial = 1; trial <= m_trials; trial++)
            {
                Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
                ArrayList<SparseVector<double>.ReadOnly> centroids = null;
                clustering = new ClusteringResult();
                for (int i = 0; i < m_k; i++)
                {
                    clustering.Roots.Add(new Cluster());
                }
                // select seed items
                // three random seed sets are drawn; the one with the lowest average pairwise
                // similarity (i.e. the most spread-out seeds) is kept
                double         min_sim = double.MaxValue;
                ArrayList<int> tmp     = new ArrayList<int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList<SparseVector<double>.ReadOnly> seeds = new ArrayList<SparseVector<double>.ReadOnly>(m_k);
                    tmp.Shuffle(m_rnd);
                    for (int i = 0; i < m_k; i++)
                    {
                        seeds.Add(ModelUtils.ComputeCentroid(new SparseVector<double>.ReadOnly[] { dataset[tmp[i]].Example }, m_centroid_type));
                    }
                    // assess quality of seed items
                    double sim_avg = 0;
                    foreach (SparseVector<double>.ReadOnly seed_1 in seeds)
                    {
                        foreach (SparseVector<double>.ReadOnly seed_2 in seeds)
                        {
                            if (seed_1 != seed_2)
                            {
                                sim_avg += m_similarity.GetSimilarity(seed_1, seed_2);
                            }
                        }
                    }
                    sim_avg /= (double)(m_k * m_k - m_k);
                    //Console.WriteLine(sim_avg);
                    // sim_avg is NaN when the divisor is zero (m_k == 1) or a similarity is NaN; every
                    // comparison with NaN is false, so without the null fallback no seed set would ever
                    // be accepted and the main loop would dereference a null centroid list
                    bool is_better = sim_avg < min_sim;
                    if (is_better || centroids == null)
                    {
                        if (is_better)
                        {
                            min_sim = sim_avg;
                        }
                        centroids = seeds;
                    }
                }
                // main loop
                int    iter            = 0;
                double best_clust_qual = 0;
                double clust_qual;
                while (true)
                {
                    iter++;
                    clust_qual = 0;
                    // assign items to clusters
                    foreach (Cluster cluster in clustering.Roots)
                    {
                        cluster.Items.Clear();
                    }
                    for (int i = 0; i < dataset.Count; i++)
                    {
                        SparseVector<double>.ReadOnly example = dataset[i].Example;
                        double         max_sim    = double.MinValue;
                        ArrayList<int> candidates = new ArrayList<int>();
                        // collect every centroid that ties for the highest similarity
                        for (int j = 0; j < m_k; j++)
                        {
                            SparseVector<double>.ReadOnly centroid = centroids[j];
                            double sim = m_similarity.GetSimilarity(example, centroid);
                            if (sim > max_sim)
                            {
                                max_sim = sim;
                                candidates.Clear();
                                candidates.Add(j);
                            }
                            else if (sim == max_sim)
                            {
                                candidates.Add(j);
                            }
                        }
                        // break ties randomly
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(m_rnd);
                        }
                        // candidates is empty only when every similarity compared false against
                        // max_sim, which happens when all of them are NaN; such an example is left unassigned
                        if (candidates.Count > 0)
                        {
                            clustering.Roots[candidates[0]].Items.Add(new Pair<double, int>(1, i));
                            clust_qual += max_sim;
                        }
                    }
                    clust_qual /= (double)dataset.Count;
                    Utils.VerboseLine("*** Iteration {0} ***", iter);
                    Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
                    // check if done
                    // (stop once the quality gain over the previous iteration is at most m_eps)
                    if (iter > 1 && clust_qual - best_clust_qual <= m_eps)
                    {
                        break;
                    }
                    best_clust_qual = clust_qual;
                    // compute new centroids
                    for (int i = 0; i < m_k; i++)
                    {
                        centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type);
                    }
                }
                // keep the best trial (the first trial is always accepted)
                if (trial == 1 || clust_qual > global_best_clust_qual)
                {
                    global_best_clust_qual = clust_qual;
                    best_clustering        = clustering;
                }
            }
            return(best_clustering);
        }
Exemplo n.º 24
0
        /// <summary>
        /// Builds the kernel matrix over the support vectors of the model identified by mModelId.
        /// The linear kernel (pairwise dot products) is computed first and is then transformed
        /// in place according to mKernelType (polynomial, radial basis function or sigmoid);
        /// for any other kernel type the linear values are returned unchanged.
        /// </summary>
        /// <param name="rmvFeatIdx">
        /// Index of a feature column to remove from the support vectors before the kernel is
        /// computed; a negative value removes nothing.
        /// </param>
        /// <returns>A square jagged array with one row per support vector.</returns>
        private double[][] GetKernel(int rmvFeatIdx)
        {
            int svCount = SvmLightLib.GetSupportVectorCount(mModelId);

            // gather the support vectors as rows of a sparse matrix
            SparseMatrix<double> svMtx = new SparseMatrix<double>();
            for (int svIdx = 0; svIdx < svCount; svIdx++)
            {
                svMtx[svIdx] = GetSupportVector(svIdx);
            }
            if (rmvFeatIdx >= 0)
            {
                svMtx.RemoveColAt(rmvFeatIdx);
            }
            SparseMatrix<double> svMtxTr = svMtx.GetTransposedCopy();

            // linear kernel: row svIdx holds the dot products of support vector svIdx with all others
            double[][] kernel = new double[svCount][];
            for (int svIdx = 0; svIdx < svCount; svIdx++)
            {
                kernel[svIdx] = ModelUtils.GetDotProductSimilarity(svMtxTr, svCount, svMtx[svIdx]);
            }

            // non-linear kernels are derived from the linear values in place
            int size = kernel.Length;
            if (mKernelType == SvmLightKernelType.Polynomial)
            {
                for (int r = 0; r < size; r++)
                {
                    double[] rowVals = kernel[r];
                    for (int c = 0; c < size; c++)
                    {
                        rowVals[c] = Math.Pow(mKernelParamS * rowVals[c] + mKernelParamC, mKernelParamD);
                    }
                }
            }
            else if (mKernelType == SvmLightKernelType.RadialBasisFunction)
            {
                // the diagonal (squared norms) must be saved before the matrix is overwritten
                double[] sqNorms = new double[size];
                for (int d = 0; d < size; d++)
                {
                    sqNorms[d] = kernel[d][d];
                }
                for (int r = 0; r < size; r++)
                {
                    double[] rowVals = kernel[r];
                    for (int c = 0; c < size; c++)
                    {
                        rowVals[c] = Math.Exp(-mKernelParamGamma * (sqNorms[r] + sqNorms[c] - 2.0 * rowVals[c]));
                    }
                }
            }
            else if (mKernelType == SvmLightKernelType.Sigmoid)
            {
                for (int r = 0; r < size; r++)
                {
                    double[] rowVals = kernel[r];
                    for (int c = 0; c < size; c++)
                    {
                        rowVals[c] = Math.Tanh(mKernelParamS * rowVals[c] + mKernelParamC);
                    }
                }
            }
            return kernel;
        }
Exemplo n.º 25
0
 /// <summary>
 /// Creates a new dataset whose examples are this dataset's examples converted (via
 /// ModelUtils.ConvertExample) to the requested example type, keeping the labels.
 /// Supported target types are SparseVector&lt;double&gt;, SparseVector&lt;double&gt;.ReadOnly,
 /// BinaryVector&lt;int&gt; and BinaryVector&lt;int&gt;.ReadOnly.
 /// </summary>
 /// <param name="new_ex_type">Target example type; must not be null.</param>
 /// <param name="move">
 /// When true, each source slot is overwritten with a fresh LabeledExample right after it has been
 /// converted and the source list is cleared at the end, leaving this dataset empty.
 /// </param>
 /// <returns>The converted dataset.</returns>
 /// <exception cref="ArgumentNotSupportedException">The requested type is not one of the supported types.</exception>
 public IDataset <LblT> ConvertDataset(Type new_ex_type, bool move)
 {
     Utils.ThrowException(new_ex_type == null ? new ArgumentNullException("new_ex_type") : null);

     if (new_ex_type == typeof(SparseVector<double>))
     {
         Dataset<LblT, SparseVector<double>> sparse_ds = new Dataset<LblT, SparseVector<double>>();
         for (int idx = 0; idx < m_items.Count; idx++)
         {
             LabeledExample<LblT, ExT> item = m_items[idx];
             sparse_ds.Add(item.Label, ModelUtils.ConvertExample<SparseVector<double>>(item.Example));
             if (move)
             {
                 m_items[idx] = new LabeledExample<LblT, ExT>();
             }
         }
         if (move)
         {
             m_items.Clear();
         }
         return sparse_ds;
     }

     if (new_ex_type == typeof(SparseVector<double>.ReadOnly))
     {
         Dataset<LblT, SparseVector<double>.ReadOnly> sparse_ro_ds = new Dataset<LblT, SparseVector<double>.ReadOnly>();
         for (int idx = 0; idx < m_items.Count; idx++)
         {
             LabeledExample<LblT, ExT> item = m_items[idx];
             sparse_ro_ds.Add(item.Label, ModelUtils.ConvertExample<SparseVector<double>.ReadOnly>(item.Example));
             if (move)
             {
                 m_items[idx] = new LabeledExample<LblT, ExT>();
             }
         }
         if (move)
         {
             m_items.Clear();
         }
         return sparse_ro_ds;
     }

     if (new_ex_type == typeof(BinaryVector<int>))
     {
         Dataset<LblT, BinaryVector<int>> binary_ds = new Dataset<LblT, BinaryVector<int>>();
         for (int idx = 0; idx < m_items.Count; idx++)
         {
             LabeledExample<LblT, ExT> item = m_items[idx];
             binary_ds.Add(item.Label, ModelUtils.ConvertExample<BinaryVector<int>>(item.Example));
             if (move)
             {
                 m_items[idx] = new LabeledExample<LblT, ExT>();
             }
         }
         if (move)
         {
             m_items.Clear();
         }
         return binary_ds;
     }

     if (new_ex_type == typeof(BinaryVector<int>.ReadOnly))
     {
         Dataset<LblT, BinaryVector<int>.ReadOnly> binary_ro_ds = new Dataset<LblT, BinaryVector<int>.ReadOnly>();
         for (int idx = 0; idx < m_items.Count; idx++)
         {
             LabeledExample<LblT, ExT> item = m_items[idx];
             binary_ro_ds.Add(item.Label, ModelUtils.ConvertExample<BinaryVector<int>.ReadOnly>(item.Example));
             if (move)
             {
                 m_items[idx] = new LabeledExample<LblT, ExT>();
             }
         }
         if (move)
         {
             m_items.Clear();
         }
         return binary_ro_ds;
     }

     // NOTE: conversion to SvmFeatureVector is currently disabled, so it is rejected here like
     // any other unsupported type
     throw new ArgumentNotSupportedException("new_ex_type");
 }
Exemplo n.º 26
0
 /// <summary>
 /// Computes the centroid of this object's items (mItems) over <paramref name="dataset"/>
 /// by delegating to ModelUtils.ComputeCentroid.
 /// </summary>
 /// <param name="dataset">Dataset passed through to ModelUtils.ComputeCentroid together with mItems.</param>
 /// <param name="type">Centroid type passed through to ModelUtils.ComputeCentroid.</param>
 /// <returns>The centroid vector produced by ModelUtils.ComputeCentroid.</returns>
 public SparseVector <double> ComputeCentroid(IUnlabeledExampleCollection <SparseVector <double> > dataset, CentroidType type)
 {
     // the delegated call throws ArgumentValueException
     SparseVector<double> centroid = ModelUtils.ComputeCentroid(mItems, dataset, type);
     return centroid;
 }
Exemplo n.º 27
0
        /// <summary>
        /// Runs k-means (mK clusters) mTrials times on <paramref name="dataset"/> using dot-product
        /// similarity and returns the clustering of the trial with the highest quality, where quality
        /// is the average similarity between each assigned instance and its centroid.
        /// </summary>
        /// <param name="dataset">Instances to cluster; must be non-null and contain at least mK instances.</param>
        /// <returns>The best clustering found, or null if mTrials is less than 1 (no trial runs).</returns>
        public ClusteringResult Cluster(IUnlabeledExampleCollection <SparseVector <double> > dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < mK ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering          = null;
            double           globalBestClustQual = 0;

            for (int trial = 1; trial <= mTrials; trial++)
            {
                mLogger.Info("Cluster", "Clustering trial {0} of {1} ...", trial, mTrials);
                ArrayList<CentroidData> centroids = new ArrayList<CentroidData>(mK);
                ArrayList<int>          bestSeeds = null;
                for (int i = 0; i < mK; i++)
                {
                    centroids.Add(new CentroidData());
                }
                // select seed items
                // three random seed sets are drawn; the one with the lowest average pairwise
                // similarity (i.e. the most spread-out seeds) is kept
                double         minSim = double.MaxValue;
                ArrayList<int> tmp    = new ArrayList<int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList<SparseVector<double>> seeds = new ArrayList<SparseVector<double>>(mK);
                    tmp.Shuffle(mRnd);
                    for (int i = 0; i < mK; i++)
                    {
                        seeds.Add(dataset[tmp[i]]);
                    }
                    // assess quality of seed items
                    double simAvg = 0;
                    foreach (SparseVector<double> seed1 in seeds)
                    {
                        foreach (SparseVector<double> seed2 in seeds)
                        {
                            if (seed1 != seed2)
                            {
                                simAvg += DotProductSimilarity.Instance.GetSimilarity(seed1, seed2);
                            }
                        }
                    }
                    simAvg /= (double)(mK * mK - mK);
                    //Console.WriteLine(simAvg);
                    // simAvg is NaN when the divisor is zero (mK == 1) or a similarity is NaN; every
                    // comparison with NaN is false, so without the null fallback no seed set would ever
                    // be accepted and bestSeeds would be dereferenced while still null
                    bool isBetter = simAvg < minSim;
                    if (isBetter || bestSeeds == null)
                    {
                        if (isBetter)
                        {
                            minSim = simAvg;
                        }
                        // copy the indices: tmp is reshuffled in the next round
                        bestSeeds = new ArrayList<int>(mK);
                        for (int i = 0; i < mK; i++)
                        {
                            bestSeeds.Add(tmp[i]);
                        }
                    }
                }
                // every centroid starts out as its single seed instance
                for (int i = 0; i < mK; i++)
                {
                    centroids[i].Items.Add(bestSeeds[i]);
                    centroids[i].Update(dataset);
                    centroids[i].UpdateCentroidLen();
                }
                double[][]           dotProd = new double[mK][];
                SparseMatrix<double> dsMtx   = ModelUtils.GetTransposedMatrix(dataset);
                // main loop
                int    iter          = 0;
                double bestClustQual = 0;
                double clustQual;
                while (true)
                {
                    iter++;
                    mLogger.Info("Cluster", "Iteration {0} ...", iter);
                    clustQual = 0;
                    // assign items to clusters
                    // dotProd[c][i] = similarity of centroid c to instance i
                    int j = 0;
                    foreach (CentroidData cen in centroids)
                    {
                        SparseVector<double> cenVec = cen.GetSparseVector();
                        dotProd[j] = ModelUtils.GetDotProductSimilarity(dsMtx, dataset.Count, cenVec);
                        j++;
                    }
                    for (int instIdx = 0; instIdx < dataset.Count; instIdx++)
                    {
                        double         maxSim     = double.MinValue;
                        ArrayList<int> candidates = new ArrayList<int>();
                        // collect every centroid that ties for the highest similarity
                        for (int cenIdx = 0; cenIdx < mK; cenIdx++)
                        {
                            double sim = dotProd[cenIdx][instIdx];
                            if (sim > maxSim)
                            {
                                maxSim = sim;
                                candidates.Clear();
                                candidates.Add(cenIdx);
                            }
                            else if (sim == maxSim)
                            {
                                candidates.Add(cenIdx);
                            }
                        }
                        // break ties randomly
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(mRnd);
                        }
                        // candidates is empty only when every similarity compared false against
                        // maxSim, which happens when all of them are NaN; such an instance is left unassigned
                        if (candidates.Count > 0)
                        {
                            centroids[candidates[0]].Items.Add(instIdx);
                            clustQual += maxSim;
                        }
                    }
                    clustQual /= (double)dataset.Count;
                    mLogger.Info("Cluster", "Quality: {0:0.0000}", clustQual);
                    // check if done
                    // (stop once the quality gain over the previous iteration is at most mEps)
                    if (iter > 1 && clustQual - bestClustQual <= mEps)
                    {
                        break;
                    }
                    bestClustQual = clustQual;
                    // compute new centroids
                    for (int i = 0; i < mK; i++)
                    {
                        centroids[i].Update(dataset);
                        centroids[i].UpdateCentroidLen();
                    }
                }
                // keep the best trial (the first trial is always accepted)
                if (trial == 1 || clustQual > globalBestClustQual)
                {
                    globalBestClustQual = clustQual;
                    // save the result
                    clustering = new ClusteringResult();
                    for (int i = 0; i < mK; i++)
                    {
                        clustering.AddRoot(new Cluster());
                        clustering.Roots.Last.Items.AddRange(centroids[i].Items);
                    }
                }
            }
            return(clustering);
        }