Example 1
 public void Train(IExampleCollection <LblT, BinaryVector <int> .ReadOnly> dataset)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
     m_lambda = null; // allow GC to collect this
     m_lambda = MaxEnt.Gis(dataset, m_cut_off, m_num_iter, m_move_data, /*mtx_file_name=*/ null, ref m_idx_to_lbl, m_num_threads);
 }
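All of these snippets guard their arguments with Utils.ThrowException, which throws only when it is handed a non-null exception object. Below is a minimal sketch of that idiom; GuardUtils is a hypothetical helper written for illustration, not the actual LATINO Utils class.

    using System;

    // Hypothetical helper illustrating the guard idiom; not the LATINO Utils class.
    public static class GuardUtils
    {
        public static void ThrowException(Exception exception)
        {
            if (exception != null)
            {
                throw exception; // throws only when the caller actually built an exception
            }
        }
    }

A call such as GuardUtils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null); then acts as a one-line conditional throw.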
Example 2
        private static SparseMatrix <double> CreateObservationMatrix <LblT>(IExampleCollection <LblT, BinaryVector <int> .ReadOnly> dataset, ref LblT[] idx_to_lbl)
        {
            SparseMatrix <double>  mtx        = new SparseMatrix <double>();
            ArrayList <LblT>       tmp        = new ArrayList <LblT>();
            Dictionary <LblT, int> lbl_to_idx = new Dictionary <LblT, int>();

            foreach (LabeledExample <LblT, BinaryVector <int> .ReadOnly> labeled_example in dataset)
            {
                if (!lbl_to_idx.ContainsKey(labeled_example.Label))
                {
                    lbl_to_idx.Add(labeled_example.Label, lbl_to_idx.Count);
                    tmp.Add(labeled_example.Label);
                }
            }
            int i = 0;

            foreach (LabeledExample <LblT, BinaryVector <int> .ReadOnly> labeled_example in dataset)
            {
                Utils.Verbose("{0} / {1}\r", ++i, dataset.Count);
                int lbl_idx = lbl_to_idx[labeled_example.Label];
                if (!mtx.ContainsRowAt(lbl_idx))
                {
                    mtx[lbl_idx] = ModelUtils.ConvertExample <SparseVector <double> >(labeled_example.Example);
                }
                else
                {
                    SparseVector <double> new_vec = ModelUtils.ConvertExample <SparseVector <double> >(labeled_example.Example);
                    new_vec.Merge(mtx[lbl_idx], new SumOperator());
                    mtx[lbl_idx] = new_vec;
                }
            }
            Utils.VerboseLine("");
            idx_to_lbl = tmp.ToArray();
            return(mtx);
        }
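Example 2 builds the observation matrix by converting each example to a sparse vector and summing it into the row of its class. The sketch below restates that per-label accumulation step with plain BCL dictionaries standing in for SparseVector&lt;double&gt;; the names are hypothetical and only illustrate the idea.

    using System.Collections.Generic;

    public static class ObservationSketch
    {
        // Sum one example's sparse features into the row already kept for its label.
        public static void AddInto(Dictionary<int, double> row, Dictionary<int, double> example)
        {
            foreach (KeyValuePair<int, double> item in example)
            {
                double current;
                row.TryGetValue(item.Key, out current); // current stays 0 when the index is new
                row[item.Key] = current + item.Value;
            }
        }
    }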
Example 3
        private static SparseMatrix <double> CreateObservationMatrix2 <LblT>(IExampleCollection <LblT, BinaryVector <int> .ReadOnly> dataset, ref LblT[] idxToLbl)
        {
            ArrayList <LblT>       tmp      = new ArrayList <LblT>();
            Dictionary <LblT, int> lblToIdx = new Dictionary <LblT, int>();

            foreach (LabeledExample <LblT, BinaryVector <int> .ReadOnly> labeledExample in dataset)
            {
                if (!lblToIdx.ContainsKey(labeledExample.Label))
                {
                    lblToIdx.Add(labeledExample.Label, lblToIdx.Count);
                    tmp.Add(labeledExample.Label);
                }
            }
            // prepare struct for fast computation
            Dictionary <int, int>[] counter = new Dictionary <int, int> [tmp.Count];
            for (int j = 0; j < counter.Length; j++)
            {
                counter[j] = new Dictionary <int, int>();
            }
            // count features
            int i = 0;

            foreach (LabeledExample <LblT, BinaryVector <int> .ReadOnly> labeledExample in dataset)
            {
                Utils.Verbose("{0} / {1}\r", ++i, dataset.Count);
                int lblIdx = lblToIdx[labeledExample.Label];
                int val;
                foreach (int idx in labeledExample.Example)
                {
                    if (counter[lblIdx].TryGetValue(idx, out val))
                    {
                        counter[lblIdx][idx] = val + 1;
                    }
                    else
                    {
                        counter[lblIdx].Add(idx, 1);
                    }
                }
            }
            // create sparse matrix
            SparseMatrix <double> mtx = new SparseMatrix <double>();

            for (int j = 0; j < counter.Length; j++)
            {
                SparseVector <double> vec = new SparseVector <double>();
                foreach (KeyValuePair <int, int> item in counter[j])
                {
                    vec.InnerIdx.Add(item.Key);
                    vec.InnerDat.Add(item.Value);
                }
                vec.Sort();
                mtx[j] = vec;
            }
            idxToLbl = tmp.ToArray();
            Utils.VerboseLine("");
            return(mtx);
        }
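CreateObservationMatrix2 produces the same matrix as Example 2 but first counts features into one Dictionary&lt;int, int&gt; per class and only builds the sparse rows at the end, which avoids repeatedly merging sparse vectors. The sketch below shows how a single counter dictionary can be turned into an index-sorted sparse row, mirroring the InnerIdx/InnerDat + Sort() step above; BCL types only, names are hypothetical.

    using System.Collections.Generic;

    public static class SparseRowSketch
    {
        // Convert one per-class feature counter into an index-sorted sparse row.
        public static KeyValuePair<int, double>[] ToSortedSparseRow(Dictionary<int, int> counter)
        {
            var row = new List<KeyValuePair<int, double>>(counter.Count);
            foreach (KeyValuePair<int, int> item in counter)
            {
                row.Add(new KeyValuePair<int, double>(item.Key, item.Value));
            }
            row.Sort((a, b) => a.Key.CompareTo(b.Key)); // keep feature indices ascending
            return row.ToArray();
        }
    }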
Example 4
        private static double GisFindMaxF <LblT>(IExampleCollection <LblT, BinaryVector <int> .ReadOnly> dataset)
        {
            double max_val = 0;

            foreach (LabeledExample <LblT, BinaryVector <int> .ReadOnly> item in dataset)
            {
                if (item.Example.Count > max_val)
                {
                    max_val = item.Example.Count;
                }
            }
            return(max_val);
        }
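GisFindMaxF returns the largest number of active features over all examples, i.e. the normalization constant F used later in the GIS lambda updates. A LINQ-flavoured equivalent on plain data is sketched below; the types are hypothetical, and note that Enumerable.Max throws on an empty sequence, so the empty case is handled explicitly.

    using System.Collections.Generic;
    using System.Linq;

    public static class MaxFSketch
    {
        public static double FindMaxActiveFeatures(ICollection<int[]> examples)
        {
            // Mirror the loop above: 0 for an empty collection, otherwise the maximum count.
            return examples.Count == 0 ? 0 : examples.Max(example => (double)example.Length);
        }
    }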
Example 5
        public Centroid(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
        {
            int max_idx = -1;

            foreach (LabeledExample <LblT, SparseVector <double> .ReadOnly> labeled_example in dataset)
            {
                int last_idx = labeled_example.Example.LastNonEmptyIndex;
                if (last_idx > max_idx)
                {
                    max_idx = last_idx;
                }
            }
            m_vec     = new double[max_idx + 1];
            m_dataset = dataset;
        }
Example 6
        public void Train(IExampleCollection <double, SparseVector <double> .ReadOnly> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
            LSqrSparseMatrix mat = new LSqrSparseMatrix(dataset.Count);

            double[] rhs      = new double[dataset.Count];
            int      sol_size = -1;
            int      i        = 0;

            foreach (LabeledExample <double, SparseVector <double> .ReadOnly> labeled_example in dataset)
            {
                if (labeled_example.Example.LastNonEmptyIndex + 1 > sol_size)
                {
                    sol_size = labeled_example.Example.LastNonEmptyIndex + 1;
                }
                foreach (IdxDat <double> item in labeled_example.Example)
                {
                    mat.InsertValue(i, item.Idx, item.Dat);
                }
                rhs[i++] = labeled_example.Label;
            }
            LSqrSparseMatrix mat_t = new LSqrSparseMatrix(sol_size);

            i = 0;
            foreach (LabeledExample <double, SparseVector <double> .ReadOnly> labeled_example in dataset)
            {
                foreach (IdxDat <double> item in labeled_example.Example)
                {
                    mat_t.InsertValue(item.Idx, i, item.Dat);
                }
                i++;
            }
            int num_iter = m_num_iter < 0 ? sol_size + dataset.Count + 50 : m_num_iter;

            m_sol = new ArrayList <double>(LSqrDll.DoLSqr(sol_size, mat, mat_t, rhs, num_iter));
            mat.Dispose();
            mat_t.Dispose();
        }
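This Train method hands the design matrix and its transpose to an LSQR solver, i.e. it solves the sparse least-squares problem min ||Ax - b||, where each row of A is one example and b holds the labels. The original code iterates the dataset twice, once per orientation; the sketch below builds both orientations in a single pass using coordinate (row, column, value) triples. All names and types are hypothetical stand-ins for the LATINO/LSqrSparseMatrix ones.

    using System;
    using System.Collections.Generic;

    public static class LeastSquaresSketch
    {
        // One pass over the labeled examples fills the right-hand side and both
        // the row-major and column-major coordinate lists.
        public static void BuildSystem(
            IList<KeyValuePair<double, KeyValuePair<int, double>[]>> labeledExamples,
            out List<Tuple<int, int, double>> rowMajor,
            out List<Tuple<int, int, double>> colMajor,
            out double[] rhs)
        {
            rowMajor = new List<Tuple<int, int, double>>();
            colMajor = new List<Tuple<int, int, double>>();
            rhs = new double[labeledExamples.Count];
            for (int row = 0; row < labeledExamples.Count; row++)
            {
                rhs[row] = labeledExamples[row].Key; // the regression target
                foreach (KeyValuePair<int, double> item in labeledExamples[row].Value)
                {
                    rowMajor.Add(Tuple.Create(row, item.Key, item.Value));
                    colMajor.Add(Tuple.Create(item.Key, row, item.Value));
                }
            }
            colMajor.Sort((a, b) => a.Item1.CompareTo(b.Item1)); // group entries by column
        }
    }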
Example 7
        private static SparseMatrix <double> TransposeDataset <LblT>(IExampleCollection <LblT, BinaryVector <int> .ReadOnly> dataset, bool clear_dataset)
        {
            SparseMatrix <double> aux = new SparseMatrix <double>();
            int i = 0;

            if (clear_dataset)
            {
                foreach (LabeledExample <LblT, BinaryVector <int> .ReadOnly> item in dataset)
                {
                    aux[i++] = ModelUtils.ConvertExample <SparseVector <double> >(item.Example);
                    item.Example.Inner.Clear(); // *** clear read-only vectors to save space
                }
            }
            else
            {
                foreach (LabeledExample <LblT, BinaryVector <int> .ReadOnly> item in dataset)
                {
                    aux[i++] = ModelUtils.ConvertExample <SparseVector <double> >(item.Example);
                }
            }
            return(aux.GetTransposedCopy());
        }
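TransposeDataset converts every example to a sparse row and transposes the resulting matrix so that the GIS inner loop can run feature-wise; the clear_dataset flag additionally frees the original vectors as it goes. A self-contained sketch of such a sparse transpose with BCL types (hypothetical names) follows.

    using System.Collections.Generic;

    public static class TransposeSketch
    {
        // Rows given as (featureIndex, value) pairs become columns keyed by that index.
        public static Dictionary<int, List<KeyValuePair<int, double>>> Transpose(
            IList<KeyValuePair<int, double>[]> rows)
        {
            var columns = new Dictionary<int, List<KeyValuePair<int, double>>>();
            for (int rowIdx = 0; rowIdx < rows.Count; rowIdx++)
            {
                foreach (KeyValuePair<int, double> item in rows[rowIdx])
                {
                    List<KeyValuePair<int, double>> column;
                    if (!columns.TryGetValue(item.Key, out column))
                    {
                        column = new List<KeyValuePair<int, double>>();
                        columns.Add(item.Key, column);
                    }
                    column.Add(new KeyValuePair<int, double>(rowIdx, item.Value)); // (rowIndex, value)
                }
            }
            return columns;
        }
    }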
Example 8
        public void Train(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
            m_centroids = new ArrayList <Pair <LblT, SparseVector <double> .ReadOnly> >();
            Dictionary <LblT, ArrayList <SparseVector <double> .ReadOnly> > tmp = new Dictionary <LblT, ArrayList <SparseVector <double> .ReadOnly> >(m_lbl_cmp);

            foreach (LabeledExample <LblT, SparseVector <double> .ReadOnly> labeled_example in dataset)
            {
                if (!tmp.ContainsKey(labeled_example.Label))
                {
                    tmp.Add(labeled_example.Label, new ArrayList <SparseVector <double> .ReadOnly>(new SparseVector <double> .ReadOnly[] { labeled_example.Example }));
                }
                else
                {
                    tmp[labeled_example.Label].Add(labeled_example.Example);
                }
            }
            foreach (KeyValuePair <LblT, ArrayList <SparseVector <double> .ReadOnly> > centroid_data in tmp)
            {
                SparseVector <double> centroid = ModelUtils.ComputeCentroid(centroid_data.Value, m_normalize ? CentroidType.NrmL2 : CentroidType.Avg);
                m_centroids.Add(new Pair <LblT, SparseVector <double> .ReadOnly>(centroid_data.Key, centroid));
            }
        }
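Example 8 trains a centroid classifier: it groups the training vectors by label and then averages (optionally L2-normalizes) each group into one centroid per class. The grouping step, restated with BCL types and hypothetical names:

    using System.Collections.Generic;

    public static class GroupingSketch
    {
        // Collect all example vectors per label before averaging them into centroids.
        public static Dictionary<string, List<double[]>> GroupByLabel(
            IEnumerable<KeyValuePair<string, double[]>> labeledExamples)
        {
            var groups = new Dictionary<string, List<double[]>>();
            foreach (var labeledExample in labeledExamples)
            {
                List<double[]> group;
                if (!groups.TryGetValue(labeledExample.Key, out group))
                {
                    group = new List<double[]>();
                    groups.Add(labeledExample.Key, group);
                }
                group.Add(labeledExample.Value);
            }
            return groups;
        }
    }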
Example 9
 public Centroid(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset, int vec_len)
 {
     m_vec     = new double[vec_len];
     m_dataset = dataset;
 }
Example 10
 void IModel <LblT> .Train(IExampleCollection <LblT> dataset)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(!(dataset is IExampleCollection <LblT, SparseVector <double> .ReadOnly>) ? new ArgumentTypeException("dataset") : null);
     Train((IExampleCollection <LblT, SparseVector <double> .ReadOnly>)dataset); // throws ArgumentValueException
 }
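Example 10 is the weakly typed entry point required by IModel&lt;LblT&gt;: it checks the runtime type of the dataset and forwards to the strongly typed Train overload. A minimal sketch of the same pattern with made-up interfaces (not the LATINO ones) is shown below.

    using System;
    using System.Collections.Generic;

    public interface IModelSketch
    {
        void Train(object dataset);
    }

    public class ModelSketch : IModelSketch
    {
        // Explicit interface implementation: reachable only through IModelSketch.
        void IModelSketch.Train(object dataset)
        {
            if (dataset == null) { throw new ArgumentNullException("dataset"); }
            var typed = dataset as IList<KeyValuePair<string, double[]>>;
            if (typed == null) { throw new ArgumentException("dataset"); }
            Train(typed); // forward to the strongly typed overload
        }

        public void Train(IList<KeyValuePair<string, double[]>> dataset)
        {
            if (dataset.Count == 0) { throw new ArgumentException("dataset"); }
            // ... the actual training would go here
        }
    }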
Example 11
        public SparseVector <double> .ReadOnly ComputeCentroid <LblT>(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset, CentroidType type)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Dictionary <int, double> tmp = new Dictionary <int, double>();
            double wgt_sum = 0;

            foreach (Pair <double, int> wgt_vec in m_items)
            {
                Utils.ThrowException((wgt_vec.Second < 0 || wgt_vec.Second >= dataset.Count) ? new IndexOutOfRangeException("Items (dataset index)") : null);
                foreach (IdxDat <double> item in dataset[wgt_vec.Second].Example)
                {
                    if (tmp.ContainsKey(item.Idx))
                    {
                        tmp[item.Idx] += wgt_vec.First * item.Dat;
                    }
                    else
                    {
                        tmp.Add(item.Idx, wgt_vec.First * item.Dat);
                    }
                }
                wgt_sum += wgt_vec.First;
            }
            Utils.ThrowException(wgt_sum == 0 ? new ArgumentValueException("Items (weights)") : null);
            SparseVector <double> centroid = new SparseVector <double>();

            switch (type)
            {
            case CentroidType.Sum:
                foreach (KeyValuePair <int, double> item in tmp)
                {
                    centroid.InnerIdx.Add(item.Key);
                    centroid.InnerDat.Add(item.Value);
                }
                break;

            case CentroidType.Avg:
                foreach (KeyValuePair <int, double> item in tmp)
                {
                    centroid.InnerIdx.Add(item.Key);
                    centroid.InnerDat.Add(item.Value / wgt_sum);
                }
                break;

            case CentroidType.NrmL2:
                double vec_len = 0;
                foreach (KeyValuePair <int, double> item in tmp)
                {
                    vec_len += item.Value * item.Value;
                }
                Utils.ThrowException(vec_len == 0 ? new InvalidOperationException() : null);
                vec_len = Math.Sqrt(vec_len);
                foreach (KeyValuePair <int, double> item in tmp)
                {
                    centroid.InnerIdx.Add(item.Key);
                    centroid.InnerDat.Add(item.Value / vec_len);
                }
                break;
            }
            centroid.Sort();
            return(centroid);
        }
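ComputeCentroid forms a weighted sum of the member vectors and then either keeps the raw sum, divides by the weight sum (average), or divides by its Euclidean length (L2 normalization). The NrmL2 branch is restated below on BCL dictionaries with hypothetical names; the all-zero case, which the original rejects with an exception, is assumed away here.

    using System;
    using System.Collections.Generic;

    public static class CentroidSketch
    {
        // weightedVectors pairs a weight with a sparse vector given as (index, value) pairs.
        public static Dictionary<int, double> WeightedL2Centroid(
            IEnumerable<KeyValuePair<double, KeyValuePair<int, double>[]>> weightedVectors)
        {
            var sum = new Dictionary<int, double>();
            foreach (var weightedVector in weightedVectors)
            {
                foreach (KeyValuePair<int, double> item in weightedVector.Value)
                {
                    double current;
                    sum.TryGetValue(item.Key, out current);
                    sum[item.Key] = current + weightedVector.Key * item.Value; // weight * value
                }
            }
            double lengthSquared = 0;
            foreach (double val in sum.Values) { lengthSquared += val * val; }
            double length = Math.Sqrt(lengthSquared); // assumed non-zero in this sketch
            var centroid = new Dictionary<int, double>();
            foreach (KeyValuePair<int, double> item in sum)
            {
                centroid[item.Key] = item.Value / length;
            }
            return centroid;
        }
    }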
Example 12
        public ClusteringResult Cluster(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);
            ClusteringResult clustering             = null;
            ClusteringResult best_clustering        = null;
            double           global_best_clust_qual = 0;

            for (int trial = 1; trial <= m_trials; trial++)
            {
                Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
                ArrayList <SparseVector <double> .ReadOnly> centroids = null;
                clustering = new ClusteringResult();
                for (int i = 0; i < m_k; i++)
                {
                    clustering.Roots.Add(new Cluster());
                }
                // select seed items
                double          min_sim = double.MaxValue;
                ArrayList <int> tmp     = new ArrayList <int>(dataset.Count);
                for (int i = 0; i < dataset.Count; i++)
                {
                    tmp.Add(i);
                }
                for (int k = 0; k < 3; k++)
                {
                    ArrayList <SparseVector <double> .ReadOnly> seeds = new ArrayList <SparseVector <double> .ReadOnly>(m_k);
                    tmp.Shuffle(m_rnd);
                    for (int i = 0; i < m_k; i++)
                    {
                        seeds.Add(ModelUtils.ComputeCentroid(new SparseVector <double> .ReadOnly[] { dataset[tmp[i]].Example }, m_centroid_type));
                    }
                    // assess quality of seed items
                    double sim_avg = 0;
                    foreach (SparseVector <double> .ReadOnly seed_1 in seeds)
                    {
                        foreach (SparseVector <double> .ReadOnly seed_2 in seeds)
                        {
                            if (seed_1 != seed_2)
                            {
                                sim_avg += m_similarity.GetSimilarity(seed_1, seed_2);
                            }
                        }
                    }
                    sim_avg /= (double)(m_k * m_k - m_k);
                    //Console.WriteLine(sim_avg);
                    if (sim_avg < min_sim)
                    {
                        min_sim   = sim_avg;
                        centroids = seeds;
                    }
                }
                // main loop
                int    iter            = 0;
                double best_clust_qual = 0;
                double clust_qual;
                while (true)
                {
                    iter++;
                    clust_qual = 0;
                    // assign items to clusters
                    foreach (Cluster cluster in clustering.Roots)
                    {
                        cluster.Items.Clear();
                    }
                    for (int i = 0; i < dataset.Count; i++)
                    {
                        SparseVector <double> .ReadOnly example = dataset[i].Example;
                        double          max_sim    = double.MinValue;
                        ArrayList <int> candidates = new ArrayList <int>();
                        for (int j = 0; j < m_k; j++)
                        {
                            SparseVector <double> .ReadOnly centroid = centroids[j];
                            double sim = m_similarity.GetSimilarity(example, centroid);
                            if (sim > max_sim)
                            {
                                max_sim = sim;
                                candidates.Clear();
                                candidates.Add(j);
                            }
                            else if (sim == max_sim)
                            {
                                candidates.Add(j);
                            }
                        }
                        if (candidates.Count > 1)
                        {
                            candidates.Shuffle(m_rnd);
                        }
                        if (candidates.Count > 0) // *** is this always true?
                        {
                            clustering.Roots[candidates[0]].Items.Add(new Pair <double, int>(1, i));
                            clust_qual += max_sim;
                        }
                    }
                    clust_qual /= (double)dataset.Count;
                    Utils.VerboseLine("*** Iteration {0} ***", iter);
                    Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
                    // check if done
                    if (iter > 1 && clust_qual - best_clust_qual <= m_eps)
                    {
                        break;
                    }
                    best_clust_qual = clust_qual;
                    // compute new centroids
                    for (int i = 0; i < m_k; i++)
                    {
                        centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type);
                    }
                }
                if (trial == 1 || clust_qual > global_best_clust_qual)
                {
                    global_best_clust_qual = clust_qual;
                    best_clustering        = clustering;
                }
            }
            return(best_clustering);
        }
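The Cluster method restarts k-means m_trials times; within each trial it draws three random candidate seed sets, keeps the one whose seeds are least similar to each other on average, and then iterates assign/recompute until the quality gain drops to m_eps or below. The seed-selection heuristic, isolated with BCL types and a hypothetical similarity delegate:

    using System;
    using System.Collections.Generic;

    public static class SeedSelectionSketch
    {
        // Return the candidate seed set with the lowest average pairwise similarity.
        // Each set is assumed to contain at least two seeds.
        public static int[] PickLeastSimilarSeeds(IEnumerable<int[]> candidateSets,
                                                  Func<int, int, double> similarity)
        {
            int[] best = null;
            double minAvgSim = double.MaxValue;
            foreach (int[] seeds in candidateSets)
            {
                double simSum = 0;
                for (int i = 0; i < seeds.Length; i++)
                {
                    for (int j = 0; j < seeds.Length; j++)
                    {
                        if (i != j) { simSum += similarity(seeds[i], seeds[j]); }
                    }
                }
                double avgSim = simSum / (seeds.Length * seeds.Length - seeds.Length);
                if (avgSim < minAvgSim)
                {
                    minAvgSim = avgSim;
                    best = seeds;
                }
            }
            return best;
        }
    }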
Example 13
 ClusteringResult IClustering <LblT> .Cluster(IExampleCollection <LblT> dataset)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(!(dataset is IExampleCollection <LblT, SparseVector <double> .ReadOnly>) ? new ArgumentTypeException("dataset") : null);
     return(Cluster((IExampleCollection <LblT, SparseVector <double> .ReadOnly>)dataset)); // throws ArgumentValueException
 }
Example 14
 public void Train(IExampleCollection <LblT, ExT> dataset)
 {
     Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
     Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
     m_examples = new ArrayList <LabeledExample <LblT, ExT> >(dataset);
 }
Example 15
        public static SparseMatrix <double> Gis <LblT>(IExampleCollection <LblT, BinaryVector <int> .ReadOnly> dataset, int cut_off, int num_iter, bool clear_dataset, string mtx_file_name, ref LblT[] idx_to_lbl, int num_threads)
        {
            Utils.VerboseLine("Creating observation matrix ...");
            SparseMatrix <double> observations = null;

            if (Utils.VerifyFileNameOpen(mtx_file_name))
            {
                BinarySerializer reader = new BinarySerializer(mtx_file_name, FileMode.Open);
                idx_to_lbl   = new ArrayList <LblT>(reader).ToArray();
                observations = new SparseMatrix <double>(reader);
                reader.Close();
            }
            else
            {
                observations = CreateObservationMatrix2(dataset, ref idx_to_lbl);
                //SparseMatrix<double> test = CreateObservationMatrix(dataset, ref idx_to_lbl);
                //Console.WriteLine(test.ContentEquals(observations));
                if (Utils.VerifyFileNameCreate(mtx_file_name))
                {
                    BinarySerializer writer = new BinarySerializer(mtx_file_name, FileMode.Create);
                    new ArrayList <LblT>(idx_to_lbl).Save(writer);
                    observations.Save(writer);
                    writer.Close();
                }
            }
            int num_classes  = observations.GetLastNonEmptyRowIdx() + 1;
            int num_examples = dataset.Count;

            if (cut_off > 0)
            {
                Utils.VerboseLine("Performing cut-off ...");
                observations = CutOff(observations, cut_off);
            }
            Utils.VerboseLine("Preparing structures ...");
            SparseMatrix <double> lambda       = CopyStructure(observations);
            SparseMatrix <double> expectations = CopyStructure(observations);
            double f = GisFindMaxF(dataset);
            SparseMatrix <double> train_mtx_tr = TransposeDataset(dataset, clear_dataset);

            Utils.VerboseLine("Entering main loop ...");
            for (int i = 0; i < num_iter; i++)
            {
                Utils.VerboseLine("Iteration {0} / {1} ...", i + 1, num_iter);
                Utils.VerboseLine("Updating expectations ...");
                if (num_threads > 1)
                {
                    UpdateExpectationMatrix(num_classes, num_examples, train_mtx_tr, lambda, expectations, num_threads);
                }
                else
                {
                    UpdateExpectationMatrix(num_classes, num_examples, train_mtx_tr, lambda, expectations);
                }
                Utils.VerboseLine("Updating lambdas ...");
                GisUpdate(lambda, expectations, observations, f);
                //SaveForMatlab(expectations, "c:\\mec\\old\\expem.txt");
                Reset(expectations);
            }
            //SaveForMatlab(lambda, "c:\\mec\\old\\lamem.txt");
            //SaveForMatlab(observations, "c:\\mec\\old\\obsem.txt");
            return(lambda);
        }
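Gis alternates between computing model expectations under the current lambdas and updating the lambdas against the observed counts. The classic GIS step is lambda += (1/F) * ln(observed / expected), with F the constant returned by GisFindMaxF; whether GisUpdate implements exactly this variant is an assumption, so the element-wise sketch below on plain arrays is for orientation only.

    using System;

    public static class GisSketch
    {
        // One element-wise GIS update; skips entries with a zero observed or expected count.
        // This is the textbook rule, not necessarily the exact GisUpdate implementation.
        public static void GisStep(double[] lambda, double[] observed, double[] expected, double f)
        {
            for (int i = 0; i < lambda.Length; i++)
            {
                if (observed[i] > 0 && expected[i] > 0)
                {
                    lambda[i] += (1.0 / f) * Math.Log(observed[i] / expected[i]);
                }
            }
        }
    }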
Example 16
        public void Train(IExampleCollection <LblT, SparseVector <double> .ReadOnly> dataset)
        {
            Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
            Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
            m_centroids = new Dictionary <LblT, CentroidData>();
            foreach (LabeledExample <LblT, SparseVector <double> .ReadOnly> labeled_example in dataset)
            {
                if (!m_centroids.ContainsKey(labeled_example.Label))
                {
                    CentroidData centroid_data = new CentroidData();
                    centroid_data.AddToSum(labeled_example.Example);
                    m_centroids.Add(labeled_example.Label, centroid_data);
                }
                else
                {
                    CentroidData centroid_data = m_centroids[labeled_example.Label];
                    centroid_data.AddToSum(labeled_example.Example);
                }
            }
            foreach (CentroidData vec_data in m_centroids.Values)
            {
                vec_data.UpdateCentroidLen();
            }
            double learn_rate = 1;

            for (int iter = 1; iter <= m_iterations; iter++)
            {
                Utils.VerboseLine("Iteration {0} / {1} ...", iter, m_iterations);
                // classify training documents
                int i          = 0;
                int num_miscfy = 0;
                foreach (LabeledExample <LblT, SparseVector <double> .ReadOnly> labeled_example in dataset)
                {
                    Utils.Verbose("\rExample {0} / {1} ...", ++i, dataset.Count);
                    double       max_sim                = double.MinValue;
                    CentroidData assigned_centroid      = null;
                    CentroidData actual_centroid        = null;
                    SparseVector <double> .ReadOnly vec = labeled_example.Example;
                    foreach (KeyValuePair <LblT, CentroidData> labeled_centroid in m_centroids)
                    {
                        double sim = labeled_centroid.Value.GetSimilarity(vec);
                        if (sim > max_sim)
                        {
                            max_sim = sim; assigned_centroid = labeled_centroid.Value;
                        }
                        if (labeled_centroid.Key.Equals(labeled_example.Label))
                        {
                            actual_centroid = labeled_centroid.Value;
                        }
                    }
                    if (assigned_centroid != actual_centroid)
                    {
                        assigned_centroid.AddToDiff(-learn_rate, vec);
                        actual_centroid.AddToDiff(learn_rate, vec);
                        num_miscfy++;
                    }
                }
                Utils.VerboseLine("");
                Utils.VerboseLine("Training set error rate: {0:0.00}%", (double)num_miscfy / (double)dataset.Count * 100.0);
                // update centroids
                i = 0;
                foreach (CentroidData centroid_data in m_centroids.Values)
                {
                    Utils.Verbose("\rCentroid {0} / {1} ...", ++i, m_centroids.Count);
                    centroid_data.UpdateCentroid(m_positive_values_only);
                    centroid_data.UpdateCentroidLen();
                }
                Utils.VerboseLine("");
                learn_rate *= m_damping;
            }
        }
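This final Train method refines the class centroids iteratively: after the initial per-class sums, every misclassified training vector is subtracted from the centroid it was wrongly assigned to and added to its true class's centroid, scaled by a learning rate that is damped after each pass. A sketch of that single update on dense arrays, with hypothetical stand-ins for the CentroidData sums:

    using System.Collections.Generic;

    public static class CentroidUpdateSketch
    {
        // Apply one misclassification-driven update to the two affected centroids.
        public static void ApplyUpdate(double[] assignedCentroid, double[] actualCentroid,
                                       IEnumerable<KeyValuePair<int, double>> example,
                                       double learnRate)
        {
            foreach (KeyValuePair<int, double> item in example)
            {
                assignedCentroid[item.Key] -= learnRate * item.Value; // push the wrong centroid away
                actualCentroid[item.Key]   += learnRate * item.Value; // pull the right centroid closer
            }
        }
    }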