/// <summary>Trains this maximum-entropy classifier on the given labeled dataset
/// by running Generalized Iterative Scaling (GIS).</summary>
/// <param name="dataset">Labeled training examples; must be non-null and non-empty.</param>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentValueException">If <c>dataset</c> is empty.</exception>
public void Train(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    // Release the previous model up front so the GC can reclaim it while GIS runs.
    m_lambda = null;
    m_lambda = MaxEnt.Gis(dataset, m_cut_off, m_num_iter, m_move_data,
        /*mtx_file_name=*/null, ref m_idx_to_lbl, m_num_threads);
}
/// <summary>Builds the observed-feature-count matrix for GIS: one row per distinct
/// label, where each row is the element-wise sum of the (converted) example vectors
/// carrying that label. Also reports the row-index-to-label mapping via
/// <c>idx_to_lbl</c>.</summary>
/// <param name="dataset">Labeled training examples (iterated twice).</param>
/// <param name="idx_to_lbl">Receives labels ordered by their assigned row index.</param>
/// <returns>Sparse matrix of per-label feature counts.</returns>
private static SparseMatrix<double> CreateObservationMatrix<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, ref LblT[] idx_to_lbl)
{
    SparseMatrix<double> mtx = new SparseMatrix<double>();
    ArrayList<LblT> tmp = new ArrayList<LblT>();
    Dictionary<LblT, int> lbl_to_idx = new Dictionary<LblT, int>();
    // first pass: assign a row index to each distinct label in order of first appearance
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> labeled_example in dataset)
    {
        if (!lbl_to_idx.ContainsKey(labeled_example.Label))
        {
            lbl_to_idx.Add(labeled_example.Label, lbl_to_idx.Count);
            tmp.Add(labeled_example.Label);
        }
    }
    // second pass: accumulate each example into its label's row
    int i = 0;
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> labeled_example in dataset)
    {
        Utils.Verbose("{0} / {1}\r", ++i, dataset.Count);
        int lbl_idx = lbl_to_idx[labeled_example.Label];
        if (!mtx.ContainsRowAt(lbl_idx))
        {
            // first example for this label: the converted vector becomes the row
            mtx[lbl_idx] = ModelUtils.ConvertExample<SparseVector<double>>(labeled_example.Example);
        }
        else
        {
            // merge into the existing row via element-wise summation
            SparseVector<double> new_vec = ModelUtils.ConvertExample<SparseVector<double>>(labeled_example.Example);
            new_vec.Merge(mtx[lbl_idx], new SumOperator());
            mtx[lbl_idx] = new_vec;
        }
    }
    Utils.VerboseLine("");
    idx_to_lbl = tmp.ToArray();
    return (mtx);
}
/// <summary>Builds the observed-feature-count matrix for GIS (one row per distinct
/// label, entries are per-class feature occurrence counts). Faster variant of
/// <c>CreateObservationMatrix</c> that accumulates plain integer counters before
/// materializing the sparse matrix.</summary>
/// <param name="dataset">Labeled training examples (iterated twice).</param>
/// <param name="idxToLbl">Receives labels ordered by their assigned row index.</param>
/// <returns>Sparse matrix of per-label feature counts.</returns>
private static SparseMatrix<double> CreateObservationMatrix2<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, ref LblT[] idxToLbl)
{
    // assign a row index to each distinct label, in order of first appearance
    ArrayList<LblT> labels = new ArrayList<LblT>();
    Dictionary<LblT, int> labelToRow = new Dictionary<LblT, int>();
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> example in dataset)
    {
        if (!labelToRow.ContainsKey(example.Label))
        {
            labelToRow.Add(example.Label, labelToRow.Count);
            labels.Add(example.Label);
        }
    }
    // one counter dictionary per class for fast accumulation
    Dictionary<int, int>[] classCounts = new Dictionary<int, int>[labels.Count];
    for (int row = 0; row < classCounts.Length; row++)
    {
        classCounts[row] = new Dictionary<int, int>();
    }
    // count feature occurrences per class
    int exampleNum = 0;
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> example in dataset)
    {
        Utils.Verbose("{0} / {1}\r", ++exampleNum, dataset.Count);
        Dictionary<int, int> counts = classCounts[labelToRow[example.Label]];
        foreach (int featureIdx in example.Example)
        {
            int count;
            counts[featureIdx] = counts.TryGetValue(featureIdx, out count) ? count + 1 : 1;
        }
    }
    // materialize the counters as one sparse row per class
    SparseMatrix<double> mtx = new SparseMatrix<double>();
    for (int row = 0; row < classCounts.Length; row++)
    {
        SparseVector<double> vec = new SparseVector<double>();
        foreach (KeyValuePair<int, int> pair in classCounts[row])
        {
            vec.InnerIdx.Add(pair.Key);
            vec.InnerDat.Add(pair.Value);
        }
        vec.Sort(); // dictionary enumeration order is arbitrary; indices must be sorted
        mtx[row] = vec;
    }
    idxToLbl = labels.ToArray();
    Utils.VerboseLine("");
    return mtx;
}
/// <summary>Finds the largest number of active features in any single example
/// (the GIS correction constant F).</summary>
/// <param name="dataset">Labeled training examples.</param>
/// <returns>The maximum example size, or 0 for an empty dataset.</returns>
private static double GisFindMaxF<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset)
{
    double maxCount = 0;
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> example in dataset)
    {
        maxCount = Math.Max(maxCount, example.Example.Count);
    }
    return maxCount;
}
/// <summary>Creates a centroid over <c>dataset</c>, sizing the internal dense
/// vector to cover the highest non-empty index found in any example.</summary>
/// <param name="dataset">Examples this centroid will be computed from.</param>
public Centroid(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    // required dense length = largest LastNonEmptyIndex + 1 (0 if all examples are empty)
    int vecLen = 0;
    foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> item in dataset)
    {
        int requiredLen = item.Example.LastNonEmptyIndex + 1;
        if (requiredLen > vecLen)
        {
            vecLen = requiredLen;
        }
    }
    m_vec = new double[vecLen];
    m_dataset = dataset;
}
/// <summary>Trains the regression model by solving the sparse least-squares system
/// A x = b with the LSQR algorithm, where row i of A is example i and b[i] is its
/// label. Both A and its transpose are built as native <c>LSqrSparseMatrix</c>
/// structures and handed to <c>LSqrDll.DoLSqr</c>.</summary>
/// <param name="dataset">Labeled training examples; must be non-null and non-empty.</param>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentValueException">If <c>dataset</c> is empty.</exception>
public void Train(IExampleCollection<double, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    LSqrSparseMatrix mat = null;
    LSqrSparseMatrix mat_t = null;
    try
    {
        // build the system matrix (one row per example) and the right-hand side
        mat = new LSqrSparseMatrix(dataset.Count);
        double[] rhs = new double[dataset.Count];
        int sol_size = -1; // number of columns = highest feature index + 1
        int i = 0;
        foreach (LabeledExample<double, SparseVector<double>.ReadOnly> labeled_example in dataset)
        {
            if (labeled_example.Example.LastNonEmptyIndex + 1 > sol_size)
            {
                sol_size = labeled_example.Example.LastNonEmptyIndex + 1;
            }
            foreach (IdxDat<double> item in labeled_example.Example)
            {
                mat.InsertValue(i, item.Idx, item.Dat);
            }
            rhs[i++] = labeled_example.Label;
        }
        // build the transposed matrix (the LSQR routine needs both)
        mat_t = new LSqrSparseMatrix(sol_size);
        i = 0;
        foreach (LabeledExample<double, SparseVector<double>.ReadOnly> labeled_example in dataset)
        {
            foreach (IdxDat<double> item in labeled_example.Example)
            {
                mat_t.InsertValue(item.Idx, i, item.Dat);
            }
            i++;
        }
        // default iteration count follows the LSQR recommendation for convergence
        int num_iter = m_num_iter < 0 ? sol_size + dataset.Count + 50 : m_num_iter;
        m_sol = new ArrayList<double>(LSqrDll.DoLSqr(sol_size, mat, mat_t, rhs, num_iter));
    }
    finally
    {
        // dispose native matrices even if InsertValue/DoLSqr throws (original leaked here)
        if (mat != null) { mat.Dispose(); }
        if (mat_t != null) { mat_t.Dispose(); }
    }
}
/// <summary>Converts the dataset into a sparse matrix (one row per example) and
/// returns its transpose. When <c>clear_dataset</c> is true the original example
/// vectors are emptied after conversion to reduce peak memory usage.</summary>
/// <param name="dataset">Labeled training examples.</param>
/// <param name="clear_dataset">If true, clears each example's inner vector after copying it.</param>
/// <returns>The transposed example matrix.</returns>
private static SparseMatrix<double> TransposeDataset<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, bool clear_dataset)
{
    SparseMatrix<double> mtx = new SparseMatrix<double>();
    int row = 0;
    foreach (LabeledExample<LblT, BinaryVector<int>.ReadOnly> labeled_example in dataset)
    {
        mtx[row++] = ModelUtils.ConvertExample<SparseVector<double>>(labeled_example.Example);
        if (clear_dataset)
        {
            // destructive: empties the read-only wrapper's inner vector to save space
            labeled_example.Example.Inner.Clear();
        }
    }
    return mtx.GetTransposedCopy();
}
/// <summary>Trains the centroid classifier: groups examples by label, then computes
/// one centroid per label (L2-normalized when <c>m_normalize</c> is set, plain
/// average otherwise).</summary>
/// <param name="dataset">Labeled training examples; must be non-null and non-empty.</param>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentValueException">If <c>dataset</c> is empty.</exception>
public void Train(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    m_centroids = new ArrayList<Pair<LblT, SparseVector<double>.ReadOnly>>();
    // group example vectors by label (uses the configured label comparer)
    Dictionary<LblT, ArrayList<SparseVector<double>.ReadOnly>> tmp
        = new Dictionary<LblT, ArrayList<SparseVector<double>.ReadOnly>>(m_lbl_cmp);
    foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
    {
        // TryGetValue avoids the ContainsKey + indexer double lookup of the original
        ArrayList<SparseVector<double>.ReadOnly> examples;
        if (tmp.TryGetValue(labeled_example.Label, out examples))
        {
            examples.Add(labeled_example.Example);
        }
        else
        {
            tmp.Add(labeled_example.Label,
                new ArrayList<SparseVector<double>.ReadOnly>(new SparseVector<double>.ReadOnly[] { labeled_example.Example }));
        }
    }
    // compute one centroid per label group
    foreach (KeyValuePair<LblT, ArrayList<SparseVector<double>.ReadOnly>> centroid_data in tmp)
    {
        SparseVector<double> centroid = ModelUtils.ComputeCentroid(centroid_data.Value,
            m_normalize ? CentroidType.NrmL2 : CentroidType.Avg);
        m_centroids.Add(new Pair<LblT, SparseVector<double>.ReadOnly>(centroid_data.Key, centroid));
    }
}
/// <summary>Creates a centroid over <c>dataset</c> with an explicitly specified
/// dense vector length (skips the length scan performed by the other constructor).</summary>
/// <param name="dataset">Examples this centroid will be computed from.</param>
/// <param name="vec_len">Length of the internal dense vector.</param>
public Centroid(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset, int vec_len)
{
    m_dataset = dataset;
    m_vec = new double[vec_len];
}
/// <summary>Explicit <c>IModel&lt;LblT&gt;</c> entry point: validates that the
/// untyped dataset actually carries sparse double vectors, then forwards to the
/// strongly-typed <c>Train</c> overload.</summary>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentTypeException">If <c>dataset</c> has the wrong example type.</exception>
void IModel<LblT>.Train(IExampleCollection<LblT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    IExampleCollection<LblT, SparseVector<double>.ReadOnly> typed_dataset
        = dataset as IExampleCollection<LblT, SparseVector<double>.ReadOnly>;
    Utils.ThrowException(typed_dataset == null ? new ArgumentTypeException("dataset") : null);
    Train(typed_dataset); // throws ArgumentValueException
}
/// <summary>Computes the weighted centroid of the cluster's items over
/// <c>dataset</c>. Each item is a (weight, dataset-index) pair; the result is the
/// weighted sum of the referenced example vectors, post-processed per
/// <c>type</c> (raw sum, weighted average, or L2-normalized).</summary>
/// <param name="dataset">Dataset the item indices refer to.</param>
/// <param name="type">How to normalize the accumulated sum.</param>
/// <returns>The centroid as a sorted sparse vector.</returns>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="IndexOutOfRangeException">If an item index is outside the dataset.</exception>
/// <exception cref="ArgumentValueException">If the item weights sum to zero.</exception>
/// <exception cref="InvalidOperationException">If <c>NrmL2</c> is requested and the sum is the zero vector.</exception>
public SparseVector<double>.ReadOnly ComputeCentroid<LblT>(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset, CentroidType type)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Dictionary<int, double> tmp = new Dictionary<int, double>();
    double wgt_sum = 0;
    // accumulate weight * value per feature index over all referenced examples
    foreach (Pair<double, int> wgt_vec in m_items)
    {
        Utils.ThrowException((wgt_vec.Second < 0 || wgt_vec.Second >= dataset.Count)
            ? new IndexOutOfRangeException("Items (dataset index)") : null);
        foreach (IdxDat<double> item in dataset[wgt_vec.Second].Example)
        {
            // TryGetValue avoids the ContainsKey + indexer double lookup of the original
            double sum;
            if (tmp.TryGetValue(item.Idx, out sum))
            {
                tmp[item.Idx] = sum + wgt_vec.First * item.Dat;
            }
            else
            {
                tmp.Add(item.Idx, wgt_vec.First * item.Dat);
            }
        }
        wgt_sum += wgt_vec.First;
    }
    Utils.ThrowException(wgt_sum == 0 ? new ArgumentValueException("Items (weights)") : null);
    SparseVector<double> centroid = new SparseVector<double>();
    switch (type)
    {
        case CentroidType.Sum:
            // raw weighted sum
            foreach (KeyValuePair<int, double> item in tmp)
            {
                centroid.InnerIdx.Add(item.Key);
                centroid.InnerDat.Add(item.Value);
            }
            break;
        case CentroidType.Avg:
            // divide by the total weight
            foreach (KeyValuePair<int, double> item in tmp)
            {
                centroid.InnerIdx.Add(item.Key);
                centroid.InnerDat.Add(item.Value / wgt_sum);
            }
            break;
        case CentroidType.NrmL2:
            // scale to unit Euclidean length
            double vec_len = 0;
            foreach (KeyValuePair<int, double> item in tmp)
            {
                vec_len += item.Value * item.Value;
            }
            Utils.ThrowException(vec_len == 0 ? new InvalidOperationException() : null);
            vec_len = Math.Sqrt(vec_len);
            foreach (KeyValuePair<int, double> item in tmp)
            {
                centroid.InnerIdx.Add(item.Key);
                centroid.InnerDat.Add(item.Value / vec_len);
            }
            break;
    }
    centroid.Sort(); // dictionary enumeration order is arbitrary; indices must be sorted
    return (centroid);
}
/// <summary>Runs k-means clustering over <c>dataset</c>: <c>m_trials</c> independent
/// restarts, each with randomized seed selection (best of 3 candidate seed sets by
/// lowest average pairwise similarity) followed by iterative assign/recompute until
/// the quality gain drops to <c>m_eps</c> or below. Returns the clustering of the
/// best trial.</summary>
/// <param name="dataset">Examples to cluster; must be non-null and contain at least <c>m_k</c> items.</param>
/// <returns>The best clustering found across all trials.</returns>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentValueException">If <c>dataset</c> has fewer than <c>m_k</c> items.</exception>
public ClusteringResult Cluster(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count < m_k ? new ArgumentValueException("dataset") : null);
    ClusteringResult clustering = null;
    ClusteringResult best_clustering = null;
    double global_best_clust_qual = 0;
    for (int trial = 1; trial <= m_trials; trial++)
    {
        Utils.VerboseLine("*** CLUSTERING TRIAL {0} OF {1} ***", trial, m_trials);
        ArrayList<SparseVector<double>.ReadOnly> centroids = null;
        clustering = new ClusteringResult();
        for (int i = 0; i < m_k; i++)
        {
            clustering.Roots.Add(new Cluster());
        }
        // select seed items: try 3 random seed sets and keep the most mutually
        // dissimilar one (lowest average pairwise similarity)
        double min_sim = double.MaxValue;
        ArrayList<int> tmp = new ArrayList<int>(dataset.Count);
        for (int i = 0; i < dataset.Count; i++)
        {
            tmp.Add(i);
        }
        for (int k = 0; k < 3; k++)
        {
            ArrayList<SparseVector<double>.ReadOnly> seeds = new ArrayList<SparseVector<double>.ReadOnly>(m_k);
            tmp.Shuffle(m_rnd);
            // the first m_k shuffled indices become the candidate seeds
            for (int i = 0; i < m_k; i++)
            {
                seeds.Add(ModelUtils.ComputeCentroid(new SparseVector<double>.ReadOnly[] { dataset[tmp[i]].Example }, m_centroid_type));
            }
            // assess quality of seed items: average similarity over all ordered
            // pairs of distinct seeds (m_k^2 - m_k pairs)
            double sim_avg = 0;
            foreach (SparseVector<double>.ReadOnly seed_1 in seeds)
            {
                foreach (SparseVector<double>.ReadOnly seed_2 in seeds)
                {
                    if (seed_1 != seed_2)
                    {
                        sim_avg += m_similarity.GetSimilarity(seed_1, seed_2);
                    }
                }
            }
            sim_avg /= (double)(m_k * m_k - m_k);
            //Console.WriteLine(sim_avg);
            if (sim_avg < min_sim)
            {
                min_sim = sim_avg;
                centroids = seeds;
            }
        }
        // main loop: assign each item to its most similar centroid, then recompute
        int iter = 0;
        double best_clust_qual = 0;
        double clust_qual;
        while (true)
        {
            iter++;
            clust_qual = 0;
            // assign items to clusters
            foreach (Cluster cluster in clustering.Roots)
            {
                cluster.Items.Clear();
            }
            for (int i = 0; i < dataset.Count; i++)
            {
                SparseVector<double>.ReadOnly example = dataset[i].Example;
                double max_sim = double.MinValue;
                // ties for best centroid are collected and broken at random
                ArrayList<int> candidates = new ArrayList<int>();
                for (int j = 0; j < m_k; j++)
                {
                    SparseVector<double>.ReadOnly centroid = centroids[j];
                    double sim = m_similarity.GetSimilarity(example, centroid);
                    if (sim > max_sim)
                    {
                        max_sim = sim;
                        candidates.Clear();
                        candidates.Add(j);
                    }
                    else if (sim == max_sim)
                    {
                        candidates.Add(j);
                    }
                }
                if (candidates.Count > 1)
                {
                    candidates.Shuffle(m_rnd);
                }
                if (candidates.Count > 0) // *** is this always true?
                {
                    clustering.Roots[candidates[0]].Items.Add(new Pair<double, int>(1, i));
                    clust_qual += max_sim; // quality = mean best-similarity over items
                }
            }
            clust_qual /= (double)dataset.Count;
            Utils.VerboseLine("*** Iteration {0} ***", iter);
            Utils.VerboseLine("Quality: {0:0.0000}", clust_qual);
            // check if done: stop once the quality gain over the previous
            // iteration is at most m_eps (never on the first iteration)
            if (iter > 1 && clust_qual - best_clust_qual <= m_eps)
            {
                break;
            }
            best_clust_qual = clust_qual;
            // compute new centroids from the current assignment
            for (int i = 0; i < m_k; i++)
            {
                centroids[i] = clustering.Roots[i].ComputeCentroid(dataset, m_centroid_type);
            }
        }
        // keep the best trial by final quality
        if (trial == 1 || clust_qual > global_best_clust_qual)
        {
            global_best_clust_qual = clust_qual;
            best_clustering = clustering;
        }
    }
    return (best_clustering);
}
/// <summary>Explicit <c>IClustering&lt;LblT&gt;</c> entry point: validates that the
/// untyped dataset actually carries sparse double vectors, then forwards to the
/// strongly-typed <c>Cluster</c> overload.</summary>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentTypeException">If <c>dataset</c> has the wrong example type.</exception>
ClusteringResult IClustering<LblT>.Cluster(IExampleCollection<LblT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    IExampleCollection<LblT, SparseVector<double>.ReadOnly> typed_dataset
        = dataset as IExampleCollection<LblT, SparseVector<double>.ReadOnly>;
    Utils.ThrowException(typed_dataset == null ? new ArgumentTypeException("dataset") : null);
    return Cluster(typed_dataset); // throws ArgumentValueException
}
/// <summary>Trains this lazy learner: no model is built here, the labeled examples
/// are simply copied and retained for use at prediction time.</summary>
/// <param name="dataset">Labeled training examples; must be non-null and non-empty.</param>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentValueException">If <c>dataset</c> is empty.</exception>
public void Train(IExampleCollection<LblT, ExT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    m_examples = new ArrayList<LabeledExample<LblT, ExT>>(dataset);
}
/// <summary>Runs Generalized Iterative Scaling (GIS) for maximum-entropy training
/// and returns the lambda (feature weight) matrix, one row per class. The
/// observation matrix is loaded from <c>mtx_file_name</c> when that file exists,
/// otherwise computed and (optionally) cached back to it.</summary>
/// <param name="dataset">Labeled training examples.</param>
/// <param name="cut_off">If &gt; 0, features with low observed counts are removed.</param>
/// <param name="num_iter">Number of GIS iterations.</param>
/// <param name="clear_dataset">Passed to <c>TransposeDataset</c>; if true, example vectors are cleared to save memory.</param>
/// <param name="mtx_file_name">Optional observation-matrix cache file (may be null).</param>
/// <param name="idx_to_lbl">Receives the row-index-to-label mapping.</param>
/// <param name="num_threads">If &gt; 1, expectation updates run multi-threaded.</param>
/// <returns>The trained lambda matrix.</returns>
public static SparseMatrix<double> Gis<LblT>(IExampleCollection<LblT, BinaryVector<int>.ReadOnly> dataset, int cut_off, int num_iter, bool clear_dataset, string mtx_file_name, ref LblT[] idx_to_lbl, int num_threads)
{
    Utils.VerboseLine("Creating observation matrix ...");
    SparseMatrix<double> observations = null;
    if (Utils.VerifyFileNameOpen(mtx_file_name))
    {
        // cache hit: load labels and observation matrix from the file
        BinarySerializer reader = new BinarySerializer(mtx_file_name, FileMode.Open);
        idx_to_lbl = new ArrayList<LblT>(reader).ToArray();
        observations = new SparseMatrix<double>(reader);
        reader.Close();
    }
    else
    {
        observations = CreateObservationMatrix2(dataset, ref idx_to_lbl);
        //SparseMatrix<double> test = CreateObservationMatrix(dataset, ref idx_to_lbl);
        //Console.WriteLine(test.ContentEquals(observations));
        if (Utils.VerifyFileNameCreate(mtx_file_name))
        {
            // cache the computed matrix for subsequent runs
            BinarySerializer writer = new BinarySerializer(mtx_file_name, FileMode.Create);
            new ArrayList<LblT>(idx_to_lbl).Save(writer);
            observations.Save(writer);
            writer.Close();
        }
    }
    int num_classes = observations.GetLastNonEmptyRowIdx() + 1;
    int num_examples = dataset.Count;
    if (cut_off > 0)
    {
        Utils.VerboseLine("Performing cut-off ...");
        observations = CutOff(observations, cut_off);
    }
    Utils.VerboseLine("Preparing structures ...");
    // lambda and expectations share the sparsity structure of the observations
    SparseMatrix<double> lambda = CopyStructure(observations);
    SparseMatrix<double> expectations = CopyStructure(observations);
    // f = GIS correction constant (max active features in any example)
    double f = GisFindMaxF(dataset);
    SparseMatrix<double> train_mtx_tr = TransposeDataset(dataset, clear_dataset);
    Utils.VerboseLine("Entering main loop ...");
    for (int i = 0; i < num_iter; i++)
    {
        Utils.VerboseLine("Iteration {0} / {1} ...", i + 1, num_iter);
        Utils.VerboseLine("Updating expectations ...");
        if (num_threads > 1)
        {
            UpdateExpectationMatrix(num_classes, num_examples, train_mtx_tr, lambda, expectations, num_threads);
        }
        else
        {
            UpdateExpectationMatrix(num_classes, num_examples, train_mtx_tr, lambda, expectations);
        }
        Utils.VerboseLine("Updating lambdas ...");
        GisUpdate(lambda, expectations, observations, f);
        //SaveForMatlab(expectations, "c:\\mec\\old\\expem.txt");
        // expectations are recomputed from scratch each iteration
        Reset(expectations);
    }
    //SaveForMatlab(lambda, "c:\\mec\\old\\lamem.txt");
    //SaveForMatlab(observations, "c:\\mec\\old\\obsem.txt");
    return (lambda);
}
/// <summary>Trains a centroid classifier with perceptron-style refinement: initial
/// centroids are per-label sums of the examples; then for <c>m_iterations</c>
/// rounds, each misclassified example is subtracted (scaled by the current learn
/// rate) from the wrongly assigned centroid and added to its true centroid, with
/// the learn rate decayed by <c>m_damping</c> after every round.</summary>
/// <param name="dataset">Labeled training examples; must be non-null and non-empty.</param>
/// <exception cref="ArgumentNullException">If <c>dataset</c> is null.</exception>
/// <exception cref="ArgumentValueException">If <c>dataset</c> is empty.</exception>
public void Train(IExampleCollection<LblT, SparseVector<double>.ReadOnly> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    m_centroids = new Dictionary<LblT, CentroidData>();
    // initialize one centroid per label as the sum of that label's examples
    foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
    {
        if (!m_centroids.ContainsKey(labeled_example.Label))
        {
            CentroidData centroid_data = new CentroidData();
            centroid_data.AddToSum(labeled_example.Example);
            m_centroids.Add(labeled_example.Label, centroid_data);
        }
        else
        {
            CentroidData centroid_data = m_centroids[labeled_example.Label];
            centroid_data.AddToSum(labeled_example.Example);
        }
    }
    foreach (CentroidData vec_data in m_centroids.Values)
    {
        vec_data.UpdateCentroidLen();
    }
    double learn_rate = 1;
    for (int iter = 1; iter <= m_iterations; iter++)
    {
        Utils.VerboseLine("Iteration {0} / {1} ...", iter, m_iterations);
        // classify training documents
        int i = 0;
        int num_miscfy = 0;
        foreach (LabeledExample<LblT, SparseVector<double>.ReadOnly> labeled_example in dataset)
        {
            Utils.Verbose("\rExample {0} / {1} ...", ++i, dataset.Count);
            double max_sim = double.MinValue;
            CentroidData assigned_centroid = null; // most similar centroid (prediction)
            CentroidData actual_centroid = null;   // centroid of the true label
            SparseVector<double>.ReadOnly vec = labeled_example.Example;
            foreach (KeyValuePair<LblT, CentroidData> labeled_centroid in m_centroids)
            {
                double sim = labeled_centroid.Value.GetSimilarity(vec);
                if (sim > max_sim)
                {
                    max_sim = sim;
                    assigned_centroid = labeled_centroid.Value;
                }
                if (labeled_centroid.Key.Equals(labeled_example.Label))
                {
                    actual_centroid = labeled_centroid.Value;
                }
            }
            // on misclassification, move the true centroid toward the example and
            // the wrongly chosen centroid away from it (updates are buffered in
            // AddToDiff and applied below in UpdateCentroid)
            if (assigned_centroid != actual_centroid)
            {
                assigned_centroid.AddToDiff(-learn_rate, vec);
                actual_centroid.AddToDiff(learn_rate, vec);
                num_miscfy++;
            }
        }
        Utils.VerboseLine("");
        Utils.VerboseLine("Training set error rate: {0:0.00}%", (double)num_miscfy / (double)dataset.Count * 100.0);
        // update centroids
        i = 0;
        foreach (CentroidData centroid_data in m_centroids.Values)
        {
            Utils.Verbose("\rCentroid {0} / {1} ...", ++i, m_centroids.Count);
            centroid_data.UpdateCentroid(m_positive_values_only);
            centroid_data.UpdateCentroidLen();
        }
        Utils.VerboseLine("");
        learn_rate *= m_damping; // decay the learning rate each round
    }
}