static void Main(string[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);
    // Create a tokenizer.
    UnicodeTokenizer tokenizer = new UnicodeTokenizer();
    tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
    tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.
    // Load a document corpus from a file. Each line in the file represents one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");
    // Create a bag-of-words space.
    BowSpace bowSpc = new BowSpace();
    bowSpc.Tokenizer = tokenizer;   // Assign the tokenizer.
    bowSpc.StopWords = stopWords;   // Assign the stop words.
    bowSpc.Stemmer = stemmer;       // Assign the stemmer.
    bowSpc.MinWordFreq = 3;         // A term must appear at least 3 times in the corpus
                                    // for it to be part of the vocabulary.
    bowSpc.MaxNGramLen = 3;         // Terms consisting of at most 3 consecutive words
                                    // will be considered.
    bowSpc.WordWeightType = WordWeightType.TfIdf; // Set the weighting scheme for the
                                                  // bag-of-words vectors to TF-IDF.
    bowSpc.NormalizeVectors = true; // The TF-IDF vectors will be normalized.
    bowSpc.CutLowWeightsPerc = 0.2; // The terms with the lowest weights, summing up to 20%
                                    // of the overall weight sum, will be removed from each
                                    // TF-IDF vector.
    bowSpc.Initialize(docs);        // Initialize the BOW space.
    // Compute 100 clusters of documents.
    KMeansFast kMeans = new KMeansFast(100); // Set k to 100.
    kMeans.Trials = 3;  // Perform 3 repetitions and take the best result.
    kMeans.Eps = 0.001; // Stop iterating when the partition quality improves by less than 0.001.
    ClusteringResult cr = kMeans.Cluster(bowSpc); // Execute.
    // Extract the top 5 terms with the highest TF-IDF weights from each cluster's centroid
    // and output the number of documents (companies) in each cluster.
    foreach (Cluster cl in cr.Roots)
    {
        SparseVector<double>.ReadOnly centroid = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
        Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
        Console.WriteLine(" ({0} companies)", cl.Items.Count);
    }
    // Output the documents that are contained in the first cluster.
    foreach (int docIdx in cr.Roots[0].Items)
    {
        Console.WriteLine(docs[docIdx]);
    }
}
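The heavy lifting above is done inside BowSpace. As a rough illustration of what the configured weighting scheme computes, the following standalone sketch builds TF-IDF vectors with L2 normalization over pre-tokenized documents. This is not LATINO code; the class and method names (TfIdfSketch, ComputeVectors) are hypothetical, and the sketch omits the n-gram extraction, stop-word removal, and low-weight cutting that BowSpace also performs.

using System;
using System.Collections.Generic;
using System.Linq;

// Hypothetical standalone sketch of TF-IDF weighting with L2 normalization,
// approximating what BowSpace is configured to do above. Not LATINO code.
static class TfIdfSketch
{
    public static List<Dictionary<string, double>> ComputeVectors(string[][] docs)
    {
        int n = docs.Length;
        // Document frequency: in how many documents each term appears.
        var df = new Dictionary<string, int>();
        foreach (string[] doc in docs)
        {
            foreach (string term in doc.Distinct()) { df[term] = df.GetValueOrDefault(term) + 1; }
        }
        var vectors = new List<Dictionary<string, double>>();
        foreach (string[] doc in docs)
        {
            var vec = new Dictionary<string, double>();
            foreach (string term in doc) { vec[term] = vec.GetValueOrDefault(term) + 1; } // raw TF
            foreach (string term in vec.Keys.ToList())
            {
                vec[term] *= Math.Log((double)n / df[term]); // TF * IDF
            }
            // L2-normalize so dot products between vectors are cosine similarities.
            double norm = Math.Sqrt(vec.Values.Sum(w => w * w));
            if (norm > 0) { foreach (string term in vec.Keys.ToList()) { vec[term] /= norm; } }
            vectors.Add(vec);
        }
        return vectors;
    }
}

The normalization step is what makes the centroid-based keyword extraction in the listing meaningful: with unit-length vectors, k-means with dot-product similarity behaves like spherical (cosine) k-means, and the largest centroid components correspond to the cluster's most characteristic terms.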
public Vector2D[] ComputeLayout(LayoutSettings settings)
{
    UnlabeledDataset<SparseVector<double>> dataset = new UnlabeledDataset<SparseVector<double>>(mDataset);
    // Clustering.
    mLogger.Info("ComputeLayout", "Clustering ...");
    KMeansFast kMeans = new KMeansFast(mKClust);
    kMeans.Eps = mKMeansEps;
    kMeans.Random = mRandom;
    kMeans.Trials = 1;
    ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException
    // Determine the reference instances (one centroid per cluster).
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    foreach (Cluster cluster in clustering.Roots)
    {
        SparseVector<double> centroid = cluster.Items.Count > 0
            ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2)
            : new SparseVector<double>();
        dsRefInst.Add(centroid); // dataset of reference instances
        dataset.Add(centroid);   // add the centroids to the main dataset
    }
    // Position the reference instances.
    mLogger.Info("ComputeLayout", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    Vector2D[] centrPos = sm.ComputeLayout();
    // k-NN: build one linear equation per instance from its nearest neighbors.
    mLogger.Info("ComputeLayout", "Computing similarities ...");
    simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/true);
    mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
    {
        if (simMtxRow.Dat.Count <= 1)
        {
            mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
        }
        ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
        foreach (IdxDat<double> item in simMtxRow.Dat)
        {
            if (item.Idx != simMtxRow.Idx)
            {
                knn.Add(new KeyDat<double, int>(item.Dat, item.Idx));
            }
        }
        knn.Sort(DescSort<KeyDat<double, int>>.Instance);
        int count = Math.Min(knn.Count, mKNN);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(knn[i].Dat);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only the indices
        eq[simMtxRow.Idx] = 1;
        lsqrDs.Add(0, eq);
    }
    // Pin the reference instances to the positions computed by stress majorization.
    Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
    for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(centrPos[j].X, eq);
    }
    // Solve the system once for the x-coordinates ...
    LSqrModel lsqr = new LSqrModel();
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].X = lsqr.Solution[i];
    }
    // ... and once more, with relabeled right-hand sides, for the y-coordinates.
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
    {
        lsqrDs[i].Label = centrPos[j].Y;
    }
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++)
    {
        layout[i].Y = lsqr.Solution[i];
    }
    return settings == null ? layout : settings.AdjustLayout(layout);
}
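The core idea of ComputeLayout is that each equation constrains a point to lie at the average of its k nearest neighbors (x_i - (1/k) * sum of neighbor coordinates = 0), while the cluster centroids act as anchors pinned to the positions found by stress majorization; LSQR then solves this sparse system once per coordinate axis. The sketch below illustrates the same anchored neighbor-averaging system, but swaps LSQR for a simple Gauss-Seidel-style iteration to stay self-contained. It is not LATINO code; all names (NeighborLayoutSketch, Solve, neighbors, anchorPos) are hypothetical.

using System;

// Hypothetical sketch: place every free point at the mean of its nearest
// neighbors while anchor points stay fixed. ComputeLayout solves the
// equivalent sparse linear system with LSQR; here we iterate instead.
static class NeighborLayoutSketch
{
    // neighbors[i] : indices of point i's nearest neighbors
    // anchorPos[i] : fixed position for anchors, null for free points
    public static (double X, double Y)[] Solve(int[][] neighbors, (double X, double Y)?[] anchorPos, int iters = 200)
    {
        int n = neighbors.Length;
        var pos = new (double X, double Y)[n];
        for (int i = 0; i < n; i++) { if (anchorPos[i] != null) { pos[i] = anchorPos[i].Value; } }
        for (int it = 0; it < iters; it++)
        {
            for (int i = 0; i < n; i++)
            {
                // Anchors stay fixed; skip points with no neighborhood.
                if (anchorPos[i] != null || neighbors[i].Length == 0) { continue; }
                double x = 0, y = 0;
                foreach (int j in neighbors[i]) { x += pos[j].X; y += pos[j].Y; }
                pos[i] = (x / neighbors[i].Length, y / neighbors[i].Length); // x_i = mean of neighbors
            }
        }
        return pos;
    }
}

Because similar instances share neighbors, this pulls similar documents toward the same region of the plane, while the anchored centroids keep the global shape of the map determined by the stress-majorization step.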