public static LabeledDataset<SentimentLabel, SparseVector<double>> InitBowSpace(BowSpace bowSpace,
    IEnumerable<LabeledExample<SentimentLabel, string>> labeledExamples, IEnumerable<string> initExamples = null)
{
    LabeledExample<SentimentLabel, string>[] examples =
        labeledExamples as LabeledExample<SentimentLabel, string>[] ?? labeledExamples.ToArray();
    List<SparseVector<double>> bowData;
    if (initExamples != null)
    {
        Preconditions.CheckArgument(!(bowSpace is DeltaBowSpace<SentimentLabel>));
        bowSpace.Initialize(initExamples);
        bowData = examples.Select(le => bowSpace.ProcessDocument(le.Example)).ToList();
    }
    else
    {
        bowData = bowSpace is DeltaBowSpace<SentimentLabel>
            ? ((DeltaBowSpace<SentimentLabel>)bowSpace).Initialize(new LabeledDataset<SentimentLabel, string>(examples))
            : bowSpace.Initialize(examples.Select(d => d.Example));
    }
    var bowDataset = new LabeledDataset<SentimentLabel, SparseVector<double>>();
    for (int i = 0; i < bowData.Count; i++)
    {
        bowDataset.Add(examples[i].Label, bowData[i]);
    }
    return bowDataset;
}
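// Illustrative usage sketch only (not part of the original code): "labeledTexts" is a hypothetical
// IEnumerable<LabeledExample<SentimentLabel, string>>, and the BowSpace is created with default
// settings; InitBowSpace itself is the method defined above.
public static LabeledDataset<SentimentLabel, SparseVector<double>> ExampleInitBowSpaceUsage(
    IEnumerable<LabeledExample<SentimentLabel, string>> labeledTexts)
{
    var bowSpace = new BowSpace();              // default bag-of-words settings
    return InitBowSpace(bowSpace, labeledTexts); // vectors are built from the labeled texts themselves
}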
public override void Train(ILabeledExampleCollection<SentimentLabel, SparseVector<double>> dataset)
{
    Preconditions.CheckState(BowSpace != null);
    var replDataset = new LabeledDataset<SentimentLabel, SparseVector<double>>();
    foreach (LabeledExample<SentimentLabel, SparseVector<double>> le in dataset)
    {
        SparseVector<double> vector1, vector2;
        Replicate(le.Example, out vector1, out vector2);
        // every example contributes two training vectors; neutral examples are replicated
        // once as negative and once as positive
        replDataset.Add(new LabeledExample<SentimentLabel, SparseVector<double>>(
            le.Label == SentimentLabel.Neutral ? SentimentLabel.Negative : le.Label, vector1));
        replDataset.Add(new LabeledExample<SentimentLabel, SparseVector<double>>(
            le.Label == SentimentLabel.Neutral ? SentimentLabel.Positive : le.Label, vector2));
    }
    mClassifier = CreateModel();
    mClassifier.Train(replDataset);
    IsTrained = true;
}
static LabeledDataset<string, SparseVector<double>> CreateSingleFeatureDataset(
    LabeledDataset<BlogMetaData, SparseVector<double>> srcDataset, ClassType classType, int fIdx)
{
    SparseVector<double> minValues, maxValues;
    GetExtremes(srcDataset, out minValues, out maxValues);
    LabeledDataset<string, SparseVector<double>> dataset = new LabeledDataset<string, SparseVector<double>>();
    ((IEnumerable<LabeledExample<BlogMetaData, SparseVector<double>>>)srcDataset).ToList()
        .ForEach(x => dataset.Add(new LabeledExample<string, SparseVector<double>>(
            AnalysisUtils.GetLabel(x.Label, classType),
            new SparseVector<double>(new double[] {
                (x.Example[fIdx] - minValues[fIdx]) / (maxValues[fIdx] - minValues[fIdx]) // simple min-max normalization
            }))));
    return dataset;
}
private static LabeledDataset<int, int> NewData(int[,] labelCounts, bool sortShuffled = false)
{
    var result = new LabeledDataset<int, int>();
    for (int i = 0, k = 1; i <= labelCounts.GetUpperBound(0); i++)
    {
        int label = labelCounts[i, 0], count = labelCounts[i, 1];
        for (int j = 0; j < count; j++)
        {
            result.Add(label, k++);
        }
    }
    if (sortShuffled)
    {
        result.GroupLabels(true);
    }
    return result;
}
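// Illustrative usage sketch only (not part of the original code): shows the expected shape of the
// labelCounts matrix consumed by NewData. Each row is { label, count }; the call below builds a
// dataset with three examples labeled 1 and two examples labeled 2, with example values 1..5.
private static LabeledDataset<int, int> ExampleNewDataUsage()
{
    return NewData(new int[,] { { 1, 3 }, { 2, 2 } }, /*sortShuffled=*/true);
}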
protected override ILabeledDataset<LblT, SparseVector<double>> MapTrainSet(int foldN, ILabeledDataset<LblT, string> trainSet)
{
    BowSpace bowSpace;
    Preconditions.CheckState(!mFoldBowSpaces.TryGetValue(foldN, out bowSpace));
    Preconditions.CheckState(mFoldBowSpaces.TryAdd(foldN, bowSpace = BowSpaceFunc()));
    List<SparseVector<double>> bowData = bowSpace is DeltaBowSpace<LblT>
        ? ((DeltaBowSpace<LblT>)bowSpace).Initialize(trainSet)
        : bowSpace.Initialize(trainSet.Select(d => d.Example));
    var bowDataset = new LabeledDataset<LblT, SparseVector<double>>();
    for (int i = 0; i < bowData.Count; i++)
    {
        bowDataset.Add(trainSet[i].Label, bowData[i]);
    }
    return bowDataset;
}
public void TrainModels(IEnumerable<Author> authors)
{
    // one model per author, trained on the texts of all other untagged authors
    foreach (Author author in authors)
    {
        LabeledDataset<string, SparseVector<double>> ds = new LabeledDataset<string, SparseVector<double>>();
        foreach (Author otherAuthor in authors)
        {
            if (otherAuthor != author && !otherAuthor.mIsTagged)
            {
                foreach (Text text in otherAuthor.mTexts)
                {
                    ds.Add(new LabeledExample<string, SparseVector<double>>(otherAuthor.mName, text.mFeatureVectors[mSelector]));
                }
            }
        }
        SvmMulticlassClassifier<string> model = new SvmMulticlassClassifier<string>();
        model.C = Convert.ToDouble(Utils.GetConfigValue("SvmMultiClassC", "5000"));
        model.Train(ds);
        mModels.Add(author.mName, model);
    }
}
public void Train(ILabeledExampleCollection<LblT, string> dataset)
{
    Preconditions.CheckState(!IsTrained);
    Preconditions.CheckNotNull(dataset);
    Preconditions.CheckNotNull(BowSpace);
    Preconditions.CheckNotNull(FeatureProcessor);
    Preconditions.CheckNotNull(Model);

    // preprocess the text
    foreach (LabeledExample<LblT, string> le in dataset)
    {
        le.Example = FeatureProcessor.Run(le.Example);
    }

    // bow vectors
    List<SparseVector<double>> bowData = BowSpace is DeltaBowSpace<LblT>
        ? (BowSpace as DeltaBowSpace<LblT>).Initialize(dataset as ILabeledDataset<LblT, string> ?? new LabeledDataset<LblT, string>(dataset))
        : BowSpace.Initialize(dataset.Select(d => d.Example));
    var bowDataset = new LabeledDataset<LblT, SparseVector<double>>();
    for (int i = 0; i < bowData.Count; i++)
    {
        bowDataset.Add(dataset[i].Label, bowData[i]);
    }

    // train
    if (OnTrainModel == null)
    {
        Model.Train(bowDataset);
    }
    else
    {
        OnTrainModel(this, bowDataset);
    }
    IsTrained = true;
}
public override void Run(object[] args)
{
    // prepare data
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // create a tokenizer
    var tokenizer = new UnicodeTokenizer
    {
        MinTokenLen = 2,                     // each token must be at least 2 characters long
        Filter = TokenizerFilter.AlphaStrict // tokens can consist of alphabetic characters only
    };

    // take the data for the two classes from the CSV file
    var data = new List<LabeledTweet>(GetLabeledTweets().Where(lt => lt.Polarity != 2)).ToList();

    // create a bag-of-words space
    var bowSpc = new BowSpace
    {
        Tokenizer = tokenizer,   // assign the tokenizer
        StopWords = stopWords,   // assign the stop words
        Stemmer = stemmer,       // assign the stemmer
        MinWordFreq = 1,         // a term must appear at least n times in the corpus to become part of the vocabulary
        MaxNGramLen = 2,         // terms consisting of at most n consecutive words are considered
        WordWeightType = WordWeightType.TermFreq, // set the weighting scheme for the bag-of-words vectors to TF
        //WordWeightType = WordWeightType.TfIdf,  // set the weighting scheme for the bag-of-words vectors to TF-IDF
        NormalizeVectors = true, // the bag-of-words vectors will be normalized
        CutLowWeightsPerc = 0    // no low-weight cutting (e.g. 0.2 would remove the lowest-weighted terms summing up to 20% of each vector's weight)
    };
    ArrayList<SparseVector<double>> bowData = bowSpc.Initialize(data.Select(d => d.Text));

    // label the data
    var labeledSet = new LabeledDataset<string, SparseVector<double>>();
    for (int i = 0; i < data.Count; i++)
    {
        labeledSet.Add(data[i].Label, bowData[i]);
    }
    labeledSet.Shuffle();
    int testSize = labeledSet.Count / 10;
    var trainingSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Skip(testSize));
    var testSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Take(testSize));

    //-------------------- SVM

    var svmBinClass = new SvmBinaryClassifier<string> { VerbosityLevel = SvmLightVerbosityLevel.Off };
    if (args.Any())
    {
        svmBinClass.C = (int)args[0];
    }
    //svmBinClass.BiasedHyperplane = true;
    //svmBinClass.CustomParams = "-t 3";   // non-linear kernel
    //svmBinClass.CustomParams = String.Format("-j {0}", j);

    svmBinClass.Train(trainingSet);

    int correct = 0;
    double avgDist = 0;
    foreach (LabeledExample<string, SparseVector<double>> labeledExample in testSet)
    {
        var prediction = svmBinClass.Predict(labeledExample.Example);
        //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
        avgDist += prediction.BestScore;
        if (prediction.BestClassLabel == labeledExample.Label) { correct++; }
    }

    Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
    Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

    Result.Add("accuracy", (double)correct / testSet.Count);
    Result.Add("classifier", svmBinClass);
    Result.Add("labeled_data", labeledSet);
}
public Vector2D[] ComputeLayout(LayoutSettings settings)
{
    UnlabeledDataset<SparseVector<double>> dataset = new UnlabeledDataset<SparseVector<double>>(mDataset);

    // clustering
    mLogger.Info("ComputeLayout", "Clustering ...");
    KMeansFast kMeans = new KMeansFast(mKClust);
    kMeans.Eps = mKMeansEps;
    kMeans.Random = mRandom;
    kMeans.Trials = 1;
    ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException

    // determine reference instances
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    foreach (Cluster cluster in clustering.Roots)
    {
        SparseVector<double> centroid = cluster.Items.Count > 0
            ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2)
            : new SparseVector<double>();
        dsRefInst.Add(centroid); // dataset of reference instances
        dataset.Add(centroid);   // add centroids to the main dataset
    }

    // position reference instances
    mLogger.Info("ComputeLayout", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    Vector2D[] centrPos = sm.ComputeLayout();

    // k-NN
    mLogger.Info("ComputeLayout", "Computing similarities ...");
    simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/true);
    mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
    {
        if (simMtxRow.Dat.Count <= 1)
        {
            mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
        }
        ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
        foreach (IdxDat<double> item in simMtxRow.Dat)
        {
            if (item.Idx != simMtxRow.Idx) { knn.Add(new KeyDat<double, int>(item.Dat, item.Idx)); }
        }
        knn.Sort(DescSort<KeyDat<double, int>>.Instance);
        int count = Math.Min(knn.Count, mKNN);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(knn[i].Dat);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[simMtxRow.Idx] = 1;
        lsqrDs.Add(0, eq);
    }
    Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
    for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(centrPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++) { layout[i].X = lsqr.Solution[i]; }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++) { lsqrDs[i].Label = centrPos[j].Y; }
    lsqr.Train(lsqrDs);
    for (int i = 0; i < layout.Length; i++) { layout[i].Y = lsqr.Solution[i]; }
    return settings == null ? layout : settings.AdjustLayout(layout);
}
// TODO: exceptions
public Vector2D[] Update(int numDequeue, IEnumerable<SparseVector<double>> newInst, bool test, LayoutSettings settings, ref PtInfo[] ptInfo, int _count)
{
    // clustering
    mLogger.Info("Update", "Clustering ...");
    /*prof*/ StopWatch sw = new StopWatch();
    mKMeans.Eps = mKMeansEps;
    int iter = 0;
    mKMeans.Update(numDequeue, newInst, ref iter);
    /*prof*/ sw.Save("cl.txt", _count, iter.ToString());

    // determine reference instances
    /*prof*/ sw.Reset();
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    UnlabeledDataset<SparseVector<double>> dsNewInst = new UnlabeledDataset<SparseVector<double>>(newInst);
    foreach (SparseVector<double> centroid in mKMeans.GetCentroids())
    {
        dsRefInst.Add(centroid); // dataset of reference instances
        dsNewInst.Add(centroid); // dataset of new instances
    }

    // position reference instances
    mLogger.Info("Update", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    sm.MaxSteps = int.MaxValue;
    sm.MinDiff = 1E-3;
    mRefPos = sm.ComputeLayout(/*settings=*/null, mRefPos /*make this a property!!!*/);
    /*prof*/ sw.Save("sm.txt", _count);

    // k-NN
    /*prof*/ sw.Reset();
    DateTime t = DateTime.Now;
    mLogger.Info("Update", "Computing similarities ...");
    // update list of neighborhoods
    mPatches.RemoveRange(mDataset.Count - mKClust, mKClust);
    mPatches.RemoveRange(0, numDequeue);
    // remove instances from [dataset and] neighborhoods
    foreach (Patch patch in mPatches)
    {
        if (patch.Min != null && (patch.Min.Idx < numDequeue || patch.Max.Idx >= mDataset.Count - mKClust))
        {
            int oldCount = patch.List.Count;
            ArrayList<KeyDat<double, Patch>> tmp = new ArrayList<KeyDat<double, Patch>>();
            foreach (KeyDat<double, Patch> item in patch.List)
            {
                if (item.Dat.Idx >= numDequeue && item.Dat.Idx < mDataset.Count - mKClust) { tmp.Add(item); }
                //else { Console.WriteLine("Remove {0}", item.Dat.Idx - numDequeue); }
            }
            patch.List = tmp;
            patch.ProcessList();
            patch.NeedUpdate = patch.List.Count < mKNn && oldCount >= mKNn;
        }
    }
    // update dataset
    mDataset.RemoveRange(mDataset.Count - mKClust, mKClust);
    mDataset.RemoveRange(0, numDequeue);
    // add new instances to dataset
    int preAddCount = mDataset.Count;
    mDataset.AddRange(dsNewInst);
    // precompute transposed matrices
    SparseMatrix<double> trNewInst = ModelUtils.GetTransposedMatrix(dsNewInst);
    SparseMatrix<double> trDataset = ModelUtils.GetTransposedMatrix(mDataset);
    // add new instances to neighborhoods
    for (int i = 0; i < dsNewInst.Count; i++)
    {
        mPatches.Add(new Patch(-1));
        mPatches.Last.NeedUpdate = true;
    }
    for (int i = 0; i < mPatches.Count; i++) { mPatches[i].Idx = i; }
    for (int i = 0; i < mPatches.Count; i++)
    {
        Patch patch = mPatches[i];
        SparseVector<double> vec = mDataset[i];
        if (vec != null)
        {
            if (patch.NeedUpdate) // full update required
            {
                //if (i == 1347) { Console.WriteLine("full update"); }
                SparseVector<double> simVec = ModelUtils.GetDotProductSimilarity(trDataset, mDataset.Count, vec, mSimThresh);
                ArrayList<KeyDat<double, int>> tmp = new ArrayList<KeyDat<double, int>>();
                foreach (IdxDat<double> item in simVec)
                {
                    if (item.Idx != i) { tmp.Add(new KeyDat<double, int>(item.Dat, item.Idx)); }
                }
                tmp.Sort(new Comparer2());
                int count = Math.Min(tmp.Count, mKNnExt);
                patch.List.Clear();
                for (int j = 0; j < count; j++)
                {
                    patch.List.Add(new KeyDat<double, Patch>(tmp[j].Key, mPatches[tmp[j].Dat]));
                }
                patch.ProcessList();
                patch.NeedUpdate = false;
            }
            else // only new instances need to be considered
            {
                //if (i == 1347) { Console.WriteLine("partial update"); }
                SparseVector<double> simVec = ModelUtils.GetDotProductSimilarity(trNewInst, dsNewInst.Count, vec, mSimThresh);
                // check if further processing is needed
                bool needMerge = false;
                if (test)
                {
                    foreach (IdxDat<double> item in simVec)
                    {
                        if (item.Dat >= patch.MinSim)
                        {
                            needMerge = true;
                            //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                            break;
                        }
                    }
                }
                else
                {
                    foreach (IdxDat<double> item in simVec)
                    {
                        if (item.Dat > patch.MinSim)
                        {
                            needMerge = true;
                            //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                            break;
                        }
                    }
                }
                if (needMerge || patch.List.Count < mKNn)
                {
                    //if (i == 1347) { Console.WriteLine("merge"); }
                    int oldCount = patch.List.Count;
                    ArrayList<KeyDat<double, Patch>> tmp = new ArrayList<KeyDat<double, Patch>>();
                    foreach (IdxDat<double> item in simVec)
                    {
                        tmp.Add(new KeyDat<double, Patch>(item.Dat, mPatches[item.Idx + preAddCount]));
                    }
                    // merge the two lists
                    // TODO: speed this up
                    patch.List.AddRange(tmp);
                    patch.List.Sort(new Comparer());
                    // trim list to size
                    if (oldCount >= mKNn) { patch.List.RemoveRange(oldCount, patch.List.Count - oldCount); }
                    patch.ProcessList();
                }
            }
        }
    }
    /*prof*/ sw.Save("knn.txt", _count);

    // *** Test ***
    sw.Reset();
    ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/true);
    sw.Save("selfSim.txt", _count, mDataset.Count.ToString());
    if (test)
    {
        simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/true);
        ArrayList<Patch> patches = new ArrayList<Patch>();
        for (int i = 0; i < mDataset.Count; i++) { patches.Add(new Patch(i)); }
        foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
        {
            if (simMtxRow.Dat.Count <= 1)
            {
                mLogger.Warn("Update", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
            }
            ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
            foreach (IdxDat<double> item in simMtxRow.Dat)
            {
                if (item.Idx != simMtxRow.Idx) { knn.Add(new KeyDat<double, int>(item.Dat, item.Idx)); }
            }
            knn.Sort(new Comparer2());
            int count = Math.Min(knn.Count, mKNnExt);
            for (int i = 0; i < count; i++)
            {
                patches[simMtxRow.Idx].List.Add(new KeyDat<double, Patch>(knn[i].Key, patches[knn[i].Dat]));
            }
            patches[simMtxRow.Idx].ProcessList();
        }
        // compare
        if (patches.Count != mPatches.Count) { throw new Exception("Count mismatch."); }
        for (int i = 0; i < mPatches.Count; i++)
        {
            if (patches[i].List.Count < mKNn && patches[i].List.Count != mPatches[i].List.Count)
            {
                Console.WriteLine(mPatches[i].List.Count);
                Console.WriteLine(patches[i].List.Count);
                Output(mPatches[i].List);
                Output(patches[i].List);
                Console.WriteLine(i);
                throw new Exception("List count mismatch.");
            }
            int count = Math.Min(mPatches[i].List.Count, mKNn);
            for (int j = 0; j < count; j++)
            {
                //Console.WriteLine("{4} {0}-{1} {2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                if (mPatches[i].List[j].Key != patches[i].List[j].Key || mPatches[i].List[j].Dat.Idx != patches[i].List[j].Dat.Idx)
                {
                    Console.WriteLine("i:{4} fast:{0}-{1} slow:{2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                    int idxFast = mPatches[i].List[j].Dat.Idx;
                    int idxSlow = patches[i].List[j].Dat.Idx;
                    Console.WriteLine("slow @ fast idx: {0}", GetKey(patches[i].List, idxFast));
                    Console.WriteLine("fast @ slow idx: {0}", GetKey(mPatches[i].List, idxSlow));
                    throw new Exception("Patch item mismatch.");
                }
            }
        }
    }
    // *** End of test ***

    //Console.WriteLine("Number of patches: {0}", mPatches.Count);
    //int waka = 0;
    //foreach (Patch patch in mPatches) { waka += patch.List.Count; }
    //Console.WriteLine("Avg list size: {0}", (double)waka / (double)mPatches.Count);
    Console.WriteLine((DateTime.Now - t).TotalMilliseconds);

    /*prof*/ sw.Reset();
    mLogger.Info("Update", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
    foreach (Patch patch in mPatches)
    {
        int count = Math.Min(patch.List.Count, mKNn);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(patch.List[i].Dat.Idx);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[patch.Idx] = 1;
        lsqrDs.Add(0, eq);
    }
    for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(mRefPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    mSolX.RemoveRange(0, numDequeue);
    double[] aux = new double[mKClust];
    mSolX.CopyTo(mSolX.Count - mKClust, aux, 0, mKClust);
    mSolX.RemoveRange(mSolX.Count - mKClust, mKClust);
    foreach (SparseVector<double> newVec in newInst) { mSolX.Add(0); }
    mSolX.AddRange(aux);
    lsqr.InitialSolution = mSolX.ToArray();
    lsqr.Train(lsqrDs);
    mSolX = lsqr.Solution.GetWritableCopy();
    //for (int i = 0; i < lsqr.InitialSolution.Length; i++)
    //{
    //    Console.WriteLine("{0}\t{1}", lsqr.InitialSolution[i], lsqr.Solution[i]);
    //}
    for (int i = 0; i < layout.Length; i++) { layout[i].X = lsqr.Solution[i]; }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++) { lsqrDs[i].Label = mRefPos[j].Y; }
    mSolY.RemoveRange(0, numDequeue);
    aux = new double[mKClust];
    mSolY.CopyTo(mSolY.Count - mKClust, aux, 0, mKClust);
    mSolY.RemoveRange(mSolY.Count - mKClust, mKClust);
    foreach (SparseVector<double> newVec in newInst) { mSolY.Add(0); }
    mSolY.AddRange(aux);
    lsqr.InitialSolution = mSolY.ToArray();
    lsqr.Train(lsqrDs);
    mSolY = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++) { layout[i].Y = lsqr.Solution[i]; }
    /*prof*/ sw.Save("lsqr.txt", _count);

    // -----------------------------------------------------------------
    // make ptInfo
    // -----------------------------------------------------------------
    ptInfo = new PtInfo[layout.Length];
    int ii = 0;
    foreach (Vector2D pt in layout)
    {
        ptInfo[ii] = new PtInfo();
        ptInfo[ii].X = pt.X;
        ptInfo[ii].Y = pt.Y;
        ptInfo[ii].Vec = mDataset[ii];
        ii++;
    }
    // -----------------------------------------------------------------

    return settings == null ? layout : settings.AdjustLayout(layout);
}
public Vector2D[] ComputeLayout(LayoutSettings settings)
{
    // clustering
    mLogger.Info("ComputeLayout", "Clustering ...");
    mKMeans = new IncrementalKMeans(mKClust);
    mKMeans.Eps = mKMeansEps;
    mKMeans.Random = mRandom;
    mKMeans.Trials = 3;
    ClusteringResult clustering = mKMeans.Cluster(mDataset); // throws ArgumentValueException

    // determine reference instances
    UnlabeledDataset<SparseVector<double>> dsRefInst = new UnlabeledDataset<SparseVector<double>>();
    foreach (SparseVector<double> centroid in mKMeans.GetCentroids())
    {
        dsRefInst.Add(centroid); // dataset of reference instances
        mDataset.Add(centroid);  // add centroids to the main dataset
    }

    // position reference instances
    mLogger.Info("ComputeLayout", "Positioning reference instances ...");
    SparseMatrix<double> simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/false);
    StressMajorizationLayout sm = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));
    sm.Random = mRandom;
    sm.MaxSteps = int.MaxValue;
    sm.MinDiff = 0.00001;
    mRefPos = sm.ComputeLayout();

    // k-NN
    mLogger.Info("ComputeLayout", "Computing similarities ...");
    simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/true);
    mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
    LabeledDataset<double, SparseVector<double>> lsqrDs = new LabeledDataset<double, SparseVector<double>>();
    mPatches = new ArrayList<Patch>(mDataset.Count);
    for (int i = 0; i < mDataset.Count; i++) { mPatches.Add(new Patch(i)); }
    foreach (IdxDat<SparseVector<double>> simMtxRow in simMtx)
    {
        if (simMtxRow.Dat.Count <= 1)
        {
            mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
        }
        ArrayList<KeyDat<double, int>> knn = new ArrayList<KeyDat<double, int>>(simMtxRow.Dat.Count);
        foreach (IdxDat<double> item in simMtxRow.Dat)
        {
            if (item.Idx != simMtxRow.Idx) { knn.Add(new KeyDat<double, int>(item.Dat, item.Idx)); }
        }
        knn.Sort(DescSort<KeyDat<double, int>>.Instance);
        int count = Math.Min(knn.Count, mKNnExt);
        for (int i = 0; i < count; i++)
        {
            mPatches[simMtxRow.Idx].List.Add(new KeyDat<double, Patch>(knn[i].Key, mPatches[knn[i].Dat]));
        }
        mPatches[simMtxRow.Idx].ProcessList();
        count = Math.Min(knn.Count, mKNn);
        SparseVector<double> eq = new SparseVector<double>();
        double wgt = 1.0 / (double)count;
        for (int i = 0; i < count; i++)
        {
            eq.InnerIdx.Add(knn[i].Dat);
            eq.InnerDat.Add(-wgt);
        }
        eq.InnerIdx.Sort(); // *** sort only indices
        eq[simMtxRow.Idx] = 1;
        lsqrDs.Add(0, eq);
    }
    Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
    for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
    {
        SparseVector<double> eq = new SparseVector<double>(new IdxDat<double>[] { new IdxDat<double>(i, 1) });
        lsqrDs.Add(mRefPos[j].X, eq);
    }
    LSqrModel lsqr = new LSqrModel();
    lsqr.Train(lsqrDs);
    mSolX = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++) { layout[i].X = lsqr.Solution[i]; }
    for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++) { lsqrDs[i].Label = mRefPos[j].Y; }
    lsqr.Train(lsqrDs);
    mSolY = lsqr.Solution.GetWritableCopy();
    for (int i = 0; i < layout.Length; i++) { layout[i].Y = lsqr.Solution[i]; }
    return settings == null ? layout : settings.AdjustLayout(layout);
}
static void Main(string[] args)
{
    Random rnd = new Random(1);
    string[] featureNames = "ttr,brunet,honore,hl,ttrLemma,brunetLemma,honoreLemma,hlLemma,ari,flesch,fog,rWords,rChars,rSyllables,rComplex,M04,M05,M06,M07,M08,M09,M10,M11,M12,M13".Split(',');
    LabeledDataset<BlogMetaData, SparseVector<double>> dataset = new LabeledDataset<BlogMetaData, SparseVector<double>>();
    Console.WriteLine("Analyzing texts...");
    foreach (string fileName in Directory.GetFiles(Config.DataFolder, "*.xml"))
    {
        // load XML
        Console.WriteLine("File {0}...", fileName);
        XmlDocument doc = new XmlDocument();
        doc.LoadXml(File.ReadAllText(fileName).Replace("xmlns=\"http://www.tei-c.org/ns/1.0\"", ""));
        Corpus corpus = new Corpus();
        corpus.LoadFromXmlFile(fileName, /*tagLen=*/int.MaxValue);
#if TEST_CHUNKER
        Text text = null;
#else
        Text text = new Text(corpus, doc.SelectSingleNode("//header/naslov").InnerText,
            doc.SelectSingleNode("//header/blog").InnerText /*blog identifier is used as author identifier*/);
        text.ComputeFeatures(); // compute Detextive features
#endif
        // run chunker
        Console.WriteLine("Computing features...");
        ArrayList<Chunk> chunks = Chunker.GetChunks(doc);
        chunks = new ArrayList<Chunk>(chunks.Where(x => !x.mInner)); // get non-inner chunks only
        chunks.ForEach(x => x.mType = MapChunkType(x.mType));        // move chunks from Other_* to main categories
#if TEST_CHUNKER
        return;
#endif
        // get blog meta-data
        BlogMetaData metaData = new BlogMetaData();
        metaData.mAuthorAge = doc.SelectSingleNode("//header/avtorStarost").InnerText;
        metaData.mAuthorEducation = doc.SelectSingleNode("//header/avtorIzobrazba").InnerText;
        metaData.mAuthorGender = doc.SelectSingleNode("//header/avtorSpol").InnerText;
        metaData.mAuthorLocation = doc.SelectSingleNode("//header/avtorRegija").InnerText;
        metaData.mBlog = doc.SelectSingleNode("//header/blog").InnerText;
        // compute features M04-M13 from Stamatatos et al.: Automatic Text Categorization in Terms of Genre and Author (2000)
        double totalChunks = chunks.Count;
        double[] M = new double[10];
        double numNP = chunks.Count(x => x.mType == ChunkType.NP);
        double numVP = chunks.Count(x => x.mType == ChunkType.VP);
        double numAP = chunks.Count(x => x.mType == ChunkType.AP);
        double numPP = chunks.Count(x => x.mType == ChunkType.PP);
        double numCON = chunks.Count(x => x.mType == ChunkType.CON);
        if (totalChunks > 0)
        {
            M[0] = numNP / totalChunks;
            M[1] = numVP / totalChunks;
            M[2] = numAP / totalChunks;
            M[3] = numPP / totalChunks;
            M[4] = numCON / totalChunks;
        }
        double numWordsNP = chunks.Where(x => x.mType == ChunkType.NP).Select(x => x.mItems.Count).Sum();
        M[5] = numNP == 0 ? 0 : (numWordsNP / numNP);
        double numWordsVP = chunks.Where(x => x.mType == ChunkType.VP).Select(x => x.mItems.Count).Sum();
        M[6] = numVP == 0 ? 0 : (numWordsVP / numVP);
        double numWordsAP = chunks.Where(x => x.mType == ChunkType.AP).Select(x => x.mItems.Count).Sum();
        M[7] = numAP == 0 ? 0 : (numWordsAP / numAP);
        double numWordsPP = chunks.Where(x => x.mType == ChunkType.PP).Select(x => x.mItems.Count).Sum();
        M[8] = numPP == 0 ? 0 : (numWordsPP / numPP);
        double numWordsCON = chunks.Where(x => x.mType == ChunkType.CON).Select(x => x.mItems.Count).Sum();
        M[9] = numCON == 0 ? 0 : (numWordsCON / numCON);
        // create dataset
        SparseVector<double> vec = new SparseVector<double>();
        int i = 0;
        foreach (string featureName in "ttr,brunet,honore,hl,ttrLemma,brunetLemma,honoreLemma,hlLemma,ari,flesch,fog,rWords,rChars,rSyllables,rComplex".Split(','))
        {
            if (double.IsNaN(text.mFeatures[featureName]) || double.IsInfinity(text.mFeatures[featureName]))
            {
                vec[i++] = 0;
            }
            else
            {
                vec[i++] = text.mFeatures[featureName];
            }
        }
        foreach (double val in M) { vec[i++] = val; }
        dataset.Add(new LabeledExample<BlogMetaData, SparseVector<double>>(metaData, vec));
        string htmlFileName = Config.HtmlFolder + "\\" + Path.GetFileNameWithoutExtension(fileName) + ".html";
        Output.SaveHtml(featureNames, vec, doc, chunks, htmlFileName);
    }
    // save as Orange and Weka file
    Console.WriteLine("Writing Weka ARFF and Orange TAB files...");
    foreach (ClassType classType in new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorGender, ClassType.AuthorEducation, ClassType.AuthorLocation })
    {
        Output.SaveArff(featureNames, dataset, classType, Config.OutputFolder + "\\" + string.Format("OPA-{0}.arff", classType));
        Output.SaveTab(featureNames, dataset, classType, Config.OutputFolder + "\\" + string.Format("OPA-{0}.tab", classType));
    }
    // evaluate features via classification
    Console.WriteLine("Evaluating features with classification models...");
    PerfData<string> perfData = new PerfData<string>();
    ArrayList<Pair<string, IModel<string>>> models = new ArrayList<Pair<string, IModel<string>>>();
    // create classifiers
    NearestCentroidClassifier<string> ncc = new NearestCentroidClassifier<string>();
    ncc.Similarity = new SingleFeatureSimilarity();
    models.Add(new Pair<string, IModel<string>>("NCC", ncc));
    //KnnClassifier<string, SparseVector<double>> knn = new KnnClassifier<string, SparseVector<double>>(new SingleFeatureSimilarity());
    //models.Add(new Pair<string, IModel<string>>("kNN", knn)); // *** kNN is too slow
    SvmMulticlassClassifier<string> svm = new SvmMulticlassClassifier<string>();
    models.Add(new Pair<string, IModel<string>>("SVM", svm));
    MajorityClassifier<string, SparseVector<double>> maj = new MajorityClassifier<string, SparseVector<double>>();
    models.Add(new Pair<string, IModel<string>>("Majority", maj));
    MajorityClassifier<string, SparseVector<double>> backupCfy = new MajorityClassifier<string, SparseVector<double>>();
    foreach (Pair<string, IModel<string>> modelInfo in models) // iterate over different classifiers
    {
        Console.WriteLine("Classification model: {0}...", modelInfo.First);
        foreach (ClassType classType in new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorEducation, ClassType.AuthorGender, ClassType.AuthorLocation }) // iterate over different class types
        {
            Console.WriteLine("Target class: {0}...", classType);
            for (int fIdx = 0; fIdx < featureNames.Count(); fIdx++) // iterate over different features
            {
                Console.WriteLine("Feature: {0}...", featureNames[fIdx]);
                LabeledDataset<string, SparseVector<double>> datasetWithSingleFeature = CreateSingleFeatureDataset(dataset, classType, fIdx);
                datasetWithSingleFeature.Shuffle(rnd);
                LabeledDataset<string, SparseVector<double>> trainSet, testSet;
                for (int foldNum = 1; foldNum <= 10; foldNum++)
                {
                    Console.WriteLine("Fold " + foldNum + " / 10...");
                    datasetWithSingleFeature.SplitForCrossValidation(/*numFolds=*/10, foldNum, out trainSet, out testSet);
                    IModel<string> model = modelInfo.Second;
                    backupCfy.Train(trainSet);
                    // if there is only one class in trainSet, switch to MajorityClassifier
                    if (((IEnumerable<LabeledExample<string, SparseVector<double>>>)trainSet).Select(x => x.Label).Distinct().Count() == 1)
                    {
                        model = backupCfy;
                    }
                    else
                    {
                        string cacheFileName = Config.OutputFolder + "\\svm-" + classType + "-" + featureNames[fIdx] + "-" + foldNum + ".bin";
                        if (model is SvmMulticlassClassifier<string> && File.Exists(cacheFileName))
                        {
                            using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Open))
                            {
                                ((SvmMulticlassClassifier<string>)model).Load(bs);
                            }
                        }
                        else
                        {
                            model.Train(trainSet);
                        }
#if CACHE_MODELS
                        if (model is SvmMulticlassFast<string>)
                        {
                            using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Create))
                            {
                                model.Save(bs);
                            }
                        }
#endif
                    }
                    foreach (LabeledExample<string, SparseVector<double>> lblEx in testSet)
                    {
                        Prediction<string> pred = model.Predict(lblEx.Example);
                        if (pred.Count == 0) { pred = backupCfy.Predict(lblEx.Example); } // if the model is unable to make a prediction, use MajorityClassifier instead
                        perfData.GetPerfMatrix(classType.ToString(), modelInfo.First + "\t" + featureNames[fIdx], foldNum).AddCount(lblEx.Label, pred.BestClassLabel);
                    }
                }
            }
        }
    }
    // train full models
    Console.WriteLine("Training classification models...");
    models.Clear();
    SvmMulticlassClassifier<string> svmFull = new SvmMulticlassClassifier<string>();
    models.Add(new Pair<string, IModel<string>>("SVM", svmFull));
    //NearestCentroidClassifier<string> nccFull = new NearestCentroidClassifier<string>();
    //nccFull.Similarity = new ManhattanSimilarity();
    //models.Add(new Pair<string, IModel<string>>("NCC", nccFull));
    foreach (Pair<string, IModel<string>> modelInfo in models) // iterate over different classifiers
    {
        Console.WriteLine("Classification model: {0}...", modelInfo.First);
        IModel<string> model = modelInfo.Second;
        foreach (ClassType classType in new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorEducation, ClassType.AuthorGender, ClassType.AuthorLocation }) // iterate over different class types
        {
            Console.WriteLine("Target class: {0}...", classType);
            LabeledDataset<string, SparseVector<double>> nrmDataset = CreateNormalizedDataset(dataset, classType);
            nrmDataset.Shuffle(rnd);
            LabeledDataset<string, SparseVector<double>> trainSet, testSet;
            for (int foldNum = 1; foldNum <= 10; foldNum++)
            {
                Console.WriteLine("Fold " + foldNum + " / 10...");
                nrmDataset.SplitForCrossValidation(/*numFolds=*/10, foldNum, out trainSet, out testSet);
                backupCfy.Train(trainSet);
                // if there is only one class in trainSet, switch to MajorityClassifier
                if (((IEnumerable<LabeledExample<string, SparseVector<double>>>)trainSet).Select(x => x.Label).Distinct().Count() == 1)
                {
                    model = backupCfy;
                }
                else
                {
                    string cacheFileName = Config.OutputFolder + "\\svm-" + classType + "-full-" + foldNum + ".bin";
                    if (model is SvmMulticlassClassifier<string> && File.Exists(cacheFileName))
                    {
                        using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Open))
                        {
                            ((SvmMulticlassClassifier<string>)model).Load(bs);
                        }
                    }
                    else
                    {
                        model.Train(trainSet);
                    }
#if CACHE_MODELS
                    if (model is SvmMulticlassFast<string>)
                    {
                        using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Create))
                        {
                            model.Save(bs);
                        }
                    }
#endif
                }
                foreach (LabeledExample<string, SparseVector<double>> lblEx in testSet)
                {
                    Prediction<string> pred = model.Predict(lblEx.Example);
                    if (pred.Count == 0) { pred = backupCfy.Predict(lblEx.Example); } // if the model is unable to make a prediction, use MajorityClassifier instead
                    perfData.GetPerfMatrix(classType.ToString(), modelInfo.First + "\tfull", foldNum).AddCount(lblEx.Label, pred.BestClassLabel);
                }
            }
            // save model
            string modelFileName = Config.OutputFolder + "\\" + modelInfo.First + "-" + classType + ".model";
            if (!File.Exists(modelFileName))
            {
                using (BinarySerializer bs = new BinarySerializer(modelFileName, FileMode.Create))
                {
                    model.Train(nrmDataset);
                    model.Save(bs);
                }
            }
        }
    }
    using (StreamWriter w = new StreamWriter(Config.OutputFolder + "\\ClassifierEval.txt"))
    {
        w.WriteLine("*** Macro F1 ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MacroF1));
        w.WriteLine();
        w.WriteLine("*** Micro F1 ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MicroF1));
        w.WriteLine();
        w.WriteLine("*** Macro accuracy ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MacroAccuracy));
        w.WriteLine();
        w.WriteLine("*** Micro accuracy ***");
        w.WriteLine();
        w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MicroAccuracy));
    }
    // all done
    Console.WriteLine("Done.");
}
static void Main(string[] args)
{
    try
    {
        if (args.Length < 2)
        {
            OutputHelp();
        }
        else
        {
            int cutOff = 2;
            int numIter = 50;
            int numThreads = 1;
            string corpusFileName = null, modelFileName = null, lexiconFileName = null;
            bool verbose = false;
            if (ParseParams(args, ref verbose, ref cutOff, ref numIter, ref numThreads, ref corpusFileName, ref modelFileName, ref lexiconFileName))
            {
                Logger logger = Logger.GetRootLogger();
                if (!verbose)
                {
                    logger.LocalLevel = Logger.Level.Off;
                    logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                }
                else
                {
                    logger.LocalOutputType = Logger.OutputType.Custom;
                    Logger.CustomOutput = new Logger.CustomOutputDelegate(
                        delegate(string loggerName, Logger.Level level, string funcName, Exception e, string message, object[] msgArgs)
                        {
                            Console.WriteLine(message, msgArgs);
                        });
                }
                Corpus corpus = new Corpus();
                logger.Info(/*funcName=*/null, "Loading training corpus ...");
                corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/-1);
                GC.Collect();
                long oldMemUse = Process.GetCurrentProcess().PrivateMemorySize64;
                PatriciaTree suffixTree = new PatriciaTree();
                foreach (TaggedWord word in corpus.TaggedWords)
                {
                    suffixTree.AddWordTagPair(word.WordLower, word.Tag);
                }
                if (lexiconFileName != null)
                {
                    logger.Info(/*funcName=*/null, "Loading lexicon ...");
                    StreamReader lexReader = new StreamReader(lexiconFileName);
                    string lexLine;
                    while ((lexLine = lexReader.ReadLine()) != null)
                    {
                        string[] lexData = lexLine.Split('\t');
                        suffixTree.AddWordTagPair(lexData[0].ToLower(), lexData[2]);
                    }
                    lexReader.Close();
                }
                GC.Collect();
                long memUse = Process.GetCurrentProcess().PrivateMemorySize64;
                Console.WriteLine("Memory usage (suffix tree): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                oldMemUse = memUse;
                suffixTree.PropagateTags();
                GC.Collect();
                memUse = Process.GetCurrentProcess().PrivateMemorySize64;
                Console.WriteLine("Memory usage (propagated tags): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                MaximumEntropyClassifierFast<string> model = new MaximumEntropyClassifierFast<string>();
                LabeledDataset<string, BinaryVector> dataset = new LabeledDataset<string, BinaryVector>();
                Dictionary<string, int> featureSpace = new Dictionary<string, int>();
                logger.Info(/*funcName=*/null, "Preparing feature vectors ...");
                for (int i = 0; i < corpus.TaggedWords.Count; i++)
                {
                    logger.ProgressFast(Logger.Level.Info, /*funcName=*/null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                    BinaryVector featureVector = corpus.GenerateFeatureVector(i, featureSpace, /*extendFeatureSpace=*/true, suffixTree);
                    dataset.Add(corpus.TaggedWords[i].Tag, featureVector);
                }
                logger.Info(/*funcName=*/null, "Building the model ...");
                DateTime startTime = DateTime.Now;
                model.CutOff = cutOff;
                model.NumThreads = numThreads;
                model.NumIter = numIter;
                model.Train(dataset);
                TimeSpan span = DateTime.Now - startTime;
                logger.Info(/*funcName=*/null, "Model building time: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
                logger.Info(/*funcName=*/null, "Writing the model ...");
                BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
                suffixTree.Save(writer);
                Utils.SaveDictionary(featureSpace, writer);
                model.Save(writer);
                writer.Close();
                logger.Info(/*funcName=*/null, "Done.");
            }
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine();
        Console.WriteLine("*** Unexpected error. Details: {0}\r\n{1}", exception, exception.StackTrace);
    }
}