Example #1
        public static LabeledDataset <SentimentLabel, SparseVector <double> > InitBowSpace(BowSpace bowSpace,
                                                                                           IEnumerable <LabeledExample <SentimentLabel, string> > labeledExamples, IEnumerable <string> initExamples = null)
        {
            LabeledExample <SentimentLabel, string>[] examples = labeledExamples as LabeledExample <SentimentLabel, string>[] ?? labeledExamples.ToArray();

            List <SparseVector <double> > bowData;

            if (initExamples != null)
            {
                Preconditions.CheckArgument(!(bowSpace is DeltaBowSpace <SentimentLabel>));
                bowSpace.Initialize(initExamples);
                bowData = examples.Select(le => bowSpace.ProcessDocument(le.Example)).ToList();
            }
            else
            {
                bowData = bowSpace is DeltaBowSpace <SentimentLabel>
                    ? ((DeltaBowSpace <SentimentLabel>)bowSpace).Initialize(new LabeledDataset <SentimentLabel, string>(examples))
                    : bowSpace.Initialize(examples.Select(d => d.Example));
            }

            var bowDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >();

            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(examples[i].Label, bowData[i]);
            }
            return(bowDataset);
        }
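
A minimal call sketch for the helper above (the corpus literals and the plain BowSpace are made up for illustration; BowSpace, LabeledExample and SentimentLabel are the LATINO-style types already used in the snippet):

    var examples = new[]
    {
        new LabeledExample<SentimentLabel, string>(SentimentLabel.Positive, "great product, works as advertised"),
        new LabeledExample<SentimentLabel, string>(SentimentLabel.Negative, "arrived broken, support unhelpful")
    };
    // with initExamples == null and a non-delta BowSpace, the space is
    // initialized directly from the example texts
    LabeledDataset<SentimentLabel, SparseVector<double>> bowDataset =
        InitBowSpace(new BowSpace(), examples);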
Example #2
        public override void Train(ILabeledExampleCollection <SentimentLabel, SparseVector <double> > dataset)
        {
            Preconditions.CheckState(BowSpace != null);
            var replDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >();

            foreach (LabeledExample <SentimentLabel, SparseVector <double> > le in dataset)
            {
                SparseVector <double> vector1, vector2;
                Replicate(le.Example, out vector1, out vector2);

                replDataset.Add(new LabeledExample <SentimentLabel, SparseVector <double> >(
                                    le.Label == SentimentLabel.Neutral ? SentimentLabel.Negative : le.Label, vector1));
                replDataset.Add(new LabeledExample <SentimentLabel, SparseVector <double> >(
                                    le.Label == SentimentLabel.Neutral ? SentimentLabel.Positive : le.Label, vector2));
            }

            mClassifier = CreateModel();
            mClassifier.Train(replDataset);

            IsTrained = true;
        }
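
This trainer turns the three-class sentiment problem into two binary copies of each example: Replicate splits every input vector into vector1 and vector2, and a Neutral example is emitted once as Negative and once as Positive, while polar examples keep their label in both copies. A sketch of the label rule encoded inline in the ternaries above, extracted here only for illustration:

    static SentimentLabel FirstCopyLabel(SentimentLabel l)
        => l == SentimentLabel.Neutral ? SentimentLabel.Negative : l;
    static SentimentLabel SecondCopyLabel(SentimentLabel l)
        => l == SentimentLabel.Neutral ? SentimentLabel.Positive : l;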
Example #3
        static LabeledDataset <string, SparseVector <double> > CreateSingleFeatureDataset(LabeledDataset <BlogMetaData, SparseVector <double> > srcDataset, ClassType classType, int fIdx)
        {
            SparseVector <double> minValues, maxValues;

            GetExtremes(srcDataset, out minValues, out maxValues);
            LabeledDataset <string, SparseVector <double> > dataset = new LabeledDataset <string, SparseVector <double> >();

            ((IEnumerable <LabeledExample <BlogMetaData, SparseVector <double> > >)srcDataset).ToList()
                .ForEach(x => dataset.Add(new LabeledExample <string, SparseVector <double> >(
                    AnalysisUtils.GetLabel(x.Label, classType),
                    new SparseVector <double>(new double[] {
                        (x.Example[fIdx] - minValues[fIdx]) / (maxValues[fIdx] - minValues[fIdx]) // min-max normalization to [0, 1]
                    }))));
            return(dataset);
        }
        private static LabeledDataset <int, int> NewData(int[,] labelCounts, bool sortShuffled = false)
        {
            var result = new LabeledDataset <int, int>();

            for (int i = 0, k = 1; i <= labelCounts.GetUpperBound(0); i++)
            {
                int label = labelCounts[i, 0], count = labelCounts[i, 1];
                for (int j = 0; j < count; j++)
                {
                    result.Add(label, k++);
                }
            }
            if (sortShuffled)
            {
                result.GroupLabels(true);
            }
            return(result);
        }
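
NewData reads labelCounts as rows of (label, count) pairs and numbers the examples consecutively from 1, so the dataset size is the sum of the counts; with sortShuffled set, GroupLabels(true) presumably regroups the examples by label. A hypothetical call:

    // three examples labeled 0 and two labeled 1; the examples are the
    // integers 1..5 in insertion order
    LabeledDataset<int, int> data = NewData(new[,] { { 0, 3 }, { 1, 2 } });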
Example #5
        protected override ILabeledDataset <LblT, SparseVector <double> > MapTrainSet(int foldN, ILabeledDataset <LblT, string> trainSet)
        {
            BowSpace bowSpace;

            Preconditions.CheckState(!mFoldBowSpaces.TryGetValue(foldN, out bowSpace));
            Preconditions.CheckState(mFoldBowSpaces.TryAdd(foldN, bowSpace = BowSpaceFunc()));

            List <SparseVector <double> > bowData = bowSpace is DeltaBowSpace <LblT>
                ? ((DeltaBowSpace <LblT>)bowSpace).Initialize(trainSet)
                : bowSpace.Initialize(trainSet.Select(d => d.Example));

            var bowDataset = new LabeledDataset <LblT, SparseVector <double> >();

            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(trainSet[i].Label, bowData[i]);
            }

            return(bowDataset);
        }
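
MapTrainSet builds one bag-of-words space per cross-validation fold and caches it in mFoldBowSpaces; the TryGetValue/TryAdd pair matches ConcurrentDictionary semantics, so a plausible backing field (assumed, not shown in the snippet) would be:

    // using System.Collections.Concurrent;
    private readonly ConcurrentDictionary<int, BowSpace> mFoldBowSpaces =
        new ConcurrentDictionary<int, BowSpace>();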
Example #6
 public void TrainModels(IEnumerable <Author> authors)
 {
     foreach (Author author in authors)
     {
         LabeledDataset <string, SparseVector <double> > ds = new LabeledDataset <string, SparseVector <double> >();
         foreach (Author otherAuthor in authors)
         {
             if (otherAuthor != author && !otherAuthor.mIsTagged)
             {
                 foreach (Text text in otherAuthor.mTexts)
                 {
                     ds.Add(new LabeledExample <string, SparseVector <double> >(otherAuthor.mName, text.mFeatureVectors[mSelector]));
                 }
             }
         }
         SvmMulticlassClassifier <string> model = new SvmMulticlassClassifier <string>();
         model.C = Convert.ToDouble(Utils.GetConfigValue("SvmMultiClassC", "5000"));
         model.Train(ds);
         mModels.Add(author.mName, model);
     }
 }
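
Note that each author's model is trained only on the texts of *other*, untagged authors, a leave-the-author-out scheme, so mModels[author.mName] never sees that author's own writing. A hypothetical call site (the owner type and loader are assumptions):

    var attribution = new AuthorshipAttribution();    // hypothetical owner class of TrainModels
    attribution.TrainModels(LoadAuthorsFromCorpus()); // hypothetical loader returning IEnumerable<Author>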
Example #7
        public void Train(ILabeledExampleCollection <LblT, string> dataset)
        {
            Preconditions.CheckState(!IsTrained);
            Preconditions.CheckNotNull(dataset);
            Preconditions.CheckNotNull(BowSpace);
            Preconditions.CheckNotNull(FeatureProcessor);
            Preconditions.CheckNotNull(Model);

            // preprocess the text
            foreach (LabeledExample <LblT, string> le in dataset)
            {
                le.Example = FeatureProcessor.Run(le.Example);
            }

            // bow vectors
            List <SparseVector <double> > bowData = BowSpace is DeltaBowSpace <LblT>
                ? (BowSpace as DeltaBowSpace <LblT>).Initialize(dataset as ILabeledDataset <LblT, string> ?? new LabeledDataset <LblT, string>(dataset))
                : BowSpace.Initialize(dataset.Select(d => d.Example));
            var bowDataset = new LabeledDataset <LblT, SparseVector <double> >();

            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(dataset[i].Label, bowData[i]);
            }

            // train
            if (OnTrainModel == null)
            {
                Model.Train(bowDataset);
            }
            else
            {
                OnTrainModel(this, bowDataset);
            }

            IsTrained = true;
        }
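
Train here is the generic text-classification pipeline: preprocess each string with FeatureProcessor, map it into the bag-of-words space, then train the underlying model (or hand the vectors to the OnTrainModel callback). The preconditions at the top spell out the wiring the host object needs before the call; a minimal sketch, with the host and processor types assumed:

    var classifier = new TextClassifier<SentimentLabel>();         // hypothetical host exposing the properties above
    classifier.BowSpace = new BowSpace();
    classifier.FeatureProcessor = new LowerCaseFeatureProcessor(); // hypothetical feature processor
    classifier.Model = new SvmBinaryClassifier<SentimentLabel>();
    classifier.Train(labeledStrings); // any ILabeledExampleCollection<SentimentLabel, string>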
Example #8
        public override void Run(object[] args)
        {
            // prepare data
            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

            // Create a tokenizer.
            var tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,                            // Each token must be at least 2 characters long.
                Filter      = TokenizerFilter.AlphaStrict   // Tokens can consist of alphabetic characters only.
            };

            // take the data for the two polarity classes from the CSV file (tweets with Polarity == 2 are skipped)
            var data = GetLabeledTweets().Where(lt => lt.Polarity != 2).ToList();

            // Create a bag-of-words space.
            var bowSpc = new BowSpace
            {
                Tokenizer      = tokenizer,                 // Assign the tokenizer.
                StopWords      = stopWords,                 // Assign the stop words.
                Stemmer        = stemmer,                   // Assign the stemmer.
                MinWordFreq    = 1,                         // A term must appear at least n-times in the corpus for it to be part of the vocabulary.
                MaxNGramLen    = 2,                         // Terms consisting of at most n-consecutive words will be considered.
                WordWeightType = WordWeightType.TermFreq,   // Set the weighting scheme for the bag-of-words vectors to TF.
                //WordWeightType = WordWeightType.TfIdf,    // Alternative: set the weighting scheme to TF-IDF.
                NormalizeVectors  = true,                   // The bag-of-words vectors will be normalized.
                CutLowWeightsPerc = 0                       // Fraction of the overall weight sum to trim from each vector's lowest-weighted terms; 0 disables the cut (e.g. 0.2 would remove the lowest-weighted terms summing up to 20%).
            };
            ArrayList <SparseVector <double> > bowData = bowSpc.Initialize(data.Select(d => d.Text));

            // label data
            var labeledSet = new LabeledDataset <string, SparseVector <double> >();

            for (int i = 0; i < data.Count; i++)
            {
                labeledSet.Add(data[i].Label, bowData[i]);
            }
            labeledSet.Shuffle();

            int testSize    = labeledSet.Count / 10;
            var trainingSet = new LabeledDataset <string, SparseVector <double> >(labeledSet.Skip(testSize));
            var testSet     = new LabeledDataset <string, SparseVector <double> >(labeledSet.Take(testSize));

            //-------------------- SVM

            var svmBinClass = new SvmBinaryClassifier <string> {
                VerbosityLevel = SvmLightVerbosityLevel.Off
            };

            if (args.Any())
            {
                svmBinClass.C = (int)args[0];
            }
            //svmBinClass.BiasedHyperplane = true;
            //svmBinClass.CustomParams = "-t 3";   // non-linear kernel
            //svmBinClass.CustomParams = String.Format("-j {0}",j);

            svmBinClass.Train(trainingSet);

            int    correct = 0;
            double avgDist = 0;

            foreach (LabeledExample <string, SparseVector <double> > labeledExample in testSet)
            {
                var prediction = svmBinClass.Predict(labeledExample.Example);
                //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
                avgDist += prediction.BestScore;
                if (prediction.BestClassLabel == labeledExample.Label)
                {
                    correct++;
                }
            }

            Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
            Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

            Result.Add("accuracy", (double)correct / testSet.Count);

            Result.Add("classifier", svmBinClass);
            Result.Add("labeled_data", labeledSet);
        }
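
Once trained, the same pipeline can score unseen text: ProcessDocument (also used in Example #1) projects a new document into the already-initialized bag-of-words space, and Predict returns the label plus a score. A short sketch reusing bowSpc and svmBinClass from above:

    SparseVector<double> vec = bowSpc.ProcessDocument("just landed, best flight ever");
    Prediction<string> pred = svmBinClass.Predict(vec);
    Output.WriteLine("predicted: {0} (score: {1:0.0000})", pred.BestClassLabel, pred.BestScore);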
Example #9
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            UnlabeledDataset <SparseVector <double> > dataset = new UnlabeledDataset <SparseVector <double> >(mDataset);

            // clustering
            mLogger.Info("ComputeLayout", "Clustering ...");
            KMeansFast kMeans = new KMeansFast(mKClust);

            kMeans.Eps    = mKMeansEps;
            kMeans.Random = mRandom;
            kMeans.Trials = 1;
            ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (Cluster cluster in clustering.Roots)
            {
                SparseVector <double> centroid
                    = cluster.Items.Count > 0 ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2) : new SparseVector <double>();
                dsRefInst.Add(centroid); // dataset of reference instances
                dataset.Add(centroid);   // add centroids to the main dataset
            }
            // position reference instances
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random = mRandom;
            Vector2D[] centrPos = sm.ComputeLayout();
            // k-NN
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance);
                int count = Math.Min(knn.Count, mKNN);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
            for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(centrPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = centrPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }
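
The linear system built above has one row per layout point: instance i contributes the equation x_i - (1/k) * Σ_{j ∈ kNN(i)} x_j = 0 (the -wgt entries plus the 1 at position i), and each of the mKClust centroid rows is pinned to its stress-majorization coordinate (label = centrPos[j].X). LSqrModel solves this sparse least-squares system twice: first for the X coordinates, then, after relabeling the anchor rows with centrPos[j].Y, for the Y coordinates.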
        // TODO: exceptions
        public Vector2D[] Update(int numDequeue, IEnumerable <SparseVector <double> > newInst, bool test, LayoutSettings settings, ref PtInfo[] ptInfo, int _count)
        {
            // clustering
            mLogger.Info("Update", "Clustering ...");
            /*prof*/ StopWatch sw = new StopWatch();

            mKMeans.Eps = mKMeansEps;
            int iter = 0;

            mKMeans.Update(numDequeue, newInst, ref iter);
            /*prof*/ sw.Save("cl.txt", _count, iter.ToString());
            // determine reference instances
            /*prof*/ sw.Reset();
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();
            UnlabeledDataset <SparseVector <double> > dsNewInst = new UnlabeledDataset <SparseVector <double> >(newInst);

            foreach (SparseVector <double> centroid in mKMeans.GetCentroids())
            {
                dsRefInst.Add(centroid); // dataset of reference instances
                dsNewInst.Add(centroid); // dataset of new instances
            }
            // position reference instances
            mLogger.Info("Update", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random   = mRandom;
            sm.MaxSteps = int.MaxValue;
            sm.MinDiff  = 1E-3;
            mRefPos     = sm.ComputeLayout(/*settings=*/ null, mRefPos /*make this a property!!!*/);
            /*prof*/ sw.Save("sm.txt", _count);
            // k-NN
            /*prof*/ sw.Reset();
            DateTime t = DateTime.Now;

            mLogger.Info("Update", "Computing similarities ...");
            // update list of neighborhoods
            mPatches.RemoveRange(mDataset.Count - mKClust, mKClust);
            mPatches.RemoveRange(0, numDequeue);
            // remove instances from [dataset and] neighborhoods
            foreach (Patch patch in mPatches)
            {
                if (patch.Min != null && (patch.Min.Idx < numDequeue || patch.Max.Idx >= mDataset.Count - mKClust))
                {
                    int oldCount = patch.List.Count;
                    ArrayList <KeyDat <double, Patch> > tmp = new ArrayList <KeyDat <double, Patch> >();
                    foreach (KeyDat <double, Patch> item in patch.List)
                    {
                        if (item.Dat.Idx >= numDequeue && item.Dat.Idx < mDataset.Count - mKClust)
                        {
                            tmp.Add(item);
                        }
                        //else
                        //{
                        //    Console.WriteLine("Remove {0}", item.Dat.Idx - numDequeue);
                        //}
                    }
                    patch.List = tmp;
                    patch.ProcessList();
                    patch.NeedUpdate = patch.List.Count < mKNn && oldCount >= mKNn;
                }
            }
            // update dataset
            mDataset.RemoveRange(mDataset.Count - mKClust, mKClust);
            mDataset.RemoveRange(0, numDequeue);
            // add new instances to dataset
            int preAddCount = mDataset.Count;

            mDataset.AddRange(dsNewInst);
            // precompute transposed matrices
            SparseMatrix <double> trNewInst = ModelUtils.GetTransposedMatrix(dsNewInst);
            SparseMatrix <double> trDataset = ModelUtils.GetTransposedMatrix(mDataset);

            // add new instances to neighborhoods
            for (int i = 0; i < dsNewInst.Count; i++)
            {
                mPatches.Add(new Patch(-1));
                mPatches.Last.NeedUpdate = true;
            }
            for (int i = 0; i < mPatches.Count; i++)
            {
                mPatches[i].Idx = i;
            }
            for (int i = 0; i < mPatches.Count; i++)
            {
                Patch patch = mPatches[i];
                SparseVector <double> vec = mDataset[i];
                if (vec != null)
                {
                    if (patch.NeedUpdate) // full update required
                    {
                        //if (i == 1347) { Console.WriteLine("full update"); }
                        SparseVector <double>             simVec = ModelUtils.GetDotProductSimilarity(trDataset, mDataset.Count, vec, mSimThresh);
                        ArrayList <KeyDat <double, int> > tmp    = new ArrayList <KeyDat <double, int> >();
                        foreach (IdxDat <double> item in simVec)
                        {
                            if (item.Idx != i)
                            {
                                tmp.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                            }
                        }
                        tmp.Sort(new Comparer2());
                        int count = Math.Min(tmp.Count, mKNnExt);
                        patch.List.Clear();
                        for (int j = 0; j < count; j++)
                        {
                            patch.List.Add(new KeyDat <double, Patch>(tmp[j].Key, mPatches[tmp[j].Dat]));
                        }
                        patch.ProcessList();
                        patch.NeedUpdate = false;
                    }
                    else // only new instances need to be considered
                    {
                        //if (i == 1347) { Console.WriteLine("partial update"); }
                        SparseVector <double> simVec = ModelUtils.GetDotProductSimilarity(trNewInst, dsNewInst.Count, vec, mSimThresh);
                        // check if further processing is needed
                        bool needMerge = false;
                        if (test)
                        {
                            foreach (IdxDat <double> item in simVec)
                            {
                                if (item.Dat >= patch.MinSim)
                                {
                                    needMerge = true;
                                    //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                                    break;
                                }
                            }
                        }
                        else
                        {
                            foreach (IdxDat <double> item in simVec)
                            {
                                if (item.Dat > patch.MinSim)
                                {
                                    needMerge = true;
                                    //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                                    break;
                                }
                            }
                        }
                        if (needMerge || patch.List.Count < mKNn)
                        {
                            //if (i == 1347) { Console.WriteLine("merge"); }
                            int oldCount = patch.List.Count;
                            ArrayList <KeyDat <double, Patch> > tmp = new ArrayList <KeyDat <double, Patch> >();
                            foreach (IdxDat <double> item in simVec)
                            {
                                tmp.Add(new KeyDat <double, Patch>(item.Dat, mPatches[item.Idx + preAddCount]));
                            }
                            // merge the two lists
                            // TODO: speed this up
                            patch.List.AddRange(tmp);
                            patch.List.Sort(new Comparer());
                            // trim list to size
                            if (oldCount >= mKNn)
                            {
                                patch.List.RemoveRange(oldCount, patch.List.Count - oldCount);
                            }
                            patch.ProcessList();
                        }
                    }
                }
            }
            /*prof*/ sw.Save("knn.txt", _count);
            // *** Test ***
            sw.Reset();
            ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
            sw.Save("selfSim.txt", _count, mDataset.Count.ToString());
            if (test)
            {
                simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
                ArrayList <Patch> patches = new ArrayList <Patch>();
                for (int i = 0; i < mDataset.Count; i++)
                {
                    patches.Add(new Patch(i));
                }
                foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
                {
                    if (simMtxRow.Dat.Count <= 1)
                    {
                        mLogger.Warn("Update", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                    }
                    ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                    foreach (IdxDat <double> item in simMtxRow.Dat)
                    {
                        if (item.Idx != simMtxRow.Idx)
                        {
                            knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                        }
                    }
                    knn.Sort(new Comparer2());
                    int count = Math.Min(knn.Count, mKNnExt);
                    for (int i = 0; i < count; i++)
                    {
                        patches[simMtxRow.Idx].List.Add(new KeyDat <double, Patch>(knn[i].Key, patches[knn[i].Dat]));
                    }
                    patches[simMtxRow.Idx].ProcessList();
                }
                // compare
                if (patches.Count != mPatches.Count)
                {
                    throw new Exception("Count mismatch.");
                }
                for (int i = 0; i < mPatches.Count; i++)
                {
                    if (patches[i].List.Count < mKNn && patches[i].List.Count != mPatches[i].List.Count)
                    {
                        Console.WriteLine(mPatches[i].List.Count);
                        Console.WriteLine(patches[i].List.Count);
                        Output(mPatches[i].List);
                        Output(patches[i].List);
                        Console.WriteLine(i);
                        throw new Exception("List count mismatch.");
                    }
                    int count = Math.Min(mPatches[i].List.Count, mKNn);
                    for (int j = 0; j < count; j++)
                    {
                        //Console.WriteLine("{4} {0}-{1} {2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                        if (mPatches[i].List[j].Key != patches[i].List[j].Key || mPatches[i].List[j].Dat.Idx != patches[i].List[j].Dat.Idx)
                        {
                            Console.WriteLine("i:{4} fast:{0}-{1} slow:{2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                            int idxFast = mPatches[i].List[j].Dat.Idx;
                            int idxSlow = patches[i].List[j].Dat.Idx;
                            Console.WriteLine("slow @ fast idx: {0}", GetKey(patches[i].List, idxFast));
                            Console.WriteLine("fast @ slow idx: {0}", GetKey(mPatches[i].List, idxSlow));
                            throw new Exception("Patch item mismatch.");
                        }
                    }
                }
            }
            // *** End of test ***
            //Console.WriteLine("Number of patches: {0}", mPatches.Count);
            //int waka = 0;
            //foreach (Patch patch in mPatches)
            //{
            //    waka += patch.List.Count;
            //}
            //Console.WriteLine("Avg list size: {0}", (double)waka / (double)mPatches.Count);
            Console.WriteLine((DateTime.Now - t).TotalMilliseconds);
            /*prof*/ sw.Reset();
            mLogger.Info("Update", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
            foreach (Patch patch in mPatches)
            {
                int count = Math.Min(patch.List.Count, mKNn);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(patch.List[i].Dat.Idx);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[patch.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(mRefPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            mSolX.RemoveRange(0, numDequeue);
            double[] aux = new double[mKClust];
            mSolX.CopyTo(mSolX.Count - mKClust, aux, 0, mKClust);
            mSolX.RemoveRange(mSolX.Count - mKClust, mKClust);
            foreach (SparseVector <double> newVec in newInst)
            {
                mSolX.Add(0);
            }
            mSolX.AddRange(aux);
            lsqr.InitialSolution = mSolX.ToArray();
            lsqr.Train(lsqrDs);
            mSolX = lsqr.Solution.GetWritableCopy();
            //for (int i = 0; i < lsqr.InitialSolution.Length; i++)
            //{
            //    Console.WriteLine("{0}\t{1}", lsqr.InitialSolution[i], lsqr.Solution[i]);
            //}
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = mRefPos[j].Y;
            }
            mSolY.RemoveRange(0, numDequeue);
            aux = new double[mKClust];
            mSolY.CopyTo(mSolY.Count - mKClust, aux, 0, mKClust);
            mSolY.RemoveRange(mSolY.Count - mKClust, mKClust);
            foreach (SparseVector <double> newVec in newInst)
            {
                mSolY.Add(0);
            }
            mSolY.AddRange(aux);
            lsqr.InitialSolution = mSolY.ToArray();
            lsqr.Train(lsqrDs);
            mSolY = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            /*prof*/ sw.Save("lsqr.txt", _count);
            // -----------------------------------------------------------------
            // make ptInfo
            // -----------------------------------------------------------------
            ptInfo = new PtInfo[layout.Length];
            int ii = 0;

            foreach (Vector2D pt in layout)
            {
                ptInfo[ii]     = new PtInfo();
                ptInfo[ii].X   = pt.X;
                ptInfo[ii].Y   = pt.Y;
                ptInfo[ii].Vec = mDataset[ii];
                ii++;
            }
            // -----------------------------------------------------------------
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }
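
Update is the streaming counterpart of the ComputeLayout variant below: it dequeues the numDequeue oldest instances, updates the incremental k-means clustering, and repairs the cached k-NN neighborhoods (the Patch objects) instead of recomputing all pairwise similarities; a full neighborhood rebuild happens only where a patch got too short or a new instance can enter it. With test == true, the neighborhoods are additionally rebuilt from scratch and cross-checked against the patched ones. The same linear system as in ComputeLayout is then solved, warm-started with the previous solutions (mSolX, mSolY) shifted to match the new instance order.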
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            // clustering
            mLogger.Info("ComputeLayout", "Clustering ...");
            mKMeans        = new IncrementalKMeans(mKClust);
            mKMeans.Eps    = mKMeansEps;
            mKMeans.Random = mRandom;
            mKMeans.Trials = 3;
            ClusteringResult clustering = mKMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (SparseVector <double> centroid in mKMeans.GetCentroids())
            {
                dsRefInst.Add(centroid); // dataset of reference instances
                mDataset.Add(centroid);  // add centroids to the main dataset
            }
            // position reference instances
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random   = mRandom;
            sm.MaxSteps = int.MaxValue;
            sm.MinDiff  = 0.00001;
            mRefPos     = sm.ComputeLayout();
            // k-NN
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            mPatches = new ArrayList <Patch>(mDataset.Count);
            for (int i = 0; i < mDataset.Count; i++)
            {
                mPatches.Add(new Patch(i));
            }
            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance);
                int count = Math.Min(knn.Count, mKNnExt);
                for (int i = 0; i < count; i++)
                {
                    mPatches[simMtxRow.Idx].List.Add(new KeyDat <double, Patch>(knn[i].Key, mPatches[knn[i].Dat]));
                }
                mPatches[simMtxRow.Idx].ProcessList();
                count = Math.Min(knn.Count, mKNn);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
            for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(mRefPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            lsqr.Train(lsqrDs);
            mSolX = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = mRefPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            mSolY = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }
Example #12
        static void Main(string[] args)
        {
            Random rnd = new Random(1);

            string[] featureNames = "ttr,brunet,honore,hl,ttrLemma,brunetLemma,honoreLemma,hlLemma,ari,flesch,fog,rWords,rChars,rSyllables,rComplex,M04,M05,M06,M07,M08,M09,M10,M11,M12,M13".Split(',');
            LabeledDataset <BlogMetaData, SparseVector <double> > dataset = new LabeledDataset <BlogMetaData, SparseVector <double> >();

            Console.WriteLine("Analiziram besedila...");
            foreach (string fileName in Directory.GetFiles(Config.DataFolder, "*.xml"))
            {
                // load XML
                Console.WriteLine("Datoteka {0}...", fileName);
                XmlDocument doc = new XmlDocument();
                doc.LoadXml(File.ReadAllText(fileName).Replace("xmlns=\"http://www.tei-c.org/ns/1.0\"", ""));
                Corpus corpus = new Corpus();
                corpus.LoadFromXmlFile(fileName, /*tagLen=*/ int.MaxValue);
#if TEST_CHUNKER
                Text text = null;
#else
                Text text = new Text(corpus, doc.SelectSingleNode("//header/naslov").InnerText, doc.SelectSingleNode("//header/blog").InnerText /*blog identifier is used as author identifier*/);
                text.ComputeFeatures(); // compute Detextive features
#endif
                // run chunker
                Console.WriteLine("Racunam znacilke...");
                ArrayList <Chunk> chunks = Chunker.GetChunks(doc);
                chunks = new ArrayList <Chunk>(chunks.Where(x => !x.mInner)); // get non-inner chunks only
                chunks.ForEach(x => x.mType = MapChunkType(x.mType));         // move chunks from Other_* to main categories
#if TEST_CHUNKER
                return;
#endif
                // get blog meta-data
                BlogMetaData metaData = new BlogMetaData();
                metaData.mAuthorAge       = doc.SelectSingleNode("//header/avtorStarost").InnerText;
                metaData.mAuthorEducation = doc.SelectSingleNode("//header/avtorIzobrazba").InnerText;
                metaData.mAuthorGender    = doc.SelectSingleNode("//header/avtorSpol").InnerText;
                metaData.mAuthorLocation  = doc.SelectSingleNode("//header/avtorRegija").InnerText;
                metaData.mBlog            = doc.SelectSingleNode("//header/blog").InnerText;
                // compute features M04-M13 from Stamatatos et al.: Automatic Text Categorization in Terms of Genre and Author (2000)
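                // (a reading of the code below: M[0]..M[4] are the relative frequencies
                // of NP, VP, AP, PP and CON chunks, and M[5]..M[9] the average number
                // of words per chunk of each type)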
                double   totalChunks = chunks.Count;
                double[] M           = new double[10];
                double   numNP       = chunks.Count(x => x.mType == ChunkType.NP);
                double   numVP       = chunks.Count(x => x.mType == ChunkType.VP);
                double   numAP       = chunks.Count(x => x.mType == ChunkType.AP);
                double   numPP       = chunks.Count(x => x.mType == ChunkType.PP);
                double   numCON      = chunks.Count(x => x.mType == ChunkType.CON);
                if (totalChunks > 0)
                {
                    M[0] = numNP / totalChunks;
                    M[1] = numVP / totalChunks;
                    M[2] = numAP / totalChunks;
                    M[3] = numPP / totalChunks;
                    M[4] = numCON / totalChunks;
                }
                double numWordsNP = chunks.Where(x => x.mType == ChunkType.NP).Select(x => x.mItems.Count).Sum();
                M[5] = numNP == 0 ? 0 : (numWordsNP / numNP);
                double numWordsVP = chunks.Where(x => x.mType == ChunkType.VP).Select(x => x.mItems.Count).Sum();
                M[6] = numVP == 0 ? 0 : (numWordsVP / numVP);
                double numWordsAP = chunks.Where(x => x.mType == ChunkType.AP).Select(x => x.mItems.Count).Sum();
                M[7] = numAP == 0 ? 0 : (numWordsAP / numAP);
                double numWordsPP = chunks.Where(x => x.mType == ChunkType.PP).Select(x => x.mItems.Count).Sum();
                M[8] = numPP == 0 ? 0 : (numWordsPP / numPP);
                double numWordsCON = chunks.Where(x => x.mType == ChunkType.CON).Select(x => x.mItems.Count).Sum();
                M[9] = numCON == 0 ? 0 : (numWordsCON / numCON);
                // create dataset
                SparseVector <double> vec = new SparseVector <double>();
                int i = 0;
                foreach (string featureName in "ttr,brunet,honore,hl,ttrLemma,brunetLemma,honoreLemma,hlLemma,ari,flesch,fog,rWords,rChars,rSyllables,rComplex".Split(','))
                {
                    if (double.IsNaN(text.mFeatures[featureName]) || double.IsInfinity(text.mFeatures[featureName]))
                    {
                        vec[i++] = 0;
                    }
                    else
                    {
                        vec[i++] = text.mFeatures[featureName];
                    }
                }
                foreach (double val in M)
                {
                    vec[i++] = val;
                }
                dataset.Add(new LabeledExample <BlogMetaData, SparseVector <double> >(metaData, vec));
                string htmlFileName = Config.HtmlFolder + "\\" + Path.GetFileNameWithoutExtension(fileName) + ".html";
                Output.SaveHtml(featureNames, vec, doc, chunks, htmlFileName);
            }
            // save as Orange and Weka file
            Console.WriteLine("Zapisujem datoteke Weka ARFF in Orange TAB...");
            foreach (ClassType classType in new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorGender, ClassType.AuthorEducation, ClassType.AuthorLocation })
            {
                Output.SaveArff(featureNames, dataset, classType, Config.OutputFolder + "\\" + string.Format("OPA-{0}.arff", classType));
                Output.SaveTab(featureNames, dataset, classType, Config.OutputFolder + "\\" + string.Format("OPA-{0}.tab", classType));
            }
            // evaluate features via classification
            Console.WriteLine("Evalviram znacilke s klasifikacijskimi modeli...");
            PerfData <string> perfData = new PerfData <string>();
            ArrayList <Pair <string, IModel <string> > > models = new ArrayList <Pair <string, IModel <string> > >();
            // create classifiers
            NearestCentroidClassifier <string> ncc = new NearestCentroidClassifier <string>();
            ncc.Similarity = new SingleFeatureSimilarity();
            models.Add(new Pair <string, IModel <string> >("NCC", ncc));
            //KnnClassifier<string, SparseVector<double>> knn = new KnnClassifier<string, SparseVector<double>>(new SingleFeatureSimilarity());
            //models.Add(new Pair<string, IModel<string>>("kNN", knn)); // *** kNN is too slow
            SvmMulticlassClassifier <string> svm = new SvmMulticlassClassifier <string>();
            models.Add(new Pair <string, IModel <string> >("SVM", svm));
            MajorityClassifier <string, SparseVector <double> > maj = new MajorityClassifier <string, SparseVector <double> >();
            models.Add(new Pair <string, IModel <string> >("Majority", maj));
            MajorityClassifier <string, SparseVector <double> > backupCfy = new MajorityClassifier <string, SparseVector <double> >();
            foreach (Pair <string, IModel <string> > modelInfo in models) // iterate over different classifiers
            {
                Console.WriteLine("Kasifikacijski model: {0}...", modelInfo.First);
                foreach (ClassType classType in new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorEducation, ClassType.AuthorGender, ClassType.AuthorLocation }) // iterate over different class types
                {
                    Console.WriteLine("Ciljni razred: {0}...", classType);
                    for (int fIdx = 0; fIdx < featureNames.Count(); fIdx++) // iterate over different features
                    {
                        Console.WriteLine("Znacilka: {0}...", featureNames[fIdx]);
                        LabeledDataset <string, SparseVector <double> > datasetWithSingleFeature = CreateSingleFeatureDataset(dataset, classType, fIdx);
                        datasetWithSingleFeature.Shuffle(rnd);
                        LabeledDataset <string, SparseVector <double> > trainSet, testSet;
                        for (int foldNum = 1; foldNum <= 10; foldNum++)
                        {
                            Console.WriteLine("Sklop " + foldNum + " / 10...");
                            datasetWithSingleFeature.SplitForCrossValidation(/*numFolds=*/ 10, foldNum, out trainSet, out testSet);
                            IModel <string> model = modelInfo.Second;
                            backupCfy.Train(trainSet);
                            // if there is only one class in trainSet, switch to MajorityClassifier
                            if (((IEnumerable <LabeledExample <string, SparseVector <double> > >)trainSet).Select(x => x.Label).Distinct().Count() == 1)
                            {
                                model = backupCfy;
                            }
                            else
                            {
                                string cacheFileName = Config.OutputFolder + "\\svm-" + classType + "-" + featureNames[fIdx] + "-" + foldNum + ".bin";
                                if (model is SvmMulticlassClassifier <string> && File.Exists(cacheFileName))
                                {
                                    using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Open))
                                    {
                                        ((SvmMulticlassClassifier <string>)model).Load(bs);
                                    }
                                }
                                else
                                {
                                    model.Train(trainSet);
                                }
#if CACHE_MODELS
                                if (model is SvmMulticlassFast <string> )
                                {
                                    using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Create))
                                    {
                                        model.Save(bs);
                                    }
                                }
#endif
                            }
                            foreach (LabeledExample <string, SparseVector <double> > lblEx in testSet)
                            {
                                Prediction <string> pred = model.Predict(lblEx.Example);
                                // if the model is unable to make a prediction, use MajorityClassifier instead
                                if (pred.Count == 0)
                                {
                                    pred = backupCfy.Predict(lblEx.Example);
                                }
                                perfData.GetPerfMatrix(classType.ToString(), modelInfo.First + "\t" + featureNames[fIdx], foldNum).AddCount(lblEx.Label, pred.BestClassLabel);
                            }
                        }
                    }
                }
            }
            // train full models
            Console.WriteLine("Treniram klasifikacijske modele...");
            models.Clear();
            SvmMulticlassClassifier <string> svmFull = new SvmMulticlassClassifier <string>();
            models.Add(new Pair <string, IModel <string> >("SVM", svmFull));
            //NearestCentroidClassifier<string> nccFull = new NearestCentroidClassifier<string>();
            //nccFull.Similarity = new ManhattanSimilarity();
            //models.Add(new Pair<string, IModel<string>>("NCC", nccFull));
            foreach (Pair <string, IModel <string> > modelInfo in models) // iterate over different classifiers
            {
                Console.WriteLine("Kasifikacijski model: {0}...", modelInfo.First);
                IModel <string> model = modelInfo.Second;
                foreach (ClassType classType in new ClassType[] { ClassType.AuthorName, ClassType.AuthorAge, ClassType.AuthorEducation, ClassType.AuthorGender, ClassType.AuthorLocation }) // iterate over different class types
                {
                    Console.WriteLine("Ciljni razred: {0}...", classType);
                    LabeledDataset <string, SparseVector <double> > nrmDataset = CreateNormalizedDataset(dataset, classType);
                    nrmDataset.Shuffle(rnd);
                    LabeledDataset <string, SparseVector <double> > trainSet, testSet;
                    for (int foldNum = 1; foldNum <= 10; foldNum++)
                    {
                        Console.WriteLine("Sklop " + foldNum + " / 10...");
                        nrmDataset.SplitForCrossValidation(/*numFolds=*/ 10, foldNum, out trainSet, out testSet);
                        backupCfy.Train(trainSet);
                        // if there is only one class in trainSet, switch to MajorityClassifier
                        if (((IEnumerable <LabeledExample <string, SparseVector <double> > >)trainSet).Select(x => x.Label).Distinct().Count() == 1)
                        {
                            model = backupCfy;
                        }
                        else
                        {
                            string cacheFileName = Config.OutputFolder + "\\svm-" + classType + "-full-" + foldNum + ".bin";
                            if (model is SvmMulticlassClassifier <string> && File.Exists(cacheFileName))
                            {
                                using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Open))
                                {
                                    ((SvmMulticlassClassifier <string>)model).Load(bs);
                                }
                            }
                            else
                            {
                                model.Train(trainSet);
                            }
#if CACHE_MODELS
                            if (model is SvmMulticlassFast <string> )
                            {
                                using (BinarySerializer bs = new BinarySerializer(cacheFileName, FileMode.Create))
                                {
                                    model.Save(bs);
                                }
                            }
#endif
                        }
                        foreach (LabeledExample <string, SparseVector <double> > lblEx in testSet)
                        {
                            Prediction <string> pred = model.Predict(lblEx.Example);
                            // if the model is unable to make a prediction, use MajorityClassifier instead
                            if (pred.Count == 0)
                            {
                                pred = backupCfy.Predict(lblEx.Example);
                            }
                            perfData.GetPerfMatrix(classType.ToString(), modelInfo.First + "\tfull", foldNum).AddCount(lblEx.Label, pred.BestClassLabel);
                        }
                    }
                    // save model
                    string modelFileName = Config.OutputFolder + "\\" + modelInfo.First + "-" + classType + ".model";
                    if (!File.Exists(modelFileName))
                    {
                        using (BinarySerializer bs = new BinarySerializer(modelFileName, FileMode.Create))
                        {
                            model.Train(nrmDataset);
                            model.Save(bs);
                        }
                    }
                }
            }
            using (StreamWriter w = new StreamWriter(Config.OutputFolder + "\\ClassifierEval.txt"))
            {
                w.WriteLine("*** Macro F1 ***");
                w.WriteLine();
                w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MacroF1));
                w.WriteLine();
                w.WriteLine("*** Micro F1 ***");
                w.WriteLine();
                w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MicroF1));
                w.WriteLine();
                w.WriteLine("*** Macro accuracy ***");
                w.WriteLine();
                w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MacroAccuracy));
                w.WriteLine();
                w.WriteLine("*** Micro accuracy ***");
                w.WriteLine();
                w.WriteLine("\t" + perfData.ToString(null, PerfMetric.MicroAccuracy));
            }
            // all done
            Console.WriteLine("Koncano.");
        }
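
Note: the console output in this example is Slovenian — "Analiziram besedila..." = "Analyzing texts ...", "Racunam znacilke..." = "Computing features ...", "Ciljni razred" = "Target class", "Znacilka" = "Feature", "Sklop" = "Fold", "Kasifikacijski model" (sic, for "Klasifikacijski") = "Classification model", "Koncano." = "Done.".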
Example #13
 static void Main(string[] args)
 {
     try
     {
         if (args.Length < 2)
         {
             OutputHelp();
         }
         else
         {
             int    cutOff = 2;
             int    numIter = 50;
             int    numThreads = 1;
             string corpusFileName = null, modelFileName = null, lexiconFileName = null;
             bool   verbose = false;
             if (ParseParams(args, ref verbose, ref cutOff, ref numIter, ref numThreads, ref corpusFileName, ref modelFileName, ref lexiconFileName))
             {
                 Logger logger = Logger.GetRootLogger();
                 if (!verbose)
                 {
                     logger.LocalLevel = Logger.Level.Off;
                     logger.LocalProgressOutputType = Logger.ProgressOutputType.Off;
                 }
                 else
                 {
                     logger.LocalOutputType = Logger.OutputType.Custom;
                     Logger.CustomOutput    = new Logger.CustomOutputDelegate(delegate(string loggerName, Logger.Level level, string funcName, Exception e,
                                                                                       string message, object[] msgArgs) { Console.WriteLine(message, msgArgs); });
                 }
                 Corpus corpus = new Corpus();
                 logger.Info(/*funcName=*/ null, "Nalagam učni korpus ...");
                 corpus.LoadFromXmlFile(corpusFileName, /*tagLen=*/ -1);
                 GC.Collect();
                 long         oldMemUse  = Process.GetCurrentProcess().PrivateMemorySize64;
                 PatriciaTree suffixTree = new PatriciaTree();
                 foreach (TaggedWord word in corpus.TaggedWords)
                 {
                     suffixTree.AddWordTagPair(word.WordLower, word.Tag);
                 }
                 if (lexiconFileName != null)
                 {
                     logger.Info(/*funcName=*/ null, "Nalagam leksikon ...");
                     StreamReader lexReader = new StreamReader(lexiconFileName);
                     string       lexLine;
                     while ((lexLine = lexReader.ReadLine()) != null)
                     {
                         string[] lexData = lexLine.Split('\t');
                         suffixTree.AddWordTagPair(lexData[0].ToLower(), lexData[2]);
                     }
                     lexReader.Close();
                 }
                 GC.Collect();
                 long memUse = Process.GetCurrentProcess().PrivateMemorySize64;
                 Console.WriteLine("Poraba pomnilnika (drevo končnic): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                 oldMemUse = memUse;
                 suffixTree.PropagateTags();
                 GC.Collect();
                 memUse = Process.GetCurrentProcess().PrivateMemorySize64;
                 Console.WriteLine("Poraba pomnilnika (propagirane oznake): {0:0.00} MB", (double)(memUse - oldMemUse) / 1048576.0);
                 MaximumEntropyClassifierFast <string> model   = new MaximumEntropyClassifierFast <string>();
                 LabeledDataset <string, BinaryVector> dataset = new LabeledDataset <string, BinaryVector>();
                 Dictionary <string, int> featureSpace         = new Dictionary <string, int>();
                 logger.Info(/*funcName=*/ null, "Pripravljam vektorje značilk ...");
                 for (int i = 0; i < corpus.TaggedWords.Count; i++)
                 {
                     logger.ProgressFast(Logger.Level.Info, /*funcName=*/ null, "{0} / {1}", i + 1, corpus.TaggedWords.Count);
                     BinaryVector featureVector = corpus.GenerateFeatureVector(i, featureSpace, /*extendFeatureSpace=*/ true, suffixTree);
                     dataset.Add(corpus.TaggedWords[i].Tag, featureVector);
                 }
                 logger.Info(/*funcName=*/ null, "Gradim model ...");
                 DateTime startTime = DateTime.Now;
                 model.CutOff     = cutOff;
                 model.NumThreads = numThreads;
                 model.NumIter    = numIter;
                 model.Train(dataset);
                 TimeSpan span = DateTime.Now - startTime;
                 logger.Info(/*funcName=*/ null, "Trajanje gradnje modela: {0:00}:{1:00}:{2:00}.{3:000}.", span.Hours, span.Minutes, span.Seconds, span.Milliseconds);
                 logger.Info(/*funcName=*/ null, "Zapisujem model ...");
                 BinarySerializer writer = new BinarySerializer(modelFileName, FileMode.Create);
                 suffixTree.Save(writer);
                 Utils.SaveDictionary(featureSpace, writer);
                 model.Save(writer);
                 writer.Close();
                 logger.Info(/*funcName=*/ null, "Končano.");
             }
         }
     }
     catch (Exception exception)
     {
         Console.WriteLine();
         Console.WriteLine("*** Nepričakovana napaka. Podrobnosti: {0}\r\n{1}", exception, exception.StackTrace);
     }
 }
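
This example trains a maximum-entropy POS tagger: word–tag pairs from the tagged corpus (optionally extended with a tab-separated lexicon) are stored in a PatriciaTree suffix structure, a BinaryVector of features is generated per token, a MaximumEntropyClassifierFast<string> is trained on them, and finally the suffix tree, the feature space and the model are serialized into one file. The log messages are Slovenian — "Nalagam učni korpus ..." = "Loading the training corpus ...", "Nalagam leksikon ..." = "Loading the lexicon ...", "Poraba pomnilnika" = "Memory usage", "Pripravljam vektorje značilk ..." = "Preparing feature vectors ...", "Gradim model ..." = "Building the model ...", "Zapisujem model ..." = "Writing the model ...", "Končano." = "Done.".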