        public void Train(ILabeledExampleCollection <LblT, ExT> dataset)
        {
            Preconditions.CheckNotNull(dataset);

            var trainDataset = new LabeledDataset <LblT, ExT>(dataset);

            for (int i = 0; i < mInnerModels.Length; i++)
            {
                if (mInnerModels[i] == null)
                {
                    mInnerModels[i] = CreateModel(i);
                }
                mInnerModels[i].Train(GetTrainSet(i, mInnerModels[i], trainDataset));
            }

            foreach (LabeledExample <LblT, ExT> le in trainDataset)
            {
                LabeledExample <LblT, ExT> le_ = le; // local copy for safe capture in the lambda below
                string      key         = StringOf(mInnerModels.Select(m => m.Predict(le_.Example).BestClassLabel)); // key = concatenated inner-model predictions
                VotingEntry votingEntry = mVotingEntries[key];
                votingEntry.LabelCounts[le.Label]++;
            }
            foreach (VotingEntry entry in mVotingEntries.Values)
            {
                PerformVoting(entry);
            }

            IsTrained = true;
        }
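
A note on the voting scheme above: the dictionary key is the concatenation of the inner models' predicted labels, so each key identifies one combination of ensemble votes. A minimal sketch of what such an entry could look like (an assumption; the source does not show the VotingEntry type):

        // Hypothetical sketch, assuming VotingEntry is nested in the generic ensemble class so LblT is in scope.
        class VotingEntry
        {
            public Dictionary<LblT, int> LabelCounts { get; } = new Dictionary<LblT, int>();
            public LblT WinningLabel { get; set; } // set by PerformVoting, e.g. the argmax over LabelCounts
        }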
        public void Train(ILabeledExampleCollection <LblT, ExT> dataset)
        {
            Preconditions.CheckNotNull(dataset);

            var trainDataset = new LabeledDataset <LblT, ExT>(dataset);

            for (int i = 0; i < mInnerModels.Length; i++)
            {
                if (mInnerModels[i] == null)
                {
                    mInnerModels[i] = CreateModel(i);
                }
                mInnerModels[i].Train(GetTrainSet(i, mInnerModels[i], trainDataset));
            }

            foreach (LabeledExample <LblT, ExT> le in trainDataset)
            {
                LabeledExample <LblT, ExT> le_ = le; // local copy for safe capture in the lambda below
                double[] scores = GetPredictionScores(mInnerModels.Select(m => m.Predict(le_.Example)).ToArray()).ToArray();
                mTagDistrTable.AddCount(le.Label, scores);
            }
            mTagDistrTable.Calculate();

            IsTrained = true;
        }
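
This variant replaces majority voting with a distribution table over the inner models' prediction scores. A hedged sketch of the prediction-time counterpart (GetDistr is an assumed accessor name; the source shows only training):

        // Hypothetical lookup, assuming mTagDistrTable exposes the learned label distribution for a score vector.
        double[] scores = GetPredictionScores(mInnerModels.Select(m => m.Predict(example)).ToArray()).ToArray();
        Dictionary<LblT, double> labelDistr = mTagDistrTable.GetDistr(scores); // assumed accessor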
Example #3
        public static LabeledDataset <SentimentLabel, SparseVector <double> > InitBowSpace(BowSpace bowSpace,
                                                                                           IEnumerable <LabeledExample <SentimentLabel, string> > labeledExamples, IEnumerable <string> initExamples = null)
        {
            LabeledExample <SentimentLabel, string>[] examples = labeledExamples as LabeledExample <SentimentLabel, string>[] ?? labeledExamples.ToArray();

            List <SparseVector <double> > bowData;

            if (initExamples != null)
            {
                Preconditions.CheckArgument(!(bowSpace is DeltaBowSpace <SentimentLabel>));
                bowSpace.Initialize(initExamples);
                bowData = examples.Select(le => bowSpace.ProcessDocument(le.Example)).ToList();
            }
            else
            {
                bowData = bowSpace is DeltaBowSpace <SentimentLabel>
                    ? ((DeltaBowSpace <SentimentLabel>)bowSpace).Initialize(new LabeledDataset <SentimentLabel, string>(examples))
                    : bowSpace.Initialize(examples.Select(d => d.Example));
            }

            var bowDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >();

            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(examples[i].Label, bowData[i]);
            }
            return(bowDataset);
        }
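
A usage sketch for the helper above (variable names and settings are illustrative, not from the source):

        // labeledExamples holds (SentimentLabel, text) pairs; the initExamples parameter is optional.
        var bowSpace = new BowSpace { MaxNGramLen = 2, WordWeightType = WordWeightType.TfIdf };
        LabeledDataset<SentimentLabel, SparseVector<double>> bowDataset = InitBowSpace(bowSpace, labeledExamples);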
Example #4
 public static void SaveTab(string[] featureNames, LabeledDataset <BlogMetaData, SparseVector <double> > dataset, ClassType classType, string fileName)
 {
     using (StreamWriter w = new StreamWriter(fileName, /*append=*/ false, Encoding.ASCII))
     {
         // header row 1: feature names
         for (int i = 0; i < featureNames.Length; i++)
         {
             w.Write(featureNames[i] + "\t");
         }
         w.WriteLine("author");
         // header row 2: attribute types (c = continuous, d = discrete)
         for (int i = 0; i < featureNames.Length; i++)
         {
             w.Write("c\t");
         }
         w.WriteLine("d");
         // header row 3: attribute roles; "class" marks the target
         for (int i = 0; i < featureNames.Length; i++)
         {
             w.Write("\t");
         }
         w.WriteLine("class");
         foreach (LabeledExample <BlogMetaData, SparseVector <double> > lblEx in dataset)
         {
             foreach (string lblStr in AnalysisUtils.GetLabel(lblEx.Label, classType).Split(','))
             {
                 if (lblStr != "")
                 {
                      // note: only explicitly stored (non-zero) components are written,
                      // so rows align with the feature columns only for dense vectors
                      foreach (IdxDat <double> item in lblEx.Example)
                     {
                         w.Write(item.Dat + "\t");
                     }
                     w.WriteLine(lblStr);
                 }
             }
         }
     }
 }
Example #5
 public static void SaveArff(string[] featureNames, LabeledDataset <BlogMetaData, SparseVector <double> > dataset, ClassType classType, string fileName)
 {
     using (StreamWriter w = new StreamWriter(fileName, /*append=*/ false, Encoding.ASCII))
     {
         w.WriteLine("@RELATION r" + Guid.NewGuid().ToString("N"));
         w.WriteLine();
         foreach (string featureName in featureNames)
         {
             w.WriteLine("@ATTRIBUTE " + featureName + " NUMERIC");
         }
         w.Write("@ATTRIBUTE class ");
         ArrayList <string> classes = new ArrayList <string>();
         ((IEnumerable <LabeledExample <BlogMetaData, SparseVector <double> > >)dataset).ToList().ForEach(
             x => classes.AddRange(AnalysisUtils.GetLabel(x.Label, classType).Split(',')));
         classes = new ArrayList <string>(classes.Distinct());
         w.WriteLine(classes.ToString().Replace("( ", "{").Replace(" )", "}").Replace(" ", ","));
         w.WriteLine();
         w.WriteLine("@DATA");
         foreach (LabeledExample <BlogMetaData, SparseVector <double> > lblEx in dataset)
         {
             foreach (string lblStr in AnalysisUtils.GetLabel(lblEx.Label, classType).Split(','))
             {
                 if (lblStr != "")
                 {
                      // note: only explicitly stored (non-zero) components are written,
                      // so each row matches the attribute list only for dense vectors
                      foreach (IdxDat <double> item in lblEx.Example)
                     {
                         w.Write(item.Dat + ",");
                     }
                     w.WriteLine(lblStr);
                 }
             }
         }
     }
 }
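
For two features f1 and f2 and class labels a and b, the writer above produces a file of roughly this shape (illustrative values):

        @RELATION r0f8c2...

        @ATTRIBUTE f1 NUMERIC
        @ATTRIBUTE f2 NUMERIC
        @ATTRIBUTE class {a,b}

        @DATA
        0.5,1.2,a
        0.1,0.7,b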
        public void Train(ILabeledExampleCollection <LblT, ExT> dataset)
        {
            var binaryDataset = new LabeledDataset <LblT, ExT>(dataset.Select(le =>
                                                                              new LabeledExample <LblT, ExT>(le.Label.Equals(OneLabel) ? OneLabel : OtherLabel, le.Example)));

            mBinaryModel.Train(binaryDataset);
            IsTrained = true;
        }
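
The method above collapses every label other than OneLabel into OtherLabel before training. A plausible prediction counterpart (an assumption; only Train appears in the source):

        // Hedged sketch: the trained binary model already answers OneLabel vs. OtherLabel.
        public Prediction <LblT> Predict(ExT example)
        {
            Preconditions.CheckState(IsTrained);
            return mBinaryModel.Predict(example);
        }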
        public void TestSortedCheckFail()
        {
            // dataset labels are neither grouped nor sorted, so the stratified split below is expected to throw
            LabeledDataset <int, int> testSet, trainSet;
            LabeledDataset <int, int> ld = NewData(new[, ] {
                { 1, 10 }, { 2, 1 }, { 1, 1 }
            });

            ld.SplitForStratifiedCrossValidation(2, 1, out trainSet, out testSet);
        }
        private Model TrainModel(LabeledDataset <SentimentLabel, SparseVector <double> > dataset,
                                 SentimentLabel label, SentimentLabel otherLabel1, SentimentLabel otherLabel2)
        {
            IModel <SentimentLabel, SparseVector <double> > model = CreateModel();

            var otherLabelWeight1 = (double)dataset.Count(le => le.Label == otherLabel1) / dataset.Count(le => le.Label != label);
            var otherLabelWeight2 = (double)dataset.Count(le => le.Label == otherLabel2) / dataset.Count(le => le.Label != label);

            dataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset.Select(le =>
                                                                                                 new LabeledExample <SentimentLabel, SparseVector <double> >(le.Label == label ? label : otherLabel1, le.Example)));

            var scores       = new List <double>();
            var scoresOthers = new List <double>();
            var validation   = new CrossValidator <SentimentLabel, SparseVector <double> >
            {
                NumFolds = NumTrainFolds,
                Dataset  = dataset,

                OnAfterPrediction = (sender, foldN, m, ex, le, prediction) =>
                {
                    if (le.Label == prediction.BestClassLabel)
                    {
                        if (prediction.BestClassLabel == label)
                        {
                            scores.Add(prediction.BestScore);
                        }
                        else
                        {
                            scoresOthers.Add(prediction.BestScore);
                        }
                    }
                    return(true);
                }
            };

            validation.Models.Add(model);
            validation.Run();

            // train model
            model.Train(dataset);

            return(new Model
            {
                InnerModel = model,
                Weight = validation.PerfData.GetSumPerfMatrix(validation.ExpName, validation.GetModelName(model)).GetMacroF1(),
                Label = label,
                OtherLabel1 = otherLabel1,
                OtherLabelWeight1 = otherLabelWeight1,
                OtherLabel2 = otherLabel2,
                OtherLabelWeight2 = otherLabelWeight2,
                Scores = scores.OrderBy(s => s).ToArray(),
                ScoresOthers = scoresOthers.OrderBy(s => s).ToArray()
            });
        }
 public void Train(ILabeledExampleCollection <LblT, ExT> dataset)
 {
     foreach (ModelLabel modelLabel in ModelLabels.Take(ModelLabels.Count() - 1))
     {
         modelLabel.Model.Train(dataset);
          ModelLabel modelLabel_ = modelLabel; // local copy for safe capture in the lambda below
         dataset = new LabeledDataset <LblT, ExT>(dataset.Where(le => !le.Label.Equals(modelLabel_.Label)));
     }
     ModelLabels.Last().Model.Train(dataset);
     IsTrained = true;
 }
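
Each model in this chain learns its own label against everything later models will handle, and recognized examples are removed before the next model trains. A hedged sketch of the matching cascade at prediction time (not shown in the source):

        // Hypothetical cascade: accept the first model that claims the example, else fall through to the last.
        public Prediction <LblT> Predict(ExT example)
        {
            foreach (ModelLabel modelLabel in ModelLabels.Take(ModelLabels.Count() - 1))
            {
                Prediction <LblT> prediction = modelLabel.Model.Predict(example);
                if (prediction.BestClassLabel.Equals(modelLabel.Label)) { return prediction; }
            }
            return ModelLabels.Last().Model.Predict(example);
        }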
Example #10
        static void GetExtremes <T>(LabeledDataset <T, SparseVector <double> > dataset, out SparseVector <double> minValues, out SparseVector <double> maxValues)
        {
            minValues = new SparseVector <double>();
            maxValues = new SparseVector <double>();
            int maxIdx = ((IEnumerableList <SparseVector <double> >)dataset).Max(x => x.Max(y => y.Idx));

            for (int featureIdx = 0; featureIdx <= maxIdx; featureIdx++)
            {
                minValues[featureIdx] = ((IEnumerableList <SparseVector <double> >)dataset).Min(x => x[featureIdx]);
                maxValues[featureIdx] = ((IEnumerableList <SparseVector <double> >)dataset).Max(x => x[featureIdx]);
            }
        }
        public override void Train(ILabeledExampleCollection <SentimentLabel, SparseVector <double> > dataset)
        {
            Preconditions.CheckNotNull(dataset);

            var ds = new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset
                                                                                 .Select(le => new LabeledExample <SentimentLabel, SparseVector <double> >(le.Label, le.Example)));

            mPosModel = TrainModel(ds, SentimentLabel.Positive, SentimentLabel.Negative, SentimentLabel.Neutral);
            mNegModel = TrainModel(ds, SentimentLabel.Negative, SentimentLabel.Positive, SentimentLabel.Neutral);
            mNeuModel = TrainModel(ds, SentimentLabel.Neutral, SentimentLabel.Positive, SentimentLabel.Negative);

            IsTrained = true;
        }
Example #12
        static LabeledDataset <string, SparseVector <double> > CreateSingleFeatureDataset(LabeledDataset <BlogMetaData, SparseVector <double> > srcDataset, ClassType classType, int fIdx)
        {
            SparseVector <double> minValues, maxValues;

            GetExtremes(srcDataset, out minValues, out maxValues);
            LabeledDataset <string, SparseVector <double> > dataset = new LabeledDataset <string, SparseVector <double> >();

            ((IEnumerable <LabeledExample <BlogMetaData, SparseVector <double> > >)srcDataset).ToList()
            .ForEach(x => dataset.Add(new LabeledExample <string, SparseVector <double> >(AnalysisUtils.GetLabel(x.Label, classType),
                                                                                          new SparseVector <double>(
                                                                                              new double[] { (x.Example[fIdx] - minValues[fIdx]) / (maxValues[fIdx] - minValues[fIdx]) } // simple normalization
                                                                                              ))));
            return(dataset);
        }
Example #13
        public override sealed void Train(ILabeledExampleCollection <SentimentLabel, SparseVector <double> > dataset)
        {
            Preconditions.CheckNotNull(dataset);

            var labeledDataset = (LabeledDataset <SentimentLabel, SparseVector <double> >)dataset;

            var trainDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(labeledDataset.Where(le => le.Label != SentimentLabel.Neutral));

            if (mBinaryClassifier == null)
            {
                mBinaryClassifier = CreateModel();
                mBinaryClassifier.Train(trainDataset);
            }

            IsTrained = true;

            /*Calculate positive and negative average distances*/
            int positiveTweetsNumber = 0;
            int negativeTweetsNumber = 0;

            PosAverageDistance = 0;
            NegAverageDistance = 0;

            foreach (LabeledExample <SentimentLabel, SparseVector <double> > example in trainDataset)
            {
                Prediction <SentimentLabel> prediction = mBinaryClassifier.Predict(example.Example);
                SentimentLabel bestLabelPredicted      = prediction.BestClassLabel;
                double         bestScorePredicted      = bestLabelPredicted == SentimentLabel.Negative ? -prediction.BestScore : prediction.BestScore;

                SentimentLabel actualLabel = example.Label;

                if (actualLabel == SentimentLabel.Positive)
                {
                    PosAverageDistance += bestScorePredicted;
                    positiveTweetsNumber++;
                }
                else if (actualLabel == SentimentLabel.Negative)
                {
                    NegAverageDistance += bestScorePredicted;
                    negativeTweetsNumber++;
                }
            }

            PosAverageDistance = PosAverageDistance / positiveTweetsNumber;
            NegAverageDistance = NegAverageDistance / negativeTweetsNumber;
        }
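
PosAverageDistance and NegAverageDistance summarize how far correctly labeled training examples sit from the binary separating plane, on a signed axis where positive means more positive sentiment. One typical use of such averages (an assumption; the source shows only their computation) is to carve a neutral band between them:

        // Hypothetical helper: scores falling between the two class averages are treated as Neutral.
        private bool IsInNeutralBand(double signedScore)
        {
            return signedScore > NegAverageDistance && signedScore < PosAverageDistance;
        }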
        private static LabeledDataset <int, int> NewData(int[,] labelCounts, bool sortShuffled = false)
        {
            var result = new LabeledDataset <int, int>();

            for (int i = 0, k = 1; i <= labelCounts.GetUpperBound(0); i++)
            {
                int label = labelCounts[i, 0], count = labelCounts[i, 1];
                for (int j = 0; j < count; j++)
                {
                    result.Add(label, k++);
                }
            }
            if (sortShuffled)
            {
                result.GroupLabels(true);
            }
            return(result);
        }
Example #15
        public override void Train(ILabeledExampleCollection <SentimentLabel, SparseVector <double> > dataset)
        {
            Preconditions.CheckNotNull(dataset);

            var posDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset.Select(le =>
                                                                                                        new LabeledExample <SentimentLabel, SparseVector <double> >(le.Label == SentimentLabel.Positive
                    ? SentimentLabel.Positive : SentimentLabel.Negative, le.Example)));

            mPosClassifier = CreateModel();
            mPosClassifier.Train(posDataset);

            var negDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset.Select(le =>
                                                                                                        new LabeledExample <SentimentLabel, SparseVector <double> >(le.Label == SentimentLabel.Negative
                    ? SentimentLabel.Negative : SentimentLabel.Positive, le.Example)));

            mNegClassifier = CreateModel();
            mNegClassifier.Train(negDataset);

            if (PosBiasCalibration != null || NegBiasCalibration != null)
            {
                var    labeledDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >(dataset);
                double?posBias        = Calibrate(true, labeledDataset);
                double?negBias        = Calibrate(false, labeledDataset);
                BiasToPosRate = posBias ?? BiasToPosRate;
                BiasToNegRate = negBias ?? BiasToNegRate;
            }

            mPosSortedScores = mNegSortedScores = null;
            mExampleScores   = dataset.Select(le =>
            {
                Prediction <SentimentLabel> posPrediction = mPosClassifier.Predict(le.Example);
                Prediction <SentimentLabel> negPrediction = mNegClassifier.Predict(le.Example);
                return(new ExampleScore
                {
                    Label = le.Label,
                    // orient both scores on a single axis where larger values mean "more positive"
                    PosScore = posPrediction.BestClassLabel == SentimentLabel.Positive ? posPrediction.BestScore : -posPrediction.BestScore,
                    NegScore = negPrediction.BestClassLabel == SentimentLabel.Negative ? -negPrediction.BestScore : negPrediction.BestScore
                });
            }).ToArray();

            UpdateDistrTable();

            IsTrained = true;
        }
        private Model TrainModel(LabeledDataset <SentimentLabel, SparseVector <double> > dataset,
                                 SentimentLabel label1, SentimentLabel label2)
        {
            IModel <SentimentLabel, SparseVector <double> > model = CreateModel();
            var scores1 = new List <double>();
            var scores2 = new List <double>();

            var validation = new CrossValidator <SentimentLabel, SparseVector <double> >
            {
                NumFolds = NumTrainFolds,
                Dataset  = dataset,

                OnAfterPrediction = (sender, foldN, m, ex, le, prediction) =>
                {
                    if (le.Label == prediction.BestClassLabel)
                    {
                        if (prediction.BestClassLabel == label1)
                        {
                            scores1.Add(prediction.BestScore);
                        }
                        else if (prediction.BestClassLabel == label2)
                        {
                            scores2.Add(prediction.BestScore);
                        }
                    }
                    return(true);
                }
            };

            validation.Models.Add(model);
            validation.Run();

            // train model
            model.Train(dataset);
            return(new Model
            {
                InnerModel = model,
                Label1 = label1,
                Label2 = label2,
                Scores1 = scores1.OrderBy(s => s).ToArray(),
                Scores2 = scores2.OrderBy(s => s).ToArray(),
                Weight = validation.PerfData.GetSumPerfMatrix(validation.ExpName, validation.GetModelName(model)).GetMacroF1()
            });
        }
Example #17
        public ValidationTask(TaskContext taskContext)
        {
            Context = Preconditions.CheckNotNull(taskContext);

            LabeledExample <SentimentLabel, string>[] labeledExamples = taskContext.DataSource.GetData().ToArray();
            TaskUtils.ProcessFeatures(taskContext, labeledExamples);
            var labeledDataset = new LabeledDataset <SentimentLabel, string>(labeledExamples);

            // lazy model creation
            IEnumerable <Func <IModel <SentimentLabel, SparseVector <double> > > > modelFactory = Enumerable.Range(0, taskContext.Models.Length)
                                                                                                  .Select <int, Func <IModel <SentimentLabel, SparseVector <double> > > >(i => () => taskContext.ModelFactory(i));

            Validator = new FoldLocalBowCrossValidator <SentimentLabel>(modelFactory)
            {
                Dataset       = labeledDataset,
                BowSpaceFunc  = taskContext.BowSpaceFactory,
                ModelNameFunc = (sender, m) => taskContext.GetModelName(m)
            };
        }
Example #18
        private double? Calibrate(bool doPosPlane, LabeledDataset <SentimentLabel, SparseVector <double> > dataset)
        {
            BiasCalibration calibration = doPosPlane ? PosBiasCalibration : NegBiasCalibration;

            if (calibration == null)
            {
                return(null);
            }

            Preconditions.CheckArgument(calibration.BiasStep > 0);
            Preconditions.CheckNotNull(calibration.OptimizationFunc);

            double maxScore       = double.MinValue;
            double optimalBias    = 0;
            var    biasScorePairs = calibration.IsSaveBiasScorePairs ? new List <Tuple <double, double> >() : null;

            for (double bias = calibration.BiasLowerBound; bias <= calibration.BiasUpperBound; bias += calibration.BiasStep)
            {
                var matrix = new PerfMatrix <SentimentLabel>(null);
                foreach (LabeledExample <SentimentLabel, SparseVector <double> > le in dataset)
                {
                    Prediction <SentimentLabel> prediction = PredictInternal(le.Example, doPosPlane ? bias : 0, doPosPlane ? 0 : bias);
                    matrix.AddCount(le.Label, prediction.BestClassLabel);
                }
                double score = calibration.OptimizationFunc(matrix);
                if (score > maxScore)
                {
                    maxScore    = score;
                    optimalBias = bias;
                }
                if (biasScorePairs != null)
                {
                    biasScorePairs.Add(new Tuple <double, double>(bias, score));
                }
                Console.WriteLine("{0}\t{1:0.000}\t{2:0.000}", doPosPlane, bias, score);
            }
            if (biasScorePairs != null)
            {
                calibration.BiasScorePairs = biasScorePairs.ToArray();
            }

            return(optimalBias);
        }
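
Calibrate grid-searches the bias range and scores each candidate with OptimizationFunc applied to a confusion matrix over the dataset. An illustrative configuration (the property names come from the source; the bounds, step, and choice of metric are assumptions):

        // Illustrative calibration setup.
        PosBiasCalibration = new BiasCalibration
        {
            BiasLowerBound       = -1.0,
            BiasUpperBound       = 1.0,
            BiasStep             = 0.05,
            OptimizationFunc     = matrix => matrix.GetMacroF1(), // macro-F1 is one reasonable choice
            IsSaveBiasScorePairs = true
        };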
        public void TestEvenlyDistributed()
        {
            int size = DatasetSize;

            for (int numLabels = 2; numLabels <= size / 2; numLabels++)
            {
                var labelCounts = new int[numLabels, 2];
                for (int label = 1; label <= numLabels; label++)
                {
                    int segSize = size / numLabels;
                    if (label <= size % numLabels)
                    {
                        segSize++;
                    }
                    labelCounts[label - 1, 0] = label;
                    labelCounts[label - 1, 1] = segSize;
                }
                double labelDistr            = 1.0 / numLabels;
                LabeledDataset <int, int> ld = NewData(labelCounts, true);
                for (int numFolds = 2; numFolds <= size / numLabels; numFolds++)
                {
                    var aggTestSet = new LabeledDataset <int, int>();
                    for (int i = 0; i < numFolds; i++)
                    {
                        LabeledDataset <int, int> trainSet, testSet;
                        ld.SplitForStratifiedCrossValidation(numFolds, i + 1, out trainSet, out testSet);
                        AssertSetEquality(trainSet.Concat(testSet), ld);
                        aggTestSet.AddRange(testSet);

                        foreach (double distr in testSet.GroupBy(le => le.Label).Select(g => (double)g.Count() / testSet.Count))
                        {
                            Assert.IsTrue(Math.Abs(labelDistr - distr) <= 1.0 / testSet.Count);
                        }
                        foreach (double distr in trainSet.GroupBy(le => le.Label).Select(g => (double)g.Count() / trainSet.Count))
                        {
                            Assert.IsTrue(Math.Abs(labelDistr - distr) <= 1.0 / trainSet.Count);
                        }
                    }
                    AssertSetEquality(aggTestSet, ld);
                }
            }
        }
Example #20
        protected override ILabeledDataset <LblT, SparseVector <double> > MapTrainSet(int foldN, ILabeledDataset <LblT, string> trainSet)
        {
            BowSpace bowSpace;

            Preconditions.CheckState(!mFoldBowSpaces.TryGetValue(foldN, out bowSpace));
            Preconditions.CheckState(mFoldBowSpaces.TryAdd(foldN, bowSpace = BowSpaceFunc()));

            List <SparseVector <double> > bowData = bowSpace is DeltaBowSpace <LblT>
                ? ((DeltaBowSpace <LblT>)bowSpace).Initialize(trainSet)
                : bowSpace.Initialize(trainSet.Select(d => d.Example));

            var bowDataset = new LabeledDataset <LblT, SparseVector <double> >();

            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(trainSet[i].Label, bowData[i]);
            }

            return(bowDataset);
        }
Example #21
        public override void Train(ILabeledExampleCollection <SentimentLabel, SparseVector <double> > dataset)
        {
            Preconditions.CheckState(BowSpace != null);
            var replDataset = new LabeledDataset <SentimentLabel, SparseVector <double> >();

            foreach (LabeledExample <SentimentLabel, SparseVector <double> > le in dataset)
            {
                SparseVector <double> vector1, vector2;
                Replicate(le.Example, out vector1, out vector2);

                replDataset.Add(new LabeledExample <SentimentLabel, SparseVector <double> >(
                                    le.Label == SentimentLabel.Neutral ? SentimentLabel.Negative : le.Label, vector1));
                replDataset.Add(new LabeledExample <SentimentLabel, SparseVector <double> >(
                                    le.Label == SentimentLabel.Neutral ? SentimentLabel.Positive : le.Label, vector2));
            }

            mClassifier = CreateModel();
            mClassifier.Train(replDataset);

            IsTrained = true;
        }
        public void TestGroupedCheckOk()
        {
            LabeledDataset <int, int> testSet, trainSet;
            LabeledDataset <int, int> ld = NewData(new[, ] {
                { 1, 10 }, { 2, 1 }
            });

            ld.SplitForStratifiedCrossValidation(2, 1, out trainSet, out testSet);

            ld = NewData(new[, ] {
                { 1, 10 }, { 2, 1 }, { 1, 1 }
            });
            ld.GroupLabels(true);
            ld.SplitForStratifiedCrossValidation(2, 1, out trainSet, out testSet);

            ld = NewData(new[, ] {
                { 1, 10 }, { 2, 1 }, { 1, 1 }, { 2, 10 }
            });
            ld.GroupLabels(true);
            ld.SplitForStratifiedCrossValidation(2, 1, out trainSet, out testSet);
        }
Example #23
 public void TrainModels(IEnumerable <Author> authors)
 {
     foreach (Author author in authors)
     {
          // dataset of all *other* untagged authors' texts, labeled with their names
          LabeledDataset <string, SparseVector <double> > ds = new LabeledDataset <string, SparseVector <double> >();
         foreach (Author otherAuthor in authors)
         {
             if (otherAuthor != author && !otherAuthor.mIsTagged)
             {
                 foreach (Text text in otherAuthor.mTexts)
                 {
                     ds.Add(new LabeledExample <string, SparseVector <double> >(otherAuthor.mName, text.mFeatureVectors[mSelector]));
                 }
             }
         }
         SvmMulticlassClassifier <string> model = new SvmMulticlassClassifier <string>();
         model.C = Convert.ToDouble(Utils.GetConfigValue("SvmMultiClassC", "5000"));
         model.Train(ds);
         mModels.Add(author.mName, model);
     }
 }
 public void TestFolding()
 {
     for (int size = 2; size <= DatasetSize; size++)
     {
         LabeledDataset <int, int> ld = NewData(new[, ] {
             { 1, size }
         }, true);
         for (int numFolds = 2; numFolds <= size; numFolds++)
         {
             var aggTestSet = new LabeledDataset <int, int>();
             for (int i = 0; i < numFolds; i++)
             {
                 LabeledDataset <int, int> trainSet, testSet;
                 ld.SplitForStratifiedCrossValidation(numFolds, i + 1, out trainSet, out testSet);
                 AssertSetEquality(trainSet.Concat(testSet), ld);
                 aggTestSet.AddRange(testSet);
             }
             AssertSetEquality(aggTestSet, ld);
         }
     }
 }
Example #25
        public void Train(ILabeledExampleCollection <LblT, string> dataset)
        {
            Preconditions.CheckState(!IsTrained);
            Preconditions.CheckNotNull(dataset);
            Preconditions.CheckNotNull(BowSpace);
            Preconditions.CheckNotNull(FeatureProcessor);
            Preconditions.CheckNotNull(Model);

            // preprocess the text
            foreach (LabeledExample <LblT, string> le in dataset)
            {
                le.Example = FeatureProcessor.Run(le.Example);
            }

            // bow vectors
            List <SparseVector <double> > bowData = BowSpace is DeltaBowSpace <LblT>
                ? (BowSpace as DeltaBowSpace <LblT>).Initialize(dataset as ILabeledDataset <LblT, string> ?? new LabeledDataset <LblT, string>(dataset))
                : BowSpace.Initialize(dataset.Select(d => d.Example));
            var bowDataset = new LabeledDataset <LblT, SparseVector <double> >();

            for (int i = 0; i < bowData.Count; i++)
            {
                bowDataset.Add(dataset[i].Label, bowData[i]);
            }

            // train
            if (OnTrainModel == null)
            {
                Model.Train(bowDataset);
            }
            else
            {
                OnTrainModel(this, bowDataset);
            }

            IsTrained = true;
        }
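
A wiring sketch for the components this Train method checks (the host class name and concrete component types are assumptions; BowSpace, FeatureProcessor, and Model are the properties used above):

        // Hypothetical pipeline setup.
        var classifier = new TextClassifier <SentimentLabel>           // assumed host class name
        {
            BowSpace         = new BowSpace { MaxNGramLen = 2 },
            FeatureProcessor = new TextFeatureProcessor(),             // assumed processor type
            Model            = new SvmBinaryClassifier <SentimentLabel>()
        };
        classifier.Train(labeledExamples); // ILabeledExampleCollection<SentimentLabel, string>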
Example #26
        public override void Run(object[] args)
        {
            // prepare data
            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

            // Create a tokenizer.
            var tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,                            // Each token must be at least 2 characters long.
                Filter      = TokenizerFilter.AlphaStrict   // Tokens can consist of alphabetic characters only.
            };

            // take data for the two polar classes from the CSV file (Polarity == 2, i.e. neutral, is skipped)
            var data = new List <LabeledTweet>(GetLabeledTweets().Where(lt => lt.Polarity != 2)).ToList();

            // Create a bag-of-words space.
            var bowSpc = new BowSpace
            {
                Tokenizer      = tokenizer,                 // Assign the tokenizer.
                StopWords      = stopWords,                 // Assign the stop words.
                Stemmer        = stemmer,                   // Assign the stemmer.
                MinWordFreq    = 1,                         // A term must appear at least n-times in the corpus for it to be part of the vocabulary.
                MaxNGramLen    = 2,                         // Terms consisting of at most n-consecutive words will be considered.
                WordWeightType = WordWeightType.TermFreq,   // Set the weighting scheme for the bag-of-words vectors to TF.
                //WordWeightType = WordWeightType.TfIdf,    // Alternatively, use TF-IDF weighting.
                NormalizeVectors  = true,                   // The vectors will be normalized.
                CutLowWeightsPerc = 0                       // Keep all terms (e.g. 0.2 would trim the lowest-weighted terms summing up to 20% of the weight).
            };
            ArrayList <SparseVector <double> > bowData = bowSpc.Initialize(data.Select(d => d.Text));

            // label data
            var labeledSet = new LabeledDataset <string, SparseVector <double> >();

            for (int i = 0; i < data.Count; i++)
            {
                labeledSet.Add(data[i].Label, bowData[i]);
            }
            labeledSet.Shuffle();

            int testSize    = labeledSet.Count / 10;
            var trainingSet = new LabeledDataset <string, SparseVector <double> >(labeledSet.Skip(testSize));
            var testSet     = new LabeledDataset <string, SparseVector <double> >(labeledSet.Take(testSize));

            //-------------------- SVM

            var svmBinClass = new SvmBinaryClassifier <string> {
                VerbosityLevel = SvmLightVerbosityLevel.Off
            };

            if (args.Any())
            {
                svmBinClass.C = (int)args[0];
            }
            //svmBinClass.BiasedHyperplane = true;
            //svmBinClass.CustomParams = "-t 3";   // non-linear kernel
            //svmBinClass.CustomParams = String.Format("-j {0}",j);

            svmBinClass.Train(trainingSet);

            int    correct = 0;
            double avgDist = 0;

            foreach (LabeledExample <string, SparseVector <double> > labeledExample in testSet)
            {
                var prediction = svmBinClass.Predict(labeledExample.Example);
                //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
                avgDist += prediction.BestScore;
                if (prediction.BestClassLabel == labeledExample.Label)
                {
                    correct++;
                }
            }

            Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
            Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

            Result.Add("accuracy", (double)correct / testSet.Count);

            Result.Add("classifier", svmBinClass);
            Result.Add("labeled_data", labeledSet);
        }
 protected abstract LabeledDataset <LblT, ExT> GetTrainSet(int modelIdx, IModel <LblT, ExT> model, LabeledDataset <LblT, ExT> trainSet);
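
This hook lets each inner model of the ensemble above train on its own view of the data. A possible override (an assumption; shown here as a bootstrap sample, which would make the ensemble bagging-like):

        // Hypothetical override: bootstrap-sample the training set for each inner model.
        protected override LabeledDataset <LblT, ExT> GetTrainSet(int modelIdx, IModel <LblT, ExT> model, LabeledDataset <LblT, ExT> trainSet)
        {
            var random = new Random(modelIdx);              // deterministic per model index
            var sample = new LabeledDataset <LblT, ExT>();
            for (int i = 0; i < trainSet.Count; i++)
            {
                sample.Add(trainSet[random.Next(trainSet.Count)]); // sample with replacement
            }
            return sample;
        }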
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            // clustering
            mLogger.Info("ComputeLayout", "Clustering ...");
            mKMeans        = new IncrementalKMeans(mKClust);
            mKMeans.Eps    = mKMeansEps;
            mKMeans.Random = mRandom;
            mKMeans.Trials = 3;
            ClusteringResult clustering = mKMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (SparseVector <double> centroid in mKMeans.GetCentroids())
            {
                dsRefInst.Add(centroid); // dataset of reference instances
                mDataset.Add(centroid);  // add centroids to the main dataset
            }
            // position reference instances
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random   = mRandom;
            sm.MaxSteps = int.MaxValue;
            sm.MinDiff  = 0.00001;
            mRefPos     = sm.ComputeLayout();
            // k-NN
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            mPatches = new ArrayList <Patch>(mDataset.Count);
            for (int i = 0; i < mDataset.Count; i++)
            {
                mPatches.Add(new Patch(i));
            }
            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance);
                int count = Math.Min(knn.Count, mKNnExt);
                for (int i = 0; i < count; i++)
                {
                    mPatches[simMtxRow.Idx].List.Add(new KeyDat <double, Patch>(knn[i].Key, mPatches[knn[i].Dat]));
                }
                mPatches[simMtxRow.Idx].ProcessList();
                count = Math.Min(knn.Count, mKNn);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
            for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(mRefPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            lsqr.Train(lsqrDs);
            mSolX = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = mRefPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            mSolY = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }
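
The linear system assembled above encodes, for every ordinary instance i, that its position is the average of its k nearest neighbors, while the appended rows pin the k cluster centroids to their stress-majorization coordinates:

        x_i - (1/k) * sum_{j in kNN(i)} x_j = 0      (one equation per instance, right-hand side 0)
        x_r = refPos_r                               (one equation per centroid)

LSQR solves this sparse least-squares system twice: once for the X coordinates and once, after relabeling the right-hand sides, for Y.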
Example #29
        public Vector2D[] ComputeLayout(LayoutSettings settings)
        {
            UnlabeledDataset <SparseVector <double> > dataset = new UnlabeledDataset <SparseVector <double> >(mDataset);

            // clustering
            mLogger.Info("ComputeLayout", "Clustering ...");
            KMeansFast kMeans = new KMeansFast(mKClust);

            kMeans.Eps    = mKMeansEps;
            kMeans.Random = mRandom;
            kMeans.Trials = 1;
            ClusteringResult clustering = kMeans.Cluster(mDataset); // throws ArgumentValueException
            // determine reference instances
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();

            foreach (Cluster cluster in clustering.Roots)
            {
                SparseVector <double> centroid
                    = cluster.Items.Count > 0 ? cluster.ComputeCentroid(mDataset, CentroidType.NrmL2) : new SparseVector <double>();
                dsRefInst.Add(centroid); // dataset of reference instances
                dataset.Add(centroid);   // add centroids to the main dataset
            }
            // position reference instances
            mLogger.Info("ComputeLayout", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random = mRandom;
            Vector2D[] centrPos = sm.ComputeLayout();
            // k-NN
            mLogger.Info("ComputeLayout", "Computing similarities ...");
            simMtx = ModelUtils.GetDotProductSimilarity(dataset, mSimThresh, /*fullMatrix=*/ true);
            mLogger.Info("ComputeLayout", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
            {
                if (simMtxRow.Dat.Count <= 1)
                {
                    mLogger.Warn("ComputeLayout", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                }
                ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                foreach (IdxDat <double> item in simMtxRow.Dat)
                {
                    if (item.Idx != simMtxRow.Idx)
                    {
                        knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                    }
                }
                knn.Sort(DescSort <KeyDat <double, int> > .Instance);
                int count = Math.Min(knn.Count, mKNN);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(knn[i].Dat);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[simMtxRow.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            Vector2D[] layout = new Vector2D[dataset.Count - mKClust];
            for (int i = dataset.Count - mKClust, j = 0; i < dataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(centrPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = centrPos[j].Y;
            }
            lsqr.Train(lsqrDs);
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }
        // TODO: exceptions
        public Vector2D[] Update(int numDequeue, IEnumerable <SparseVector <double> > newInst, bool test, LayoutSettings settings, ref PtInfo[] ptInfo, int _count)
        {
            // clustering
            mLogger.Info("Update", "Clustering ...");
            /*prof*/ StopWatch sw = new StopWatch();

            mKMeans.Eps = mKMeansEps;
            int iter = 0;

            mKMeans.Update(numDequeue, newInst, ref iter);
            /*prof*/ sw.Save("cl.txt", _count, iter.ToString());
            // determine reference instances
            /*prof*/ sw.Reset();
            UnlabeledDataset <SparseVector <double> > dsRefInst = new UnlabeledDataset <SparseVector <double> >();
            UnlabeledDataset <SparseVector <double> > dsNewInst = new UnlabeledDataset <SparseVector <double> >(newInst);

            foreach (SparseVector <double> centroid in mKMeans.GetCentroids())
            {
                dsRefInst.Add(centroid); // dataset of reference instances
                dsNewInst.Add(centroid); // dataset of new instances
            }
            // position reference instances
            mLogger.Info("Update", "Positioning reference instances ...");
            SparseMatrix <double>    simMtx = ModelUtils.GetDotProductSimilarity(dsRefInst, mSimThresh, /*fullMatrix=*/ false);
            StressMajorizationLayout sm     = new StressMajorizationLayout(dsRefInst.Count, new DistFunc(simMtx));

            sm.Random   = mRandom;
            sm.MaxSteps = int.MaxValue;
            sm.MinDiff  = 1E-3;
            mRefPos     = sm.ComputeLayout(/*settings=*/ null, mRefPos /*make this a property!!!*/);
            /*prof*/ sw.Save("sm.txt", _count);
            // k-NN
            /*prof*/ sw.Reset();
            DateTime t = DateTime.Now;

            mLogger.Info("Update", "Computing similarities ...");
            // update list of neighborhoods
            mPatches.RemoveRange(mDataset.Count - mKClust, mKClust);
            mPatches.RemoveRange(0, numDequeue);
            // remove instances from [dataset and] neighborhoods
            foreach (Patch patch in mPatches)
            {
                if (patch.Min != null && (patch.Min.Idx < numDequeue || patch.Max.Idx >= mDataset.Count - mKClust))
                {
                    int oldCount = patch.List.Count;
                    ArrayList <KeyDat <double, Patch> > tmp = new ArrayList <KeyDat <double, Patch> >();
                    foreach (KeyDat <double, Patch> item in patch.List)
                    {
                        if (item.Dat.Idx >= numDequeue && item.Dat.Idx < mDataset.Count - mKClust)
                        {
                            tmp.Add(item);
                        }
                        //else
                        //{
                        //    Console.WriteLine("Remove {0}", item.Dat.Idx - numDequeue);
                        //}
                    }
                    patch.List = tmp;
                    patch.ProcessList();
                    patch.NeedUpdate = patch.List.Count < mKNn && oldCount >= mKNn;
                }
            }
            // update dataset
            mDataset.RemoveRange(mDataset.Count - mKClust, mKClust);
            mDataset.RemoveRange(0, numDequeue);
            // add new instances to dataset
            int preAddCount = mDataset.Count;

            mDataset.AddRange(dsNewInst);
            // precompute transposed matrices
            SparseMatrix <double> trNewInst = ModelUtils.GetTransposedMatrix(dsNewInst);
            SparseMatrix <double> trDataset = ModelUtils.GetTransposedMatrix(mDataset);

            // add new instances to neighborhoods
            for (int i = 0; i < dsNewInst.Count; i++)
            {
                mPatches.Add(new Patch(-1));
                mPatches.Last.NeedUpdate = true;
            }
            for (int i = 0; i < mPatches.Count; i++)
            {
                mPatches[i].Idx = i;
            }
            for (int i = 0; i < mPatches.Count; i++)
            {
                Patch patch = mPatches[i];
                SparseVector <double> vec = mDataset[i];
                if (vec != null)
                {
                    if (patch.NeedUpdate) // full update required
                    {
                        //if (i == 1347) { Console.WriteLine("full update"); }
                        SparseVector <double>             simVec = ModelUtils.GetDotProductSimilarity(trDataset, mDataset.Count, vec, mSimThresh);
                        ArrayList <KeyDat <double, int> > tmp    = new ArrayList <KeyDat <double, int> >();
                        foreach (IdxDat <double> item in simVec)
                        {
                            if (item.Idx != i)
                            {
                                tmp.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                            }
                        }
                        tmp.Sort(new Comparer2());
                        int count = Math.Min(tmp.Count, mKNnExt);
                        patch.List.Clear();
                        for (int j = 0; j < count; j++)
                        {
                            patch.List.Add(new KeyDat <double, Patch>(tmp[j].Key, mPatches[tmp[j].Dat]));
                        }
                        patch.ProcessList();
                        patch.NeedUpdate = false;
                    }
                    else // only new instances need to be considered
                    {
                        //if (i == 1347) { Console.WriteLine("partial update"); }
                        SparseVector <double> simVec = ModelUtils.GetDotProductSimilarity(trNewInst, dsNewInst.Count, vec, mSimThresh);
                        // check if further processing is needed
                        bool needMerge = false;
                        if (test)
                        {
                            foreach (IdxDat <double> item in simVec)
                            {
                                if (item.Dat >= patch.MinSim)
                                {
                                    needMerge = true;
                                    //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                                    break;
                                }
                            }
                        }
                        else
                        {
                            foreach (IdxDat <double> item in simVec)
                            {
                                if (item.Dat > patch.MinSim)
                                {
                                    needMerge = true;
                                    //Console.WriteLine("{0} {1}", item.Dat, patch.MinSim);
                                    break;
                                }
                            }
                        }
                        if (needMerge || patch.List.Count < mKNn)
                        {
                            //if (i == 1347) { Console.WriteLine("merge"); }
                            int oldCount = patch.List.Count;
                            ArrayList <KeyDat <double, Patch> > tmp = new ArrayList <KeyDat <double, Patch> >();
                            foreach (IdxDat <double> item in simVec)
                            {
                                tmp.Add(new KeyDat <double, Patch>(item.Dat, mPatches[item.Idx + preAddCount]));
                            }
                            // merge the two lists
                            // TODO: speed this up
                            patch.List.AddRange(tmp);
                            patch.List.Sort(new Comparer());
                            // trim list to size
                            if (oldCount >= mKNn)
                            {
                                patch.List.RemoveRange(oldCount, patch.List.Count - oldCount);
                            }
                            patch.ProcessList();
                        }
                    }
                }
            }
            /*prof*/ sw.Save("knn.txt", _count);
            // *** Test ***
            sw.Reset();
            ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
            sw.Save("selfSim.txt", _count, mDataset.Count.ToString());
            if (test)
            {
                simMtx = ModelUtils.GetDotProductSimilarity(mDataset, mSimThresh, /*fullMatrix=*/ true);
                ArrayList <Patch> patches = new ArrayList <Patch>();
                for (int i = 0; i < mDataset.Count; i++)
                {
                    patches.Add(new Patch(i));
                }
                foreach (IdxDat <SparseVector <double> > simMtxRow in simMtx)
                {
                    if (simMtxRow.Dat.Count <= 1)
                    {
                        mLogger.Warn("Update", "Instance #{0} has no neighborhood.", simMtxRow.Idx);
                    }
                    ArrayList <KeyDat <double, int> > knn = new ArrayList <KeyDat <double, int> >(simMtxRow.Dat.Count);
                    foreach (IdxDat <double> item in simMtxRow.Dat)
                    {
                        if (item.Idx != simMtxRow.Idx)
                        {
                            knn.Add(new KeyDat <double, int>(item.Dat, item.Idx));
                        }
                    }
                    knn.Sort(new Comparer2());
                    int count = Math.Min(knn.Count, mKNnExt);
                    for (int i = 0; i < count; i++)
                    {
                        patches[simMtxRow.Idx].List.Add(new KeyDat <double, Patch>(knn[i].Key, patches[knn[i].Dat]));
                    }
                    patches[simMtxRow.Idx].ProcessList();
                }
                // compare
                if (patches.Count != mPatches.Count)
                {
                    throw new Exception("Count mismatch.");
                }
                for (int i = 0; i < mPatches.Count; i++)
                {
                    if (patches[i].List.Count < mKNn && patches[i].List.Count != mPatches[i].List.Count)
                    {
                        Console.WriteLine(mPatches[i].List.Count);
                        Console.WriteLine(patches[i].List.Count);
                        Output(mPatches[i].List);
                        Output(patches[i].List);
                        Console.WriteLine(i);
                        throw new Exception("List count mismatch.");
                    }
                    int count = Math.Min(mPatches[i].List.Count, mKNn);
                    for (int j = 0; j < count; j++)
                    {
                        //Console.WriteLine("{4} {0}-{1} {2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                        if (mPatches[i].List[j].Key != patches[i].List[j].Key || mPatches[i].List[j].Dat.Idx != patches[i].List[j].Dat.Idx)
                        {
                            Console.WriteLine("i:{4} fast:{0}-{1} slow:{2}-{3}", mPatches[i].List[j].Key, mPatches[i].List[j].Dat.Idx, patches[i].List[j].Key, patches[i].List[j].Dat.Idx, i);
                            int idxFast = mPatches[i].List[j].Dat.Idx;
                            int idxSlow = patches[i].List[j].Dat.Idx;
                            Console.WriteLine("slow @ fast idx: {0}", GetKey(patches[i].List, idxFast));
                            Console.WriteLine("fast @ slow idx: {0}", GetKey(mPatches[i].List, idxSlow));
                            throw new Exception("Patch item mismatch.");
                        }
                    }
                }
            }
            // *** End of test ***
            //Console.WriteLine("Number of patches: {0}", mPatches.Count);
            //int waka = 0;
            //foreach (Patch patch in mPatches)
            //{
            //    waka += patch.List.Count;
            //}
            //Console.WriteLine("Avg list size: {0}", (double)waka / (double)mPatches.Count);
            Console.WriteLine((DateTime.Now - t).TotalMilliseconds);
            /*prof*/ sw.Reset();
            mLogger.Info("Update", "Constructing system of linear equations ...");
            LabeledDataset <double, SparseVector <double> > lsqrDs = new LabeledDataset <double, SparseVector <double> >();

            Vector2D[] layout = new Vector2D[mDataset.Count - mKClust];
            foreach (Patch patch in mPatches)
            {
                int count = Math.Min(patch.List.Count, mKNn);
                SparseVector <double> eq = new SparseVector <double>();
                double wgt = 1.0 / (double)count;
                for (int i = 0; i < count; i++)
                {
                    eq.InnerIdx.Add(patch.List[i].Dat.Idx);
                    eq.InnerDat.Add(-wgt);
                }
                eq.InnerIdx.Sort(); // *** sort only indices
                eq[patch.Idx] = 1;
                lsqrDs.Add(0, eq);
            }
            for (int i = mDataset.Count - mKClust, j = 0; i < mDataset.Count; i++, j++)
            {
                SparseVector <double> eq = new SparseVector <double>(new IdxDat <double>[] { new IdxDat <double>(i, 1) });
                lsqrDs.Add(mRefPos[j].X, eq);
            }
            LSqrModel lsqr = new LSqrModel();

            mSolX.RemoveRange(0, numDequeue);
            double[] aux = new double[mKClust];
            mSolX.CopyTo(mSolX.Count - mKClust, aux, 0, mKClust);
            mSolX.RemoveRange(mSolX.Count - mKClust, mKClust);
            foreach (SparseVector <double> newVec in newInst)
            {
                mSolX.Add(0);
            }
            mSolX.AddRange(aux);
            lsqr.InitialSolution = mSolX.ToArray();
            lsqr.Train(lsqrDs);
            mSolX = lsqr.Solution.GetWritableCopy();
            //for (int i = 0; i < lsqr.InitialSolution.Length; i++)
            //{
            //    Console.WriteLine("{0}\t{1}", lsqr.InitialSolution[i], lsqr.Solution[i]);
            //}
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].X = lsqr.Solution[i];
            }
            for (int i = lsqrDs.Count - mKClust, j = 0; i < lsqrDs.Count; i++, j++)
            {
                lsqrDs[i].Label = mRefPos[j].Y;
            }
            mSolY.RemoveRange(0, numDequeue);
            aux = new double[mKClust];
            mSolY.CopyTo(mSolY.Count - mKClust, aux, 0, mKClust);
            mSolY.RemoveRange(mSolY.Count - mKClust, mKClust);
            foreach (SparseVector <double> newVec in newInst)
            {
                mSolY.Add(0);
            }
            mSolY.AddRange(aux);
            lsqr.InitialSolution = mSolY.ToArray();
            lsqr.Train(lsqrDs);
            mSolY = lsqr.Solution.GetWritableCopy();
            for (int i = 0; i < layout.Length; i++)
            {
                layout[i].Y = lsqr.Solution[i];
            }
            /*prof*/ sw.Save("lsqr.txt", _count);
            // -----------------------------------------------------------------
            // make ptInfo
            // -----------------------------------------------------------------
            ptInfo = new PtInfo[layout.Length];
            int ii = 0;

            foreach (Vector2D pt in layout)
            {
                ptInfo[ii]     = new PtInfo();
                ptInfo[ii].X   = pt.X;
                ptInfo[ii].Y   = pt.Y;
                ptInfo[ii].Vec = mDataset[ii];
                ii++;
            }
            // -----------------------------------------------------------------
            return(settings == null ? layout : settings.AdjustLayout(layout));
        }