Example #1
        public void Train(DataPackage data, CancellationToken token)
        {
            if (data is null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            log.Debug("Training with {0} records", data.Y.Length);

            standardizer = Standardizer.GetNumericStandardizer(data.X);
            var xTraining = data.X;
            var yTraining = data.Y;

            var xTesting = xTraining;
            var yTesting = yTraining;

            int testSize = 100;

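            // Hold out the last `testSize` samples for evaluation, but only when there is
            // at least four times that much data; otherwise evaluate on the full training set.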
            if (xTraining.Length > testSize * 4)
            {
                var training = xTraining.Length - testSize;
                xTesting  = xTraining.Skip(training).ToArray();
                yTesting  = yTraining.Skip(training).ToArray();
                xTraining = xTraining.Take(training).ToArray();
                yTraining = yTraining.Take(training).ToArray();
            }

            xTraining = standardizer.StandardizeAll(xTraining);
            // Instantiate a new Grid Search algorithm for Kernel Support Vector Machines
            var gridsearch = new GridSearch <SupportVectorMachine <Gaussian>, double[], int>()
            {
                // Here we can specify the range of the parameters to be included in the search
                ParameterRanges = new GridSearchRangeCollection
                {
                    new GridSearchRange("complexity", new [] { 0.001, 0.01, 0.1, 1, 10 }),
                    new GridSearchRange("gamma", new [] { 0.001, 0.01, 0.1, 1 })
                },

                // Indicate how learning algorithms for the models should be created
                Learner = p => new SequentialMinimalOptimization <Gaussian>
                {
                    Complexity = p["complexity"],
                    Kernel     = new Gaussian
                    {
                        Gamma = p["gamma"]
                    }
                },

                // Define how the performance of the models should be measured
                Loss = (actual, expected, m) => new ZeroOneLoss(expected).Loss(actual)
            };

            gridsearch.Token = token;

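            // Shuffle features and labels in unison; in the shuffled result, element 0 holds
            // the feature vectors and element 1 the corresponding labels.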
            var randomized = new Random().Shuffle(xTraining, yTraining).ToArray();

            yTraining = randomized[1].Cast <int>().ToArray();
            xTraining = randomized[0].Cast <double[]>().ToArray();

            var result = gridsearch.Learn(xTraining, yTraining);

            // Get the best SVM found during the parameter search
            SupportVectorMachine <Gaussian> svm = result.BestModel;

            // Instantiate the probabilistic calibration (using Platt's scaling)
            var calibration = new ProbabilisticOutputCalibration <Gaussian>(svm);

            // Run the calibration algorithm
            calibration.Learn(xTraining, yTraining); // returns the same machine
            model = calibration.Model;
            var predicted       = ClassifyInternal(xTraining);
            var confusionMatrix = new GeneralConfusionMatrix(classes: 2, expected: yTraining, predicted: predicted);

            log.Debug("Performance on training dataset . F1(0):{0} F1(1):{1}", confusionMatrix.PerClassMatrices[0].FScore, confusionMatrix.PerClassMatrices[1].FScore);

            predicted          = Classify(xTesting);
            confusionMatrix    = new GeneralConfusionMatrix(classes: 2, expected: yTesting, predicted: predicted);
            TestSetPerformance = confusionMatrix;
            log.Debug("Performance on testing dataset . F1(0):{0} F1(1):{1}", confusionMatrix.PerClassMatrices[0].FScore, confusionMatrix.PerClassMatrices[1].FScore);
        }
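
        // A call-site sketch, not part of the original example: CreateDataPackage() is an
        // assumed helper that builds the DataPackage consumed by Train above.
        public void TrainExample()
        {
            using (var cts = new CancellationTokenSource())
            {
                Train(CreateDataPackage(), cts.Token);
            }
        }
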
        public DetectionResults Filter(DocumentClusters document)
        {
            if (document.Clusters.Length < 3)
            {
                logger.Info("Not enought text clusters for clustering");
                return(new DetectionResults(document.Clusters));
            }

            double[][] observations = vectorSource.GetVectors(document.Clusters, NormalizationType.None);
            var        standardizer = Standardizer.GetNumericStandardizer(observations);

            observations = standardizer.StandardizeAll(observations);
            var data = observations.ToArray();

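            // Standardization can yield NaN (typically for zero-variance columns); replace
            // those entries with 0. The shallow copy in `data` shares the inner arrays, so
            // the replacement applies to it as well.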
            for (int i = 0; i < observations.Length; i++)
            {
                for (int j = 0; j < observations[i].Length; j++)
                {
                    if (double.IsNaN(observations[i][j]))
                    {
                        observations[i][j] = 0;
                    }
                }
            }

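            // Fit a one-class SVM to the standardized vectors. Nu = 0.5 caps the share of
            // training points allowed to fall outside the learned region at roughly one half,
            // and the Gaussian kernel width is derived from the number of observations.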
            var teacher = new OneclassSupportVectorLearning <Gaussian>
            {
                Kernel    = Gaussian.FromGamma(1.0 / data.Length),
                Nu        = 0.5,
                Shrinking = true,
                Tolerance = 0.001
            };

            var svm = teacher.Learn(data);

            double[] prediction = svm.Score(data);

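            // Accumulate each cluster's score onto every sentence it contains, keyed by
            // sentence index (a sentence may belong to several clusters).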
            Dictionary <int, List <double> > weights = new Dictionary <int, List <double> >();

            for (int i = 0; i < prediction.Length; i++)
            {
                foreach (var sentenceItem in document.Clusters[i].Sentences)
                {
                    if (!weights.TryGetValue(sentenceItem.Index, out var classType))
                    {
                        classType = new List <double>();
                        weights[sentenceItem.Index] = classType;
                    }

                    classType.Add(prediction[i]);
                }
            }

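            // Sentences whose summed score is at or below the cutoff (roughly the lowest 20%)
            // are treated as anomalous; the rest are kept as regular text.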
            List <ProcessingTextBlock> anomaly    = new List <ProcessingTextBlock>();
            List <ProcessingTextBlock> resultData = new List <ProcessingTextBlock>();
            List <SentenceItem>        sentences  = new List <SentenceItem>();
            ProcessingTextBlock        cluster;
            bool? lastResult  = null;
            var  cutoffIndex  = (int)(weights.Count * 0.2);
            var  cutoff       = weights.Select(item => item.Value.Sum()).OrderBy(item => item).Skip(cutoffIndex).First();
            var  allSentences = document.Clusters.SelectMany(item => item.Sentences)
                                .Distinct()
                                .OrderBy(item => item.Index)
                                .ToArray();

            if (allSentences.Length != weights.Count)
            {
                throw new ArgumentOutOfRangeException(nameof(document), "Sentence length mismatch");
            }

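            // Walk the sentences in index order and group consecutive runs with the same
            // verdict into blocks, routing each block to the kept or the anomalous set.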
            foreach (var sentence in allSentences)
            {
                var current = weights[sentence.Index].Sum();
                var result  = current > cutoff;
                if (lastResult != null &&
                    result != lastResult)
                {
                    cluster = new ProcessingTextBlock(sentences.ToArray());
                    sentences.Clear();
                    if (lastResult.Value)
                    {
                        resultData.Add(cluster);
                    }
                    else
                    {
                        anomaly.Add(cluster);
                    }
                }

                sentences.Add(sentence);
                lastResult = result;
            }

            cluster = new ProcessingTextBlock(sentences.ToArray());
            sentences.Clear();
            if (lastResult.Value)
            {
                resultData.Add(cluster);
            }
            else
            {
                anomaly.Add(cluster);
            }

            StringBuilder builder = new StringBuilder();

            foreach (var textCluster in anomaly)
            {
                foreach (var sentenceItem in textCluster.Sentences)
                {
                    builder.AppendLine(sentenceItem.Text);
                }
            }

            return(new DetectionResults(resultData.ToArray(), anomaly.ToArray()));
        }
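
        // A call-site sketch, not part of the original example: `clusters` stands for a
        // DocumentClusters built by the surrounding pipeline.
        public DetectionResults FilterExample(DocumentClusters clusters)
        {
            // The result separates the retained text blocks from those flagged as anomalous.
            return Filter(clusters);
        }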