public DetectionResults Filter(DocumentClusters document)
{
    var clusters = new List<ProcessingTextBlock>();
    var withoutSentiment = new List<ProcessingTextBlock>();

    // Split the clusters into two groups: those where at least one sentence
    // carries a sentiment value, and those where none does.
    foreach (var cluster in document.Clusters)
    {
        if (cluster.Sentences.Any(item => item.CalculateSentiment().HasValue))
        {
            clusters.Add(cluster);
        }
        else
        {
            withoutSentiment.Add(cluster);
        }
    }

    return new DetectionResults(clusters.ToArray(), withoutSentiment.ToArray());
}
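The loop above is a plain two-way partition. A minimal, self-contained sketch of the same pattern using LINQ's `ToLookup`; the stand-in types (`Sentence`, `Block`) and values here are hypothetical, not from the source:

```csharp
using System;
using System.Linq;

// Hypothetical stand-in types, for illustration only.
record Sentence(double? Sentiment);
record Block(Sentence[] Sentences);

static class PartitionSketch
{
    static void Main()
    {
        var blocks = new[]
        {
            new Block(new[] { new Sentence(0.8), new Sentence(null) }),
            new Block(new[] { new Sentence(null) })
        };

        // Same split as the Filter method above: blocks where any sentence
        // has a sentiment value vs. blocks with none at all.
        var split = blocks.ToLookup(b => b.Sentences.Any(s => s.Sentiment.HasValue));
        var withSentiment = split[true].ToArray();
        var withoutSentiment = split[false].ToArray();

        Console.WriteLine($"{withSentiment.Length} with, {withoutSentiment.Length} without");
    }
}
```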
public DetectionResults Filter(DocumentClusters document)
{
    if (document.Clusters.Length < 3)
    {
        logger.Info("Not enough text clusters for clustering");
        return new DetectionResults(document.Clusters);
    }

    double[][] observations = vectorSource.GetVectors(document.Clusters, NormalizationType.None);
    var standardizer = Standardizer.GetNumericStandardizer(observations);
    observations = standardizer.StandardizeAll(observations);

    // Standardization can yield NaN for zero-variance features; zero them out
    // before training.
    for (int i = 0; i < observations.Length; i++)
    {
        for (int j = 0; j < observations[i].Length; j++)
        {
            if (double.IsNaN(observations[i][j]))
            {
                observations[i][j] = 0;
            }
        }
    }

    var data = observations;

    // Train a one-class SVM on the cluster vectors and score each cluster;
    // low scores indicate anomalies.
    var teacher = new OneclassSupportVectorLearning<Gaussian>
    {
        Kernel = Gaussian.FromGamma(1.0 / data.Length),
        Nu = 0.5,
        Shrinking = true,
        Tolerance = 0.001
    };

    var svm = teacher.Learn(data);
    double[] prediction = svm.Score(data);

    // Accumulate each cluster's score onto every sentence it contains.
    var weights = new Dictionary<int, List<double>>();
    for (int i = 0; i < prediction.Length; i++)
    {
        foreach (var sentenceItem in document.Clusters[i].Sentences)
        {
            if (!weights.TryGetValue(sentenceItem.Index, out var classType))
            {
                classType = new List<double>();
                weights[sentenceItem.Index] = classType;
            }

            classType.Add(prediction[i]);
        }
    }

    var anomaly = new List<ProcessingTextBlock>();
    var resultData = new List<ProcessingTextBlock>();
    var sentences = new List<SentenceItem>();
    ProcessingTextBlock cluster;
    bool? lastResult = null;

    // Use the summed score found 20% of the way up the sorted list as the
    // anomaly cutoff.
    var cutoffIndex = (int)(weights.Count * 0.2);
    var cutoff = weights.Select(item => item.Value.Sum()).OrderBy(item => item).Skip(cutoffIndex).First();

    var allSentences = document.Clusters.SelectMany(item => item.Sentences)
                               .Distinct()
                               .OrderBy(item => item.Index)
                               .ToArray();

    if (allSentences.Length != weights.Count)
    {
        throw new ArgumentOutOfRangeException(nameof(document), "Sentence length mismatch");
    }

    // Walk the sentences in order, grouping contiguous runs with the same
    // classification into blocks.
    foreach (var sentence in allSentences)
    {
        var current = weights[sentence.Index].Sum();
        var result = current > cutoff;
        if (lastResult != null && result != lastResult)
        {
            cluster = new ProcessingTextBlock(sentences.ToArray());
            sentences.Clear();
            if (lastResult.Value)
            {
                resultData.Add(cluster);
            }
            else
            {
                anomaly.Add(cluster);
            }
        }

        sentences.Add(sentence);
        lastResult = result;
    }

    // Flush the trailing run, if any.
    if (sentences.Count > 0)
    {
        cluster = new ProcessingTextBlock(sentences.ToArray());
        sentences.Clear();
        if (lastResult == true)
        {
            resultData.Add(cluster);
        }
        else
        {
            anomaly.Add(cluster);
        }
    }

    // Collect the anomalous text (handy for diagnostics).
    StringBuilder builder = new StringBuilder();
    foreach (var textCluster in anomaly)
    {
        foreach (var sentenceItem in textCluster.Sentences)
        {
            builder.AppendLine(sentenceItem.Text);
        }
    }

    return new DetectionResults(resultData.ToArray(), anomaly.ToArray());
}
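The learner and kernel types above (`OneclassSupportVectorLearning<Gaussian>`, `Gaussian.FromGamma`, `Score`) match Accord.NET's API, which this snippet appears to use. A minimal, self-contained sketch of the same scoring and 20th-percentile cutoff on toy 2-D data; the class name, toy points, and console output are illustrative, not from the source:

```csharp
using System;
using System.Linq;
using Accord.MachineLearning.VectorMachines.Learning;
using Accord.Statistics.Kernels;

static class OneClassSketch
{
    static void Main()
    {
        // Toy observations: a tight cluster plus one obvious outlier.
        double[][] data =
        {
            new[] { 0.0, 0.1 },
            new[] { 0.1, 0.0 },
            new[] { 0.0, 0.0 },
            new[] { 0.1, 0.1 },
            new[] { 5.0, 5.0 }, // outlier
        };

        // Same learner configuration as the Filter method above.
        var teacher = new OneclassSupportVectorLearning<Gaussian>
        {
            Kernel = Gaussian.FromGamma(1.0 / data.Length),
            Nu = 0.5,
            Shrinking = true,
            Tolerance = 0.001
        };

        var svm = teacher.Learn(data);
        double[] scores = svm.Score(data);

        // Same cutoff rule as above: everything at or below the score found
        // 20% of the way up the sorted list is treated as anomalous.
        int cutoffIndex = (int)(scores.Length * 0.2);
        double cutoff = scores.OrderBy(s => s).Skip(cutoffIndex).First();

        for (int i = 0; i < scores.Length; i++)
        {
            Console.WriteLine(
                $"point {i}: score {scores[i]:F3} -> {(scores[i] > cutoff ? "normal" : "anomaly")}");
        }
    }
}
```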