/// <summary>
/// Scores every distinct word of <paramref name="documents"/> with
/// <c>CalculateChiSquared</c> against the target tag and returns a slice of the
/// ranking, ordered from the highest metric to the lowest.
/// </summary>
/// <param name="documents">Documents whose words are the feature candidates.</param>
/// <param name="featureSelectionParams">
/// Supplies the document-count limits (<c>MinDocCount</c>, <c>MaxDocCount</c>), the
/// <c>TargetTag</c> passed to the scoring function, and the ranking window
/// (<c>LowerBoundPercent</c>, <c>UpperBoundPercent</c>).
/// </param>
/// <returns>A result whose <c>FeaturedWords</c> holds the selected weighted words.</returns>
public FeatureSelectionResult Select(TextDocument[] documents, FeatureSelectionParams featureSelectionParams)
{
    // Every call starts with a fresh, empty cache.
    cache = new Dictionary<string, double>();

    var distinctWords = documents.SelectMany(document => document.Words).Distinct().ToArray();
    var docCountByWord = documents.GetDocumentsCountByWord();

    // Keep only the words whose document count lies inside [MinDocCount, MaxDocCount].
    var candidates =
        (from word in distinctWords
         where docCountByWord.ContainsKey(word)
         let docCount = docCountByWord[word]
         where docCount >= featureSelectionParams.MinDocCount
               && docCount <= featureSelectionParams.MaxDocCount
         select word).ToArray();

    // The window bounds are percentages of the number of candidates left after filtering.
    var takeCount = candidates.Length * featureSelectionParams.UpperBoundPercent / 100;
    var skipCount = candidates.Length * featureSelectionParams.LowerBoundPercent / 100;

    var ranked = candidates
        .Select(word => new WeightedWord
        {
            Word = word,
            Metric = CalculateChiSquared(word, documents, featureSelectionParams.TargetTag)
        })
        .OrderByDescending(weighted => weighted.Metric);

    // Take the top "upper" share first, then drop the top "lower" share from it.
    var featuredWords = ranked.Take(takeCount).Skip(skipCount).ToArray();

    return new FeatureSelectionResult { FeaturedWords = featuredWords };
}
/// <summary>
/// Ranks words by the per-word count that <c>GetDocumentsCountByWord</c> reports for
/// the documents tagged with <c>featureSelectionParams.TargetTag</c>, and returns a
/// slice of that ranking ordered from the highest count to the lowest.
/// </summary>
/// <param name="documents">Documents whose words are the feature candidates.</param>
/// <param name="featureSelectionParams">
/// Supplies the <c>TargetTag</c> used to pick documents and the ranking window
/// (<c>LowerBoundPercent</c>, <c>UpperBoundPercent</c>).
/// </param>
/// <returns>A result whose <c>FeaturedWords</c> holds the selected weighted words.</returns>
public FeatureSelectionResult Select(TextDocument[] documents, FeatureSelectionParams featureSelectionParams)
{
    var distinctWords = documents.SelectMany(document => document.Words).Distinct().ToArray();

    // Counts are gathered only over the documents that carry the target tag.
    var targetDocCounts = documents
        .Where(document => document.Tags.Contains(featureSelectionParams.TargetTag))
        .GetDocumentsCountByWord();

    // The window bounds are percentages of ALL distinct words, not only of the
    // words that have an entry in targetDocCounts.
    var takeCount = distinctWords.Length * featureSelectionParams.UpperBoundPercent / 100;
    var skipCount = distinctWords.Length * featureSelectionParams.LowerBoundPercent / 100;

    var ranked =
        from word in distinctWords
        where targetDocCounts.ContainsKey(word)
        select new WeightedWord { Word = word, Metric = targetDocCounts[word] }
        into weighted
        orderby weighted.Metric descending
        select weighted;

    // Take the top "upper" share first, then drop the top "lower" share from it.
    var featuredWords = ranked.Take(takeCount).Skip(skipCount).ToArray();

    return new FeatureSelectionResult { FeaturedWords = featuredWords };
}
/// <summary>
/// Forwards the request to the first entry of <c>concreteFeatureSelectors</c> whose
/// <c>Type</c> equals <c>featureSelectionParams.Type</c>.
/// </summary>
/// <param name="documents">Documents handed unchanged to the chosen selector.</param>
/// <param name="featureSelectionParams">
/// Parameters handed unchanged to the chosen selector; <c>Type</c> decides which one runs.
/// </param>
/// <returns>Whatever the chosen selector's <c>Select</c> returns.</returns>
public FeatureSelectionResult Select(TextDocument[] documents, FeatureSelectionParams featureSelectionParams)
{
    // NOTE(review): if First is LINQ's Enumerable.First, a missing match surfaces as
    // InvalidOperationException rather than a null result - confirm callers expect that.
    return concreteFeatureSelectors
        .First(selector => selector.Type == featureSelectionParams.Type)
        .Select(documents, featureSelectionParams);
}