/// <summary>
/// Counts how often each declared value of <paramref name="attribute"/> occurs in
/// <paramref name="elements"/>, then overwrites every value that is not among the
/// declared ones with a value returned by a <c>ProbabilityPicker</c> built from those counts.
/// </summary>
/// <remarks>
/// This method is called by multiple threads to modify the same list of elements and is
/// not synchronized. It is nevertheless thread safe due to mutation slicing: each
/// invocation modifies exactly one index (the one of its attribute) of the record values.
/// </remarks>
/// <param name="attribute">Attribute whose values are counted and, where undeclared, replaced.</param>
/// <param name="elements">Records to inspect; their value for the attribute may be rewritten in place.</param>
/// <param name="outliers">
/// Optional list that receives <paramref name="attribute"/> when the picker reports that it
/// is not significant. Access to it is guarded by locking on the list itself.
/// </param>
public static void AssignProbabilities(DiscreteAttribute attribute, List<Record> elements, List<DiscreteAttribute> outliers = null)
{
    // Seed the tally with every declared value so undeclared values can be told apart later.
    var tally = new Dictionary<string /*value*/, double /*count*/>();
    Array.ForEach(attribute.Values, declared => tally.Add(declared, 0));

    foreach (var record in elements)
    {
        var observed = record[attribute];
        double seenSoFar;
        if (!tally.TryGetValue(observed, out seenSoFar))
        {
            // Undeclared value: not counted here, replaced in the second pass.
            continue;
        }

        tally[observed] = seenSoFar + 1;
    }

    var picker = new ProbabilityPicker(ConvertToProbabilities(tally));
    if (!picker.IsSignificant() && outliers != null)
    {
        lock (outliers)
        {
            outliers.Add(attribute);
        }
    }

    foreach (var record in elements)
    {
        if (tally.ContainsKey(record[attribute]))
        {
            continue;
        }

        record[attribute] = picker.Pick();
    }
}
/// <summary>
/// Splits <paramref name="trainingSet"/> by each record's <c>IsPositive</c> flag and runs
/// <c>AssignProbabilities</c> on every resulting group separately.
/// </summary>
/// <param name="attribute">Attribute forwarded to <c>AssignProbabilities</c>.</param>
/// <param name="trainingSet">Records to partition; the records themselves are shared with the groups.</param>
public static void AssignProbabilitiesByClass(DiscreteAttribute attribute, List<Record> trainingSet)
{
    foreach (var sameClass in trainingSet.GroupBy(record => record.IsPositive))
    {
        List<Record> members = sameClass.ToList();

        // A fresh list is handed in for outliers and discarded afterwards.
        var discardedOutliers = new List<DiscreteAttribute>();
        AssignProbabilities(attribute, members, discardedOutliers);
    }
}
/// <summary>
/// Groups <paramref name="recordsSet"/> by the value each record holds for
/// <paramref name="header"/> and sums <c>CalcPartEntropy(groupSize, totalRecords)</c>
/// over all groups.
/// </summary>
/// <param name="recordsSet">Records to group.</param>
/// <param name="header">Attribute whose values define the groups.</param>
/// <returns>The accumulated sum; <c>0.0</c> when there are no records.</returns>
static double CalculateRatio(List<Record> recordsSet, DiscreteAttribute header)
{
    var total = 0.0d;
    var byValue = recordsSet.GroupBy(record => record[header]);
    foreach (var bucket in byValue)
    {
        total += CalcPartEntropy(bucket.Count(), recordsSet.Count);
    }

    return total;
}
/// <summary>
/// Evaluates every attribute in <paramref name="headersSet"/> in parallel with
/// <c>CalculateGain</c> and returns the one with the largest gain.
/// </summary>
/// <remarks>
/// Side effect: attributes that <c>CalculateGain</c> reports as not significant are
/// removed from <paramref name="headersSet"/> (under a lock on that list).
/// When several attributes share the largest gain, which of them is returned depends on
/// the order in which the parallel iterations reach the comparison.
/// </remarks>
/// <param name="recordsSet">Records the gain is computed on.</param>
/// <param name="headersSet">Candidate attributes; pruned in place as described above.</param>
/// <returns>
/// The attribute with the highest gain, or <c>null</c> when <paramref name="headersSet"/>
/// is empty or no attribute has a significant gain greater than zero.
/// </returns>
public static DiscreteAttribute GetBestAttribute(List<Record> recordsSet, List<DiscreteAttribute> headersSet)
{
    if (!headersSet.Any())
    {
        return null;
    }

    // The running best is local to this call, so a private gate is sufficient. Locking on
    // typeof(DecisionTree) would expose the lock to any code that can reach the type and
    // would needlessly serialize independent invocations against each other.
    var bestGate = new object();
    double bestGain = 0;
    DiscreteAttribute bestAttribute = null;

    // Iterate over a snapshot because headersSet is modified from inside the loop.
    Parallel.ForEach(headersSet.ToArray(), header =>
    {
        var gain = CalculateGain(recordsSet, header);
        if (gain.Item1)
        {
            lock (bestGate)
            {
                if (gain.Item2 > bestGain)
                {
                    bestGain = gain.Item2;
                    bestAttribute = header;
                }
            }
        }
        else
        {
            lock (headersSet)
            {
                headersSet.Remove(header);
            }
        }
    });

    return bestAttribute;
}
/// <summary>
/// Creates a tree node and, unless the node can be decided immediately, selects a split
/// attribute and builds its child nodes.
/// </summary>
/// <param name="parent">Parent node stored on this node.</param>
/// <param name="value">Value assigned to <c>Value</c>.</param>
/// <param name="attributes">Candidate attributes passed to <c>DecisionTree.GetBestAttribute</c> and to child construction.</param>
/// <param name="records">Records this node is built from.</param>
/// <param name="fracCertainty">Forwarded unchanged to <c>BuildChildNodes</c>.</param>
public TreeNode(TreeNode parent, string value, List<DiscreteAttribute> attributes, List<Record> records, double fracCertainty)
{
    this.parent = parent;
    Value = value;
    children = new ConcurrentDictionary<string, TreeNode>();

    // DecideTrue is tried first; DecideFalse only runs when DecideTrue returned false.
    if (DecideTrue(records) || DecideFalse(records))
    {
        return;
    }

    DecideWithProbability(records);
    splitAttribute = DecisionTree.GetBestAttribute(records, attributes);

    if (!IsLeafNode())
    {
        BuildChildNodes(attributes, records, fracCertainty);
    }
}
/// <summary>
/// Gets or sets the value for <paramref name="attr"/> by delegating to this type's
/// indexer that takes the attribute's <c>Index</c>.
/// </summary>
/// <param name="attr">Attribute whose <c>Index</c> selects the value.</param>
public string this[DiscreteAttribute attr]
{
    get
    {
        var position = attr.Index;
        return this[position];
    }

    set
    {
        var position = attr.Index;
        this[position] = value;
    }
}
/// <summary>
/// Computes how much splitting <paramref name="samplesSet"/> on
/// <paramref name="attribute"/> lowers the entropy of the positive/negative labelling.
/// </summary>
/// <param name="samplesSet">Records to evaluate.</param>
/// <param name="attribute">Attribute whose values partition the records.</param>
/// <returns>
/// A tuple whose first item is <c>true</c> when the size-weighted entropy after the split
/// is strictly lower than the entropy before it, and whose second item is the difference
/// (entropy before minus entropy after).
/// </returns>
static Tuple<bool /*significant*/, double> CalculateGain(List<Record> samplesSet, DiscreteAttribute attribute)
{
    double weightedAfter = 0;
    foreach (var partition in samplesSet.GroupBy(sample => sample[attribute]))
    {
        int size = partition.Count();
        int positivesInPartition = CountPositiveExamples(partition);
        int negativesInPartition = size - positivesInPartition;

        // Each partition contributes its entropy weighted by its share of all samples.
        weightedAfter += CalculateEntropy(positivesInPartition, negativesInPartition) * size / samplesSet.Count;
    }

    int totalPositives = CountPositiveExamples(samplesSet);
    double before = CalculateEntropy(totalPositives, samplesSet.Count - totalPositives);

    return Tuple.Create(weightedAfter < before, before - weightedAfter);
}