/// <summary>
/// Creates a processor for the given configuration and database and
/// precomputes the comparison order of the non-ID properties together with
/// their accumulated probabilities.
/// </summary>
/// <param name="config">Configuration whose properties are read during setup.</param>
/// <param name="database">Database instance kept in a field for later use.</param>
public Processor(Configuration config, IDatabase database)
{
    _config = config;
    _database = database;
    _listeners = new List<IMatchListener>();
    _passthrough = new PassThroughFilter();
    _choosebest = new ChooseBestFilter();

    // Precomputation, step 1: keep every property that is not an ID
    // property, then order them with PropertyComparator.
    _proporder = new List<Property>();
    foreach (Property candidate in _config.GetProperties())
    {
        if (candidate.IsIdProperty)
        {
            continue;
        }

        _proporder.Add(candidate);
    }
    _proporder.Sort(new PropertyComparator());

    // Precomputation, step 2: walking from the last property to the first,
    // fold each HighProbability into a running value (seeded with 0.5) and
    // record the running value at that property's index.
    int total = _proporder.Count;
    _accprob = new double[total];
    double running = 0.5;
    int position = total - 1;
    while (position >= 0)
    {
        running = StandardUtils.ComputeBayes(running, _proporder[position].HighProbability);
        _accprob[position] = running;
        position--;
    }
}
/// <summary>
/// Calls StandardUtils.ComputeBayes with two non-zero doubles and checks that
/// the result is a double greater than zero.
/// </summary>
public void ComputeBayes_TwoDoubles_ReturnsProbabilityAsDouble()
{
    // Arrange
    const double first = 52.3;
    const double second = 19.22;

    // Act
    double result = StandardUtils.ComputeBayes(first, second);
    Console.WriteLine("ComputeBayes result = {0}", result);

    // Assert
    Assert.IsInstanceOf<double>(result);
    Assert.Greater(result, 0.0);
}
/// <summary>
/// Calls StandardUtils.ComputeBayes with 0.0 as the first argument and checks
/// that the result is a double equal to 0.0.
/// </summary>
public void ComputeBayes_OneValueZero_ReturnsZero()
{
    // Arrange
    const double zero = 0.0;
    const double other = 19.22;

    // Act
    double result = StandardUtils.ComputeBayes(zero, other);
    Console.WriteLine("ComputeBayes result = {0}", result);

    // Assert
    Assert.IsInstanceOf<double>(result);
    Assert.AreEqual(0.0, result);
}
/// <summary>
/// Compares two records property by property and combines the per-property
/// results into a single probability.
/// </summary>
/// <param name="r1">First record; its property names drive the iteration.</param>
/// <param name="r2">Second record; values are looked up by the same property names.</param>
/// <returns>
/// The value obtained by starting at 0.5 and folding in, via
/// StandardUtils.ComputeBayes, the highest comparison score found for each
/// compared property.
/// </returns>
/// <exception cref="DukeException">
/// Thrown when comparing a pair of values raises any exception.
/// </exception>
public double Compare(IRecord r1, IRecord r2)
{
    double combined = 0.5;

    foreach (string name in r1.GetProperties())
    {
        Property property = _config.GetPropertyByName(name);
        bool skipProperty = property.IsIdProperty || property.IsIgnoreProperty();
        if (skipProperty)
        {
            continue;
        }

        List<string> leftValues = r1.GetValues(name);
        List<string> rightValues = r2.GetValues(name);
        if (leftValues.Count == 0 || rightValues.Count == 0)
        {
            // Nothing to compare on at least one side, so this property
            // does not influence the result.
            continue;
        }

        // Best score over every pairing of non-empty values.
        double best = 0.0;
        foreach (string left in leftValues)
        {
            //TODO: These values shouldn't be here at all.
            if (left.Length == 0)
            {
                continue;
            }

            foreach (string right in rightValues)
            {
                //TODO: These values shouldn't be here at all.
                if (right.Length == 0)
                {
                    continue;
                }

                try
                {
                    double score = property.Compare(left, right);
                    if (score > best)
                    {
                        best = Math.Max(best, score);
                    }
                    else
                    {
                        best = Math.Max(best, score);
                    }
                }
                catch (Exception e)
                {
                    throw new DukeException(String.Format("Comparison of values {0} and {1} failed. {2}", left, right, e.Message));
                }
            }
        }

        combined = StandardUtils.ComputeBayes(combined, best);
    }

    return combined;
}
/// <summary>
/// Determines which properties are used for lookups and stores them in
/// <c>_lookups</c>.
/// </summary>
/// <remarks>
/// Candidates are sorted with HighComparator.Compare. Starting from 0.5, each
/// candidate's HighProbability is folded in through StandardUtils.ComputeBayes.
/// The first index at which the accumulated value reaches the limit
/// (ThresholdMaybe, or Threshold when ThresholdMaybe is 0.0) is remembered,
/// and every candidate from that index onward becomes a lookup property.
/// </remarks>
/// <exception cref="Exception">
/// Thrown when the accumulated probability never reaches Threshold.
/// </exception>
private void FindLookupProperties()
{
    var candidates = new List<Property>();
    foreach (Property property in _properties.Values)
    {
        // NOTE(review): as written this also admits ignore properties, while
        // Compare() skips them - confirm whether "&& !IsIgnoreProperty()" was
        // intended. Left unchanged to preserve current behaviour.
        if (!property.IsIdProperty || property.IsIgnoreProperty())
        {
            candidates.Add(property);
        }
    }

    candidates.Sort(HighComparator.Compare); //TODO: see if the HighComparator even needs to be a separate class...

    int last = -1;
    double prob = 0.5;
    double limit = ThresholdMaybe;
    if (limit == 0.0)
    {
        // No "maybe" threshold configured: fall back to the main threshold.
        limit = Threshold;
    }

    for (int ix = 0; ix < candidates.Count; ix++)
    {
        Property prop = candidates[ix];
        if (prop.HighProbability == 0.0)
        {
            // if the probability is zero we ignore the property entirely
            continue;
        }

        prob = StandardUtils.ComputeBayes(prob, prop.HighProbability);
        if (prob >= Threshold)
        {
            if (last == -1)
            {
                last = ix;
            }
            break;
        }

        if (prob >= limit && last == -1)
        {
            last = ix;
        }
    }

    if (prob < Threshold)
    {
        // The exception type is kept as Exception so existing callers that
        // catch or assert on it keep working.
        throw new Exception(String.Format(
            "Maximum possible probability is {0}, which is below threshold ({1}), which means no duplicates will ever be found",
            prob,
            Threshold));
    }

    if (last == -1)
    {
        // Reached only when the threshold is already met without folding in
        // any candidate (e.g. no candidate has a non-zero HighProbability).
        _lookups.Clear();
    }
    else
    {
        // List<T>.GetRange takes (index, count), not (start, end). Passing
        // candidates.Count as the second argument throws ArgumentException
        // for any last > 0, so pass the number of remaining elements.
        _lookups = candidates.GetRange(last, candidates.Count - last);
    }
}