示例#1
0
        /// <summary>
        /// Creates a processor for the given configuration and database, and
        /// precomputes the ordered property list and its accumulated probabilities.
        /// </summary>
        /// <param name="config">Configuration whose properties are read via <c>GetProperties()</c>.</param>
        /// <param name="database">Database reference stored for later use; it is not accessed here.</param>
        public Processor(Configuration config, IDatabase database)
        {
            _config      = config;
            _database    = database;
            _listeners   = new List <IMatchListener>();
            _passthrough = new PassThroughFilter();
            _choosebest  = new ChooseBestFilter();

            // Precomputation, step 1: collect every non-id property and order
            // the result with PropertyComparator.
            _proporder = new List <Property>();
            foreach (Property candidate in _config.GetProperties())
            {
                if (candidate.IsIdProperty)
                {
                    continue;
                }

                _proporder.Add(candidate);
            }
            _proporder.Sort(new PropertyComparator());

            // Precomputation, step 2: walk the ordered list from the back, folding
            // each property's high probability into a running value seeded with 0.5.
            // Slot i therefore holds the fold over properties i..end.
            int total = _proporder.Count;
            _accprob = new double[total];

            double running  = 0.5;
            int    position = total - 1;
            while (position >= 0)
            {
                running            = StandardUtils.ComputeBayes(running, _proporder[position].HighProbability);
                _accprob[position] = running;
                position--;
            }
        }
        /// <summary>
        /// Calls <c>StandardUtils.ComputeBayes</c> with two fixed doubles, prints the
        /// result, and checks that it is a double greater than zero.
        /// </summary>
        public void ComputeBayes_TwoDoubles_ReturnsProbabilityAsDouble()
        {
            const double first  = 52.3;
            const double second = 19.22;

            double result = StandardUtils.ComputeBayes(first, second);

            // Composite-format overload; produces the same text as String.Format.
            Console.WriteLine("ComputeBayes result = {0}", result);

            Assert.IsInstanceOf <double>(result);
            Assert.Greater(result, 0.0);
        }
        /// <summary>
        /// Calls <c>StandardUtils.ComputeBayes</c> with a first argument of zero,
        /// prints the result, and checks that it equals 0.0.
        /// </summary>
        public void ComputeBayes_OneValueZero_ReturnsZero()
        {
            const double zero  = 0.0;
            const double other = 19.22;

            double result = StandardUtils.ComputeBayes(zero, other);

            // Composite-format overload; produces the same text as String.Format.
            Console.WriteLine("ComputeBayes result = {0}", result);

            Assert.IsInstanceOf <double>(result);
            Assert.AreEqual(0.0, result);
        }
示例#4
0
        /// <summary>
        /// Compares two records property by property and returns the combined probability.
        /// </summary>
        /// <remarks>
        /// Starting from 0.5, each property of <paramref name="r1"/> that is neither an id
        /// property nor an ignore property, and that has at least one value in both
        /// records, contributes the highest pairwise score among its non-empty values.
        /// That score is folded into the running value with
        /// <c>StandardUtils.ComputeBayes</c>. If every value of such a property is an
        /// empty string, the score folded in is 0.0.
        /// </remarks>
        /// <param name="r1">Record whose property names drive the iteration.</param>
        /// <param name="r2">Record compared against <paramref name="r1"/>.</param>
        /// <returns>The accumulated probability after all eligible properties are folded in.</returns>
        /// <exception cref="DukeException">Thrown when comparing a pair of values fails.</exception>
        public double Compare(IRecord r1, IRecord r2)
        {
            double combined = 0.5;

            foreach (string name in r1.GetProperties())
            {
                Property property = _config.GetPropertyByName(name);
                bool     skipped  = property.IsIdProperty || property.IsIgnoreProperty();
                if (skipped)
                {
                    continue;
                }

                List <string> leftValues  = r1.GetValues(name);
                List <string> rightValues = r2.GetValues(name);
                if (leftValues.Count == 0 || rightValues.Count == 0)
                {
                    // Nothing to compare on one side, so this property has no say.
                    continue;
                }

                double best = 0.0;
                foreach (string left in leftValues)
                {
                    //TODO: Empty values shouldn't be here at all.
                    if (left.Length == 0)
                    {
                        continue;
                    }

                    foreach (string right in rightValues)
                    {
                        //TODO: Empty values shouldn't be here at all.
                        if (right.Length == 0)
                        {
                            continue;
                        }

                        double score;
                        try
                        {
                            score = property.Compare(left, right);
                        }
                        catch (Exception e)
                        {
                            string message = String.Format("Comparison of values {0} and {1} failed. {2}",
                                                           left, right, e.Message);
                            throw new DukeException(message);
                        }

                        // Math.Max is kept deliberately so a NaN score propagates as before.
                        best = Math.Max(best, score);
                    }
                }

                combined = StandardUtils.ComputeBayes(combined, best);
            }

            return combined;
        }
示例#5
0
        /// <summary>
        /// Determines which properties are used for lookups and stores them in
        /// <c>_lookups</c>.
        /// </summary>
        /// <remarks>
        /// Candidate properties are sorted with <c>HighComparator.Compare</c>. Their high
        /// probabilities are then folded together with <c>StandardUtils.ComputeBayes</c>,
        /// starting from 0.5, until the accumulated value reaches <c>Threshold</c>.
        /// The first index at which the accumulated value reaches the limit
        /// (<c>ThresholdMaybe</c>, or <c>Threshold</c> when that is 0.0) marks the start
        /// of the lookup set: every candidate from that index to the end of the sorted
        /// list is kept.
        /// </remarks>
        /// <exception cref="Exception">
        /// Thrown when the accumulated probability over all candidates stays below
        /// <c>Threshold</c>, meaning no duplicates could ever be found.
        /// </exception>
        private void FindLookupProperties()
        {
            var candidates = new List <Property>();

            foreach (Property property in _properties.Values)
            {
                // NOTE(review): this admits every non-id property and also any property
                // flagged as ignored; confirm that `||` (rather than `&& !`) is intended.
                if (!property.IsIdProperty || property.IsIgnoreProperty())
                {
                    candidates.Add(property);
                }
            }

            candidates.Sort(HighComparator.Compare);
            //TODO: see if the HighComparator even needs to be a separate class...

            int    last  = -1;
            double prob  = 0.5;
            double limit = ThresholdMaybe;

            // A "maybe" threshold of zero means it is unset; fall back to the main threshold.
            if (limit == 0.0)
            {
                limit = Threshold;
            }

            for (int ix = 0; ix < candidates.Count; ix++)
            {
                Property prop = candidates[ix];
                if (prop.HighProbability == 0.0)
                {
                    // if the probability is zero we ignore the property entirely
                    continue;
                }

                prob = StandardUtils.ComputeBayes(prob, prop.HighProbability);
                if (prob >= Threshold)
                {
                    // The threshold is reachable; remember this index unless the
                    // lower limit was already crossed earlier, then stop.
                    if (last == -1)
                    {
                        last = ix;
                    }
                    break;
                }
                if (prob >= limit && last == -1)
                {
                    last = ix;
                }
            }

            if (prob < Threshold)
            {
                //TODO: throw a dedicated configuration exception type once one is available.
                throw new Exception(String.Format("Maximum possible probability is {0}, which is below threshold ({1}" +
                                                  "), which means no duplicates will ever be found", prob, Threshold));
            }
            if (last == -1)
            {
                _lookups.Clear();
            }
            else
            {
                // GetRange takes (index, count), not (start, end): the tail starting at
                // `last` has Count - last elements. Passing Count as the count throws
                // ArgumentException for any last > 0. GetRange already returns a new list.
                _lookups = candidates.GetRange(last, candidates.Count - last);
            }
        }