Esempio n. 1
0
        /// <summary>
        /// Evaluates the emptiness condition selected on <paramref name="question"/> against two field values.
        /// </summary>
        /// <param name="question">Question whose <c>OneFieldValueIsEmpty</c> / <c>BothFieldValuesAreEmpty</c> flags pick the rule.</param>
        /// <param name="column1">Field value from the first record.</param>
        /// <param name="column2">Field value from the second record.</param>
        /// <returns>
        /// With <c>OneFieldValueIsEmpty</c>: true when exactly one value is the empty string.
        /// Otherwise, with <c>BothFieldValuesAreEmpty</c>: true when both values are the empty string.
        /// </returns>
        /// <exception cref="ArgumentException">Neither flag is set on the question.</exception>
        public static bool BasedOnEmptyFields(SplittingQuestion question, string column1, string column2)
        {
            if (question.OneFieldValueIsEmpty)
            {
                bool firstIsEmpty  = column1 == "";
                bool secondIsEmpty = column2 == "";

                // Exactly one side empty: the two emptiness flags must differ.
                return(firstIsEmpty != secondIsEmpty);
            }

            if (question.BothFieldValuesAreEmpty)
            {
                return(column1 == "" && column2 == "");
            }

            throw new ArgumentException();
        }
Esempio n. 2
0
        /// <summary>
        /// Tests whether two field values are within the question's maximum edit distance.
        /// </summary>
        /// <param name="question">Supplies the field being compared and <c>MaximumEditDistance</c>.</param>
        /// <param name="column1">Field value from the first record.</param>
        /// <param name="column2">Field value from the second record.</param>
        /// <returns>True when the (smallest) edit distance does not exceed <c>MaximumEditDistance</c>.</returns>
        /// <remarks>
        /// For <c>FieldEnum.Phone2</c> each value is split on the literal <c>"^^"</c> and the smallest
        /// distance over every cross pairing of the pieces is used.
        /// </remarks>
        public static bool BasedOnEditDistance(SplittingQuestion question, string column1, string column2)
        {
            if (question.Field != FieldEnum.Phone2)
            {
                // Single-valued field: compare the two strings directly.
                int directDistance = NLP.EditDistance.Compute(column1, column2);

                return(directDistance <= question.MaximumEditDistance);
            }

            string[] separator     = new string[] { "^^" };
            string[] firstNumbers  = column1.Split(separator, StringSplitOptions.None);
            string[] secondNumbers = column2.Split(separator, StringSplitOptions.None);

            int smallestDistance = int.MaxValue;

            foreach (string firstNumber in firstNumbers)
            {
                foreach (string secondNumber in secondNumbers)
                {
                    int candidate = NLP.EditDistance.Compute(firstNumber, secondNumber);

                    if (candidate < smallestDistance)
                    {
                        smallestDistance = candidate;
                    }
                }
            }

            return(smallestDistance <= question.MaximumEditDistance);
        }
Esempio n. 3
0
        /// <summary>
        /// Tests whether the MRN values of the two records lie within <c>question.MRNMaxDistance</c> of each other.
        /// </summary>
        /// <param name="question">Supplies <c>MRNMaxDistance</c>.</param>
        /// <param name="pair">The two records being compared.</param>
        /// <returns>
        /// False when either MRN is not greater than zero; otherwise whether the absolute
        /// difference of the two MRNs is at most the maximum distance.
        /// </returns>
        public static bool BasedOnMRNDistance(SplittingQuestion question, RecordPair pair)
        {
            int allowedDistance = question.MRNMaxDistance;

            // Both records need a positive MRN before a distance is computed.
            if (pair.Record1.MRN <= 0 || pair.Record2.MRN <= 0)
            {
                return(false);
            }

            return(System.Math.Abs(pair.Record1.MRN - pair.Record2.MRN) <= allowedDistance);
        }
Esempio n. 4
0
        /// <summary>
        /// Dispatches a record pair to the matcher selected by <c>question.MatchType</c>.
        /// </summary>
        /// <param name="question">Question supplying the match type and the field to compare.</param>
        /// <param name="pair">The two records being compared.</param>
        /// <returns>The result of the selected matcher.</returns>
        /// <exception cref="ArgumentException">The match type is not one of the handled values.</exception>
        /// <remarks>
        /// A <c>SoftMatch</c> question on a field other than <c>Address1</c> or <c>DOB</c> yields false.
        /// </remarks>
        public static bool ComputeSplitDirection(SplittingQuestion question, RecordPair pair)
        {
            // The cached string values are indexed by the numeric value of the field enum.
            string column1 = pair.Record1.Cache[(int)question.Field];
            string column2 = pair.Record2.Cache[(int)question.Field];

            switch (question.MatchType)
            {
            case MatchTypeEnum.MRNDistance:
                return(MatchTypeMatcher.BasedOnMRNDistance(question, pair));

            case MatchTypeEnum.LivesInMassResidence:
                return(MatchTypeMatcher.BasedOnLivesInMassResidence(question, pair));

            case MatchTypeEnum.IsHomeless:
                return(MatchTypeMatcher.BasedOnIsHomeless(question, pair));

            case MatchTypeEnum.EditDistance:
                return(MatchTypeMatcher.BasedOnEditDistance(question, column1, column2));

            case MatchTypeEnum.EmptyMatch:
                return(MatchTypeMatcher.BasedOnEmptyFields(question, column1, column2));

            case MatchTypeEnum.SoftMatch:
                if (question.Field == FieldEnum.Address1)
                {
                    return(MatchTypeMatcher.BasedOnAddressSoftMatch(question, column1, column2));
                }

                if (question.Field == FieldEnum.DOB)
                {
                    return(MatchTypeMatcher.BasedOnDateSoftMatch(question, column1, column2));
                }

                // No soft-match rule exists for any other field.
                return(false);

            case MatchTypeEnum.IsFemale:
                return(MatchTypeMatcher.BasedOnIsFemale(question, pair));

            default:
                throw new ArgumentException();
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Soft-matches two addresses by the space-separated tokens they have in common.
        /// </summary>
        /// <param name="question">Not read by this method.</param>
        /// <param name="column1">Address from the first record.</param>
        /// <param name="column2">Address from the second record.</param>
        /// <returns>
        /// True when both addresses have more than one token, they share at least two distinct tokens
        /// that are not in the list returned by <c>AddressSuffixLoader.GetAllStreetSuffixes()</c>,
        /// and at least one of those shared tokens parses as an integer.
        /// </returns>
        public static bool BasedOnAddressSoftMatch(SplittingQuestion question, string column1, string column2)
        {
            string[] firstTokens  = column1.Split(' ');
            string[] secondTokens = column2.Split(' ');

            if (firstTokens.Length <= 1 || secondTokens.Length <= 1)
            {
                return(false);
            }

            string[]      streetSuffixes = AddressSuffixLoader.GetAllStreetSuffixes();
            List <string> sharedTokens   = new List <string>();
            bool          sharedInteger  = false;

            foreach (string token in firstTokens)
            {
                // Skip tokens absent from the other address, already recorded, or that are suffixes.
                if (!secondTokens.Contains(token) || sharedTokens.Contains(token) || streetSuffixes.Contains(token))
                {
                    continue;
                }

                sharedTokens.Add(token);

                int parsed;
                if (int.TryParse(token, out parsed))
                {
                    sharedInteger = true;
                }
            }

            return(sharedTokens.Count >= 2 && sharedInteger);
        }
Esempio n. 6
0
        /// <summary>
        /// Decides which branch a labeled point takes for a splitting question by comparing two
        /// probe values read at offsets from the point.
        /// </summary>
        /// <param name="point">Point whose <c>X</c>/<c>Y</c> and <c>SourceTomogram</c> are probed.</param>
        /// <param name="question">Supplies the U and V probe offsets and the threshold.</param>
        /// <param name="options">Supplies <c>OutOfRangeValue</c>, used when a probe falls outside the frame.</param>
        /// <returns>
        /// <c>SplitDirection.Left</c> when (U value - V value) is less than <c>question.Threshold</c>,
        /// otherwise <c>SplitDirection.Right</c>.
        /// </returns>
        /// <remarks>
        /// If either probe is outside the frame, both values are set to <c>OutOfRangeValue</c>.
        /// Each coordinate is bounds-checked on its own: the earlier version only tested the sign of the
        /// flattened index, so a probe whose X was out of range on any row but the first silently read
        /// a pixel from the previous row.
        /// NOTE: a per-probe distance-threshold check was disabled (commented out) in the previous
        /// version and is not applied here either.
        /// </remarks>
        private static SplitDirection ComputeSplitDirection(LabeledPoint point, SplittingQuestion question,
                                                            DecisionTreeOptions options)
        {
            int frameHeight = point.SourceTomogram.Height;
            int frameWidth  = point.SourceTomogram.Width;

            int uY = point.Y + question.OffsetUY;
            int uX = point.X + question.OffsetUX;
            int vY = point.Y + question.OffsetVY;
            int vX = point.X + question.OffsetVX;

            bool uInFrame = uX >= 0 && uX < frameWidth && uY >= 0 && uY < frameHeight;
            bool vInFrame = vX >= 0 && vX < frameWidth && vY >= 0 && vY < frameHeight;

            float uVal;
            float vVal;

            if (uInFrame && vInFrame)
            {
                // Data is addressed as a flattened row-major index: row * width + column.
                uVal = point.SourceTomogram.Data[uY * frameWidth + uX];
                vVal = point.SourceTomogram.Data[vY * frameWidth + vX];
            }
            else
            {
                uVal = vVal = options.OutOfRangeValue;
            }

            if ((uVal - vVal) < question.Threshold)
            {
                return(SplitDirection.Left);
            }

            return(SplitDirection.Right);
        }
Esempio n. 7
0
        /// <summary>
        /// Recursively grows a decision tree below <paramref name="currentNode"/>: picks the splitting
        /// question with the highest gain, partitions the points with it and recurses into both halves.
        /// </summary>
        /// <param name="trainingPoints">Points that reached this node.</param>
        /// <param name="splittingQuestions">Candidate questions; the same list is reused at every level.</param>
        /// <param name="currentRecursionLevel">Depth of this node; also printed to the console.</param>
        /// <param name="options">Supplies the maximum recursion level and the sufficient gain level.</param>
        /// <param name="currentNode">Node to fill in, either as a leaf or as an inner node with two new children.</param>
        /// <param name="random">Only passed on to the recursive calls.</param>
        /// <remarks>
        /// The node becomes a leaf when the maximum recursion level is reached or when no question's
        /// gain exceeds <c>options.SufficientGainLevel</c>.
        /// Candidate questions are scored in parallel. The running best is guarded by a lock object
        /// private to this call rather than by a <c>Type</c> object, which any other code could also lock on.
        /// When several questions tie on gain, which one wins depends on thread scheduling.
        /// </remarks>
        private static void RecurseAndPartition(List <LabeledPoint> trainingPoints, List <SplittingQuestion> splittingQuestions,
                                                int currentRecursionLevel, DecisionTreeOptions options, DecisionTreeNode currentNode, Random random)
        {
            Console.WriteLine($"{new String('-', currentRecursionLevel)}{currentRecursionLevel}");

            if (currentRecursionLevel >= options.MaximumNumberOfRecursionLevels)
            {
                // Depth limit reached: create leaf node.
                MakeLeafNode(currentNode, trainingPoints);
                return;
            }

            double            currentShannonEntropy = ComputeShannonEntropy(trainingPoints);
            double            highestGain           = double.MinValue;
            SplittingQuestion bestSplittingQuestion = null;
            object            bestQuestionGate      = new object();

            Parallel.For(0, splittingQuestions.Count, s =>
            {
                // Per-class counters for each side, indexed by the point's label (0 or 1).
                List <LabeledPointGroup> leftBucket = new List <LabeledPointGroup>
                {
                    new LabeledPointGroup { Count = 0, Class = 0 },
                    new LabeledPointGroup { Count = 0, Class = 1 }
                };
                List <LabeledPointGroup> rightBucket = new List <LabeledPointGroup>
                {
                    new LabeledPointGroup { Count = 0, Class = 0 },
                    new LabeledPointGroup { Count = 0, Class = 1 }
                };

                SplittingQuestion splittingQuestion = splittingQuestions[s];

                for (int p = 0; p < trainingPoints.Count; p++)
                {
                    LabeledPoint trainingPoint = trainingPoints[p];

                    SplitDirection split = ComputeSplitDirection(trainingPoint, splittingQuestion, options);

                    if (split == SplitDirection.Left)
                    {
                        leftBucket[trainingPoint.Label].Count++;
                    }
                    else
                    {
                        rightBucket[trainingPoint.Label].Count++;
                    }
                }

                double gain = ComputeGain(currentShannonEntropy, leftBucket, rightBucket);

                lock (bestQuestionGate)
                {
                    if (gain > highestGain)
                    {
                        highestGain           = gain;
                        bestSplittingQuestion = splittingQuestion;
                    }
                }
            });

            // A null best question means no candidate improved on the initial gain; treat that as a leaf.
            if (bestSplittingQuestion != null && highestGain > options.SufficientGainLevel)
            {
                List <LabeledPoint> bestLeftBucket  = new List <LabeledPoint>();
                List <LabeledPoint> bestRightBucket = new List <LabeledPoint>();

                for (int p = 0; p < trainingPoints.Count; p++)
                {
                    LabeledPoint trainingPoint = trainingPoints[p];

                    SplitDirection split = ComputeSplitDirection(trainingPoint, bestSplittingQuestion, options);

                    if (split == SplitDirection.Left)
                    {
                        bestLeftBucket.Add(trainingPoint);
                    }
                    else
                    {
                        bestRightBucket.Add(trainingPoint);
                    }
                }

                currentNode.Question    = bestSplittingQuestion;
                currentNode.LeftBranch  = new DecisionTreeNode();
                currentNode.RightBranch = new DecisionTreeNode();
                currentNode.IsLeaf      = false;

                RecurseAndPartition(bestLeftBucket, splittingQuestions,
                                    currentRecursionLevel + 1, options, currentNode.LeftBranch, random);

                RecurseAndPartition(bestRightBucket, splittingQuestions,
                                    currentRecursionLevel + 1, options, currentNode.RightBranch, random);
            }
            else
            {
                MakeLeafNode(currentNode, trainingPoints);
            }
        }
Esempio n. 8
0
        /// <summary>
        /// Soft-matches two date strings using a cascade of near-miss rules.
        /// </summary>
        /// <param name="question">Not read by this method.</param>
        /// <param name="column1">Date text from the first record.</param>
        /// <param name="column2">Date text from the second record.</param>
        /// <returns>
        /// False when either string fails <c>DateTime.TryParse</c>. Otherwise true as soon as one rule holds:
        /// (1) the day of one date equals the month of the other;
        /// (2) same day and month, years exactly 10 apart;
        /// (3) same month and year, days exactly 1 apart;
        /// (4) the raw strings have equal length and contain exactly one position where the characters
        ///     form a visually similar digit pair (2/3, 5/6 or 1/7).
        /// </returns>
        /// <remarks>
        /// Two defects of the earlier version are fixed: rule (2) subtracted the first year from itself and
        /// could never succeed, and rule (4) tested the 3/2 pair twice instead of covering 2/3.
        /// NOTE(review): rule (1) accepts a day/month coincidence on either side rather than a full
        /// swap of both; left as is, confirm this is intended.
        /// Parsing uses the current culture, as before.
        /// </remarks>
        public static bool BasedOnDateSoftMatch(SplittingQuestion question, string column1, string column2)
        {
            DateTime column1Date;
            DateTime column2Date;

            if (!DateTime.TryParse(column1, out column1Date) ||
                !DateTime.TryParse(column2, out column2Date))
            {
                return(false);
            }

            // Rule 1: day of one date equals month of the other.
            if (column1Date.Day == column2Date.Month ||
                column2Date.Day == column1Date.Month)
            {
                return(true);
            }

            // Rule 2: same day and month, exactly a decade apart.
            if (column1Date.Day == column2Date.Day &&
                column1Date.Month == column2Date.Month &&
                System.Math.Abs(column1Date.Year - column2Date.Year) == 10)
            {
                return(true);
            }

            // Rule 3: adjacent days within the same month and year.
            if (System.Math.Abs(column1Date.Day - column2Date.Day) == 1 &&
                column1Date.Month == column2Date.Month &&
                column1Date.Year == column2Date.Year)
            {
                return(true);
            }

            // Rule 4 compares the raw text position by position, so the lengths must agree.
            if (column1.Length != column2.Length)
            {
                return(false);
            }

            int visuallySimilarChars = 0;
            for (int c = 0; c < column1.Length; c++)
            {
                char column1Char = column1[c];
                char column2Char = column2[c];

                bool similarPair =
                    (column1Char == '3' && column2Char == '2') ||
                    (column1Char == '2' && column2Char == '3') ||
                    (column1Char == '5' && column2Char == '6') ||
                    (column1Char == '6' && column2Char == '5') ||
                    (column1Char == '1' && column2Char == '7') ||
                    (column1Char == '7' && column2Char == '1');

                if (similarPair)
                {
                    visuallySimilarChars++;

                    // Two or more such positions is too dissimilar; the result is already decided.
                    if (visuallySimilarChars >= 2)
                    {
                        break;
                    }
                }
            }

            return(visuallySimilarChars == 1);
        }
Esempio n. 9
0
 /// <summary>
 /// Tests whether both records of the pair carry the gender value <c>"F"</c>.
 /// </summary>
 /// <param name="question">Not read by this method.</param>
 /// <param name="pair">The two records being compared.</param>
 /// <returns>True when the two gender values are equal and that value is <c>"F"</c>.</returns>
 public static bool BasedOnIsFemale(SplittingQuestion question, RecordPair pair)
 {
     var firstGender  = pair.Record1.Gender;
     var secondGender = pair.Record2.Gender;

     if (firstGender != secondGender)
     {
         return(false);
     }

     return(firstGender == "F");
 }
Esempio n. 10
0
 /// <summary>
 /// Tests whether the first record's <c>Address1</c> or <c>Address2</c> is exactly <c>"HOMELESS"</c>.
 /// </summary>
 /// <param name="question">Not read by this method.</param>
 /// <param name="pair">Record pair; only <c>Record1</c> is inspected.</param>
 /// <returns>True when either address value of <c>Record1</c> equals <c>"HOMELESS"</c>.</returns>
 public static bool BasedOnIsHomeless(SplittingQuestion question, RecordPair pair)
 {
     // NOTE(review): Record2 is never consulted here, unlike the mass-residence matcher which
     // looks at both records - confirm whether that is intended.
     if (pair.Record1.Address1 == "HOMELESS")
     {
         return(true);
     }

     return(pair.Record1.Address2 == "HOMELESS");
 }
Esempio n. 11
0
 /// <summary>
 /// Tests whether at least one record of the pair has <c>LivesInLargeResidence</c> set.
 /// </summary>
 /// <param name="question">Not read by this method.</param>
 /// <param name="pair">The two records being compared.</param>
 /// <returns>True when the flag is set on <c>Record1</c> or on <c>Record2</c>.</returns>
 public static bool BasedOnLivesInMassResidence(SplittingQuestion question, RecordPair pair)
 {
     if (pair.Record1.LivesInLargeResidence)
     {
         return(true);
     }

     return(pair.Record2.LivesInLargeResidence);
 }
Esempio n. 12
0
        /// <summary>
        /// Recursively builds (or re-walks) the decision tree below <paramref name="parentNode"/> for a set
        /// of record pairs.
        /// </summary>
        /// <param name="parentNode">
        /// Node to process. When its <c>Question</c> is already set the node is treated as precomputed and
        /// its question and leaf flag are reused; otherwise the best question is searched for.
        /// </param>
        /// <param name="splittingQuestions">Candidate questions, reused at every level.</param>
        /// <param name="allPairs">Record pairs that reached this node.</param>
        /// <param name="level">Depth of this node; used for console output and incremented on recursion.</param>
        /// <param name="subsamplingPercentage">Only passed on to the recursive calls.</param>
        /// <param name="minGainToBreak">A computed node becomes a leaf when the best gain does not exceed this value.</param>
        /// <param name="splittingQuestionsThatGotUsHere">
        /// Stack of (question, went-left) decisions on the path to this node; pushed and popped around each recursive call.
        /// </param>
        /// <remarks>
        /// Changes from the earlier version: the "Gain limit met" message is printed only for nodes that
        /// actually become leaves (it used to be printed for every computed node); unused per-question
        /// lists, console cursor reads and progress counters were removed; the running best is guarded
        /// by a lock object private to this call instead of the shared questions array; and a node for
        /// which no question was selected is made a leaf rather than being split with a null question.
        /// The two result buckets are filled in parallel, so the order of pairs inside them is not deterministic.
        /// </remarks>
        private void RecurseAndPartition(DecisionTreeNode parentNode, SplittingQuestion[] splittingQuestions,
                                         RecordPair[] allPairs, int level, double subsamplingPercentage, double minGainToBreak,
                                         Stack <Tuple <SplittingQuestion, bool> > splittingQuestionsThatGotUsHere)
        {
            Console.WriteLine($"Level {level}. {splittingQuestions.Length} splitting questions on {allPairs.Length} record pairs.");

            SplittingQuestion bestQuestion    = null;
            bool              reachedLeafNode = false;

            // Is this node precomputed? If not, find the best splitting question.
            if (parentNode.Question == null)
            {
                double highestGain      = 0.0;
                double currentEntropy   = ComputeShannonEntropy(allPairs);
                object bestQuestionGate = new object();

                Parallel.ForEach(splittingQuestions, splittingQuestion =>
                {
                    int matchesInLeft  = 0, noMatchesInLeft  = 0,
                        matchesInRight = 0, noMatchesInRight = 0;

                    foreach (RecordPair pair in allPairs)
                    {
                        bool goLeft = ComputeSplitDirection(splittingQuestion, pair);

                        if (goLeft)
                        {
                            if (pair.IsMatch)
                            {
                                matchesInLeft++;
                            }
                            else
                            {
                                noMatchesInLeft++;
                            }
                        }
                        else
                        {
                            if (pair.IsMatch)
                            {
                                matchesInRight++;
                            }
                            else
                            {
                                noMatchesInRight++;
                            }
                        }
                    }

                    double leftEntropy  = ComputeShannonEntropy(matchesInLeft, noMatchesInLeft);
                    double rightEntropy = ComputeShannonEntropy(matchesInRight, noMatchesInRight);

                    double gain = ComputeGain(currentEntropy, leftEntropy, (noMatchesInLeft + matchesInLeft),
                                              rightEntropy, (matchesInRight + noMatchesInRight));

                    lock (bestQuestionGate)
                    {
                        if (gain > highestGain)
                        {
                            highestGain  = gain;
                            bestQuestion = splittingQuestion;
                        }
                    }
                });

                // Without a selected question there is nothing to split on, so the node must be a leaf.
                reachedLeafNode = bestQuestion == null || highestGain <= minGainToBreak;

                if (reachedLeafNode)
                {
                    int matchCount   = allPairs.Count(n => n.IsMatch);
                    int noMatchCount = allPairs.Length - matchCount;

                    parentNode.IsLeaf  = true;
                    parentNode.IsMatch = matchCount > noMatchCount;

                    Console.WriteLine("\tGain limit met. Anything reaching this leaf will be labeled as " + (parentNode.IsMatch ? "match." : "no match"));
                }
            }
            else
            {
                // Otherwise it's precomputed; just take the current question as the "best question".
                bestQuestion    = parentNode.Question;
                reachedLeafNode = parentNode.IsLeaf;

                Console.WriteLine("\tPrecomputed. Anything reaching this leaf will be labeled as " + (parentNode.IsMatch ? "match." : "no match"));
            }

            if (reachedLeafNode)
            {
                return;
            }

            Console.WriteLine($"\tBest question at this level is {bestQuestion}");

            List <RecordPair> bestLeftBucket  = new List <RecordPair>();
            List <RecordPair> bestRightBucket = new List <RecordPair>();

            Parallel.ForEach(allPairs, pair =>
            {
                bool goLeft = ComputeSplitDirection(bestQuestion, pair);

                // Each bucket is guarded by its own lock so the two sides do not contend with each other.
                if (goLeft)
                {
                    lock (bestLeftBucket)
                    {
                        bestLeftBucket.Add(pair);
                    }
                }
                else
                {
                    lock (bestRightBucket)
                    {
                        bestRightBucket.Add(pair);
                    }
                }
            });

            parentNode.Question = bestQuestion;

            // Existing children are kept so that a precomputed subtree is re-walked rather than discarded.
            if (parentNode.LeftBranch == null)
            {
                parentNode.LeftBranch = new DecisionTreeNode();
            }

            if (parentNode.RightBranch == null)
            {
                parentNode.RightBranch = new DecisionTreeNode();
            }

            splittingQuestionsThatGotUsHere.Push(new Tuple <SplittingQuestion, bool>(bestQuestion, true));
            RecurseAndPartition(parentNode.LeftBranch, splittingQuestions, bestLeftBucket.ToArray(),
                                level + 1, subsamplingPercentage, minGainToBreak, splittingQuestionsThatGotUsHere);
            splittingQuestionsThatGotUsHere.Pop();

            splittingQuestionsThatGotUsHere.Push(new Tuple <SplittingQuestion, bool>(bestQuestion, false));
            RecurseAndPartition(parentNode.RightBranch, splittingQuestions, bestRightBucket.ToArray(),
                                level + 1, subsamplingPercentage, minGainToBreak, splittingQuestionsThatGotUsHere);
            splittingQuestionsThatGotUsHere.Pop();
        }