/// <summary>
/// Tests two field values against the emptiness condition selected by
/// <paramref name="question"/>.
/// </summary>
/// <param name="question">
/// Chooses the test: <c>OneFieldValueIsEmpty</c> is consulted first, then
/// <c>BothFieldValuesAreEmpty</c>.
/// </param>
/// <param name="column1">First value to test.</param>
/// <param name="column2">Second value to test.</param>
/// <returns>
/// For <c>OneFieldValueIsEmpty</c>: <c>true</c> when exactly one value equals the empty string.
/// For <c>BothFieldValuesAreEmpty</c>: <c>true</c> when both values equal the empty string.
/// </returns>
/// <exception cref="ArgumentException">Neither flag is set on <paramref name="question"/>.</exception>
public static bool BasedOnEmptyFields(SplittingQuestion question, string column1, string column2)
{
    // Only "" counts as empty; a null reference compares unequal to "".
    bool firstIsEmpty = column1 == "";
    bool secondIsEmpty = column2 == "";

    if (question.OneFieldValueIsEmpty)
    {
        // Exclusive or: one side empty, the other not.
        return firstIsEmpty != secondIsEmpty;
    }

    if (question.BothFieldValuesAreEmpty)
    {
        return firstIsEmpty && secondIsEmpty;
    }

    throw new ArgumentException();
}
/// <summary>
/// Decides whether two field values are within the question's maximum edit distance,
/// using <c>NLP.EditDistance.Compute</c> as the distance measure.
/// </summary>
/// <param name="question">
/// Supplies <c>Field</c> (selects the multi-value handling for <c>FieldEnum.Phone2</c>)
/// and <c>MaximumEditDistance</c> (inclusive upper bound).
/// </param>
/// <param name="column1">First value to compare.</param>
/// <param name="column2">Second value to compare.</param>
/// <returns>
/// <c>true</c> when the distance is less than or equal to <c>MaximumEditDistance</c>.
/// For <c>Phone2</c> both values are split on <c>"^^"</c> and the smallest distance over
/// every cross pairing is used.
/// </returns>
public static bool BasedOnEditDistance(SplittingQuestion question, string column1, string column2)
{
    if (question.Field != FieldEnum.Phone2)
    {
        return NLP.EditDistance.Compute(column1, column2) <= question.MaximumEditDistance;
    }

    string[] separator = new string[] { "^^" };
    string[] leftNumbers = column1.Split(separator, StringSplitOptions.None);
    string[] rightNumbers = column2.Split(separator, StringSplitOptions.None);

    // Track the closest pairing across every combination of numbers.
    int smallestDistance = int.MaxValue;
    foreach (string leftNumber in leftNumbers)
    {
        foreach (string rightNumber in rightNumbers)
        {
            int candidate = NLP.EditDistance.Compute(leftNumber, rightNumber);
            smallestDistance = System.Math.Min(smallestDistance, candidate);
        }
    }

    return smallestDistance <= question.MaximumEditDistance;
}
/// <summary>
/// Decides whether the MRN values of the two records in <paramref name="pair"/> lie within
/// the question's maximum distance of each other.
/// </summary>
/// <param name="question">Supplies <c>MRNMaxDistance</c>, the inclusive upper bound.</param>
/// <param name="pair">The two records whose <c>MRN</c> values are compared.</param>
/// <returns>
/// <c>false</c> unless both MRNs are greater than zero; otherwise whether the absolute
/// difference is less than or equal to <c>MRNMaxDistance</c>.
/// </returns>
public static bool BasedOnMRNDistance(SplittingQuestion question, RecordPair pair)
{
    int maxDistance = question.MRNMaxDistance;

    bool bothPositive = pair.Record1.MRN > 0 && pair.Record2.MRN > 0;
    if (!bothPositive)
    {
        return false;
    }

    return System.Math.Abs(pair.Record1.MRN - pair.Record2.MRN) <= maxDistance;
}
/// <summary>
/// Evaluates a splitting question against a record pair by dispatching on the question's
/// <c>MatchType</c> to the corresponding <c>MatchTypeMatcher</c> routine.
/// </summary>
/// <param name="question">The question to evaluate; its <c>Field</c> selects the cached column.</param>
/// <param name="pair">The record pair being tested.</param>
/// <returns>
/// The result of the selected matcher. A <c>SoftMatch</c> question on a field other than
/// <c>Address1</c> or <c>DOB</c> yields <c>false</c>.
/// </returns>
/// <exception cref="ArgumentException">The question's <c>MatchType</c> is not handled.</exception>
public static bool ComputeSplitDirection(SplittingQuestion question, RecordPair pair)
{
    // The cached columns are fetched for every match type, even those that only look at the pair.
    string column1 = pair.Record1.Cache[(int)question.Field];
    string column2 = pair.Record2.Cache[(int)question.Field];

    switch (question.MatchType)
    {
        case MatchTypeEnum.MRNDistance:
            return MatchTypeMatcher.BasedOnMRNDistance(question, pair);

        case MatchTypeEnum.LivesInMassResidence:
            return MatchTypeMatcher.BasedOnLivesInMassResidence(question, pair);

        case MatchTypeEnum.IsHomeless:
            return MatchTypeMatcher.BasedOnIsHomeless(question, pair);

        case MatchTypeEnum.EditDistance:
            return MatchTypeMatcher.BasedOnEditDistance(question, column1, column2);

        case MatchTypeEnum.EmptyMatch:
            return MatchTypeMatcher.BasedOnEmptyFields(question, column1, column2);

        case MatchTypeEnum.SoftMatch:
            if (question.Field == FieldEnum.Address1)
            {
                return MatchTypeMatcher.BasedOnAddressSoftMatch(question, column1, column2);
            }
            if (question.Field == FieldEnum.DOB)
            {
                return MatchTypeMatcher.BasedOnDateSoftMatch(question, column1, column2);
            }
            // No soft-match rule exists for any other field.
            return false;

        case MatchTypeEnum.IsFemale:
            return MatchTypeMatcher.BasedOnIsFemale(question, pair);

        default:
            throw new ArgumentException();
    }
}
/// <summary>
/// Soft-matches two address strings by counting the distinct space-separated tokens they
/// share, ignoring tokens returned by <c>AddressSuffixLoader.GetAllStreetSuffixes()</c>.
/// </summary>
/// <param name="question">Not consulted by this matcher.</param>
/// <param name="column1">First address string.</param>
/// <param name="column2">Second address string.</param>
/// <returns>
/// <c>true</c> when both addresses contain more than one token, they share at least two
/// distinct non-suffix tokens, and at least one shared token parses as an integer.
/// </returns>
public static bool BasedOnAddressSoftMatch(SplittingQuestion question, string column1, string column2)
{
    // Drop empty tokens: with a plain Split(' '), repeated or trailing spaces produce ""
    // entries that inflate the token count and count as a "shared" address part.
    char[] separators = new char[] { ' ' };
    string[] address1Bits = column1.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    string[] address2Bits = column2.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    if (address1Bits.Length <= 1 || address2Bits.Length <= 1)
    {
        return false;
    }

    string[] suffixes = AddressSuffixLoader.GetAllStreetSuffixes();
    HashSet<string> address2Tokens = new HashSet<string>(address2Bits);
    HashSet<string> sharedTokens = new HashSet<string>();
    bool integerFound = false;

    foreach (string address1Bit in address1Bits)
    {
        if (!address2Tokens.Contains(address1Bit) || suffixes.Contains(address1Bit))
        {
            continue;
        }

        // Add returns false for a token already recorded, so each token is counted once.
        if (sharedTokens.Add(address1Bit))
        {
            int ignored;
            if (int.TryParse(address1Bit, out ignored))
            {
                integerFound = true;
            }
        }
    }

    return sharedTokens.Count >= 2 && integerFound;
}
/// <summary>
/// Evaluates a splitting question at a labeled point by comparing the values at two
/// offset positions (u and v) in the point's source tomogram.
/// </summary>
/// <param name="point">The point being classified; supplies X, Y and the source tomogram.</param>
/// <param name="question">Supplies the u/v offsets and the decision threshold.</param>
/// <param name="options">Supplies <c>OutOfRangeValue</c>, used when a probe falls outside the frame.</param>
/// <returns>
/// <c>SplitDirection.Left</c> when <c>uVal - vVal</c> is less than the question's threshold,
/// otherwise <c>SplitDirection.Right</c>. If either probe is outside the frame, both values
/// are set to <c>OutOfRangeValue</c>.
/// </returns>
private static SplitDirection ComputeSplitDirection(LabeledPoint point, SplittingQuestion question, DecisionTreeOptions options)
{
    int frameHeight = point.SourceTomogram.Height;
    int frameWidth = point.SourceTomogram.Width;

    int uY = point.Y + question.OffsetUY;
    int uX = point.X + question.OffsetUX;
    int vY = point.Y + question.OffsetVY;
    int vX = point.X + question.OffsetVX;

    // Each coordinate is range-checked on its own. Checking only the flattened index is not
    // enough: an X outside [0, width) still yields a non-negative index that lands on a
    // different row, silently sampling the wrong pixel.
    bool uInFrame = uX >= 0 && uX < frameWidth && uY >= 0 && uY < frameHeight;
    bool vInFrame = vX >= 0 && vX < frameWidth && vY >= 0 && vY < frameHeight;

    float uVal;
    float vVal;
    if (uInFrame && vInFrame)
    {
        // Data is addressed as y * width + x.
        uVal = point.SourceTomogram.Data[uY * frameWidth + uX];
        vVal = point.SourceTomogram.Data[vY * frameWidth + vX];
    }
    else
    {
        uVal = vVal = options.OutOfRangeValue;
    }

    if ((uVal - vVal) < question.Threshold)
    {
        return SplitDirection.Left;
    }

    return SplitDirection.Right;
}
/// <summary>
/// Recursively grows a decision tree below <paramref name="currentNode"/>: picks the
/// splitting question with the highest gain over <paramref name="trainingPoints"/>,
/// partitions the points with it, and recurses into both halves.
/// </summary>
/// <param name="trainingPoints">Points that reached this node.</param>
/// <param name="splittingQuestions">Candidate questions; the same list is reused at every level.</param>
/// <param name="currentRecursionLevel">Depth of this node; also printed as a progress line.</param>
/// <param name="options">
/// Supplies <c>MaximumNumberOfRecursionLevels</c> (depth limit) and <c>SufficientGainLevel</c>
/// (the gain a split must exceed).
/// </param>
/// <param name="currentNode">Node to populate, either as a leaf or as an internal node with two new children.</param>
/// <param name="random">Not used here other than being forwarded to the recursive calls.</param>
private static void RecurseAndPartition(List<LabeledPoint> trainingPoints, List<SplittingQuestion> splittingQuestions, int currentRecursionLevel, DecisionTreeOptions options, DecisionTreeNode currentNode, Random random)
{
    Console.WriteLine($"{new String('-', currentRecursionLevel)}{currentRecursionLevel}");

    if (currentRecursionLevel >= options.MaximumNumberOfRecursionLevels)
    {
        // Depth limit reached: stop splitting.
        MakeLeafNode(currentNode, trainingPoints);
        return;
    }

    double currentShannonEntropy = ComputeShannonEntropy(trainingPoints);
    double highestGain = double.MinValue;
    SplittingQuestion bestSplittingQuestion = null;
    int bestQuestionIndex = -1;

    // A private gate for this call. Locking on a Type object is avoided because any other
    // code in the process can lock the same object.
    object bestGate = new object();

    Parallel.For(0, splittingQuestions.Count, s =>
    {
        SplittingQuestion splittingQuestion = splittingQuestions[s];

        // Per-class counters, indexed by label (0 or 1), for each side of the split.
        List<LabeledPointGroup> leftBucket = new List<LabeledPointGroup>
        {
            new LabeledPointGroup { Count = 0, Class = 0 },
            new LabeledPointGroup { Count = 0, Class = 1 }
        };
        List<LabeledPointGroup> rightBucket = new List<LabeledPointGroup>
        {
            new LabeledPointGroup { Count = 0, Class = 0 },
            new LabeledPointGroup { Count = 0, Class = 1 }
        };

        for (int p = 0; p < trainingPoints.Count; p++)
        {
            LabeledPoint trainingPoint = trainingPoints[p];
            SplitDirection split = ComputeSplitDirection(trainingPoint, splittingQuestion, options);
            if (split == SplitDirection.Left)
            {
                leftBucket[trainingPoint.Label].Count++;
            }
            else
            {
                rightBucket[trainingPoint.Label].Count++;
            }
        }

        double gain = ComputeGain(currentShannonEntropy, leftBucket, rightBucket);

        lock (bestGate)
        {
            bool strictlyBetter = gain > highestGain;

            // Exact ties go to the lowest question index so the chosen question does not
            // depend on which worker thread finishes first.
            bool earlierTie = bestSplittingQuestion != null
                && gain == highestGain
                && s < bestQuestionIndex;

            if (strictlyBetter || earlierTie)
            {
                highestGain = gain;
                bestSplittingQuestion = splittingQuestion;
                bestQuestionIndex = s;
            }
        }
    });

    if (highestGain > options.SufficientGainLevel)
    {
        List<LabeledPoint> bestLeftBucket = new List<LabeledPoint>();
        List<LabeledPoint> bestRightBucket = new List<LabeledPoint>();

        // Only counts were kept during the search, so re-run the winning question to
        // materialise the two partitions.
        for (int p = 0; p < trainingPoints.Count; p++)
        {
            LabeledPoint trainingPoint = trainingPoints[p];
            SplitDirection split = ComputeSplitDirection(trainingPoint, bestSplittingQuestion, options);
            if (split == SplitDirection.Left)
            {
                bestLeftBucket.Add(trainingPoint);
            }
            else
            {
                bestRightBucket.Add(trainingPoint);
            }
        }

        currentNode.Question = bestSplittingQuestion;
        currentNode.LeftBranch = new DecisionTreeNode();
        currentNode.RightBranch = new DecisionTreeNode();
        currentNode.IsLeaf = false;

        RecurseAndPartition(bestLeftBucket, splittingQuestions, currentRecursionLevel + 1, options, currentNode.LeftBranch, random);
        RecurseAndPartition(bestRightBucket, splittingQuestions, currentRecursionLevel + 1, options, currentNode.RightBranch, random);
    }
    else
    {
        // No question improves enough on the current entropy.
        MakeLeafNode(currentNode, trainingPoints);
    }
}
/// <summary>
/// Soft-matches two date strings, accepting a handful of near-miss patterns.
/// </summary>
/// <param name="question">Not consulted by this matcher.</param>
/// <param name="column1">First date string.</param>
/// <param name="column2">Second date string.</param>
/// <returns>
/// <c>false</c> if either string fails <c>DateTime.TryParse</c>. Otherwise <c>true</c> when any
/// of the following holds, checked in order:
/// (1) the day of one date equals the month of the other;
/// (2) same day and month, years exactly 10 apart;
/// (3) same month and year, days exactly 1 apart;
/// (4) the raw strings have equal length and exactly one differing position holds a
/// visually similar digit pair (2/3, 5/6 or 1/7).
/// </returns>
public static bool BasedOnDateSoftMatch(SplittingQuestion question, string column1, string column2)
{
    DateTime column1Date;
    DateTime column2Date;
    if (!DateTime.TryParse(column1, out column1Date) || !DateTime.TryParse(column2, out column2Date))
    {
        return false;
    }

    // NOTE(review): this accepts any pair where one date's day equals the other's month,
    // not only full day/month swaps - confirm that is intended.
    if (column1Date.Day == column2Date.Month || column2Date.Day == column1Date.Month)
    {
        return true;
    }

    // The year difference must be taken between the two dates; subtracting a date's year
    // from itself is always zero and would make this rule unreachable.
    if (column1Date.Day == column2Date.Day
        && column1Date.Month == column2Date.Month
        && System.Math.Abs(column1Date.Year - column2Date.Year) == 10)
    {
        return true;
    }

    if (System.Math.Abs(column1Date.Day - column2Date.Day) == 1
        && column1Date.Month == column2Date.Month
        && column1Date.Year == column2Date.Year)
    {
        return true;
    }

    if (column1.Length != column2.Length)
    {
        return false;
    }

    int visuallySimilarChars = 0;
    for (int c = 0; c < column1.Length; c++)
    {
        if (column1[c] != column2[c] && AreVisuallySimilarDigits(column1[c], column2[c]))
        {
            visuallySimilarChars++;
        }

        // Two or more substitutions is too dissimilar; no need to scan further.
        if (visuallySimilarChars >= 2)
        {
            break;
        }
    }

    return visuallySimilarChars == 1;
}

/// <summary>
/// Reports whether two characters form one of the digit pairs treated as easy to misread
/// (2/3, 5/6, 1/7), in either order.
/// </summary>
private static bool AreVisuallySimilarDigits(char first, char second)
{
    return (first == '3' && second == '2') || (first == '2' && second == '3')
        || (first == '5' && second == '6') || (first == '6' && second == '5')
        || (first == '1' && second == '7') || (first == '7' && second == '1');
}
/// <summary>
/// Reports whether both records in the pair carry the gender value <c>"F"</c>.
/// </summary>
/// <param name="question">Not consulted by this matcher.</param>
/// <param name="pair">The record pair whose <c>Gender</c> values are compared.</param>
/// <returns><c>true</c> when the two genders are equal and that value is <c>"F"</c>.</returns>
public static bool BasedOnIsFemale(SplittingQuestion question, RecordPair pair)
{
    if (pair.Record1.Gender != pair.Record2.Gender)
    {
        return false;
    }

    return pair.Record1.Gender == "F";
}
/// <summary>
/// Reports whether the first record's address is the literal marker <c>"HOMELESS"</c>.
/// </summary>
/// <param name="question">Not consulted by this matcher.</param>
/// <param name="pair">The record pair; only <c>Record1</c> is inspected.</param>
/// <returns>
/// <c>true</c> when <c>Record1.Address1</c> or <c>Record1.Address2</c> equals <c>"HOMELESS"</c>.
/// </returns>
public static bool BasedOnIsHomeless(SplittingQuestion question, RecordPair pair)
{
    // NOTE(review): Record2 is never read here, unlike the sibling pair-level matchers -
    // confirm whether Record2's address should also be checked.
    const string homelessMarker = "HOMELESS";

    if (pair.Record1.Address1 == homelessMarker)
    {
        return true;
    }

    return pair.Record1.Address2 == homelessMarker;
}
/// <summary>
/// Reports whether either record in the pair is flagged as living in a large residence.
/// </summary>
/// <param name="question">Not consulted by this matcher.</param>
/// <param name="pair">The record pair whose <c>LivesInLargeResidence</c> flags are read.</param>
/// <returns><c>true</c> when the flag is set on <c>Record1</c> or on <c>Record2</c>.</returns>
public static bool BasedOnLivesInMassResidence(SplittingQuestion question, RecordPair pair)
{
    if (pair.Record1.LivesInLargeResidence)
    {
        return true;
    }

    return pair.Record2.LivesInLargeResidence;
}
/// <summary>
/// Recursively grows (or replays) a decision tree over record pairs below
/// <paramref name="parentNode"/>.
/// </summary>
/// <remarks>
/// When <c>parentNode.Question</c> is null the best question is searched for in parallel and
/// the node becomes a leaf if the best gain does not exceed <paramref name="minGainToBreak"/>.
/// When a question is already present the node is treated as precomputed and its existing
/// <c>Question</c> / <c>IsLeaf</c> values are used as-is.
/// </remarks>
/// <param name="parentNode">Node to populate or replay.</param>
/// <param name="splittingQuestions">Candidate questions; the same array is reused at every level.</param>
/// <param name="allPairs">Record pairs that reached this node.</param>
/// <param name="level">Depth of this node; used for progress output.</param>
/// <param name="subsamplingPercentage">Not used here other than being forwarded to the recursive calls.</param>
/// <param name="minGainToBreak">A node becomes a leaf when its best gain is less than or equal to this value.</param>
/// <param name="splittingQuestionsThatGotUsHere">
/// Trail of (question, went-left) decisions leading to this node; an entry is pushed before
/// each recursive call and popped after it returns.
/// </param>
private void RecurseAndPartition(DecisionTreeNode parentNode, SplittingQuestion[] splittingQuestions, RecordPair[] allPairs, int level, double subsamplingPercentage, double minGainToBreak, Stack<Tuple<SplittingQuestion, bool>> splittingQuestionsThatGotUsHere)
{
    Console.WriteLine($"Level {level}. {splittingQuestions.Length} splitting questions on {allPairs.Length} record pairs.");

    SplittingQuestion bestQuestion = null;
    bool reachedLeafNode = false;

    if (parentNode.Question == null)
    {
        // Not precomputed: search every candidate question for the highest gain.
        double highestGain = 0.0;
        long bestQuestionIndex = -1;
        double currentEntropy = ComputeShannonEntropy(allPairs);

        // A private gate for this call, rather than locking on the caller-supplied array.
        object bestGate = new object();

        Parallel.ForEach(splittingQuestions, (splittingQuestion, loopState, questionIndex) =>
        {
            int matchesInLeft = 0, noMatchesInLeft = 0, matchesInRight = 0, noMatchesInRight = 0;

            foreach (RecordPair pair in allPairs)
            {
                bool goLeft = ComputeSplitDirection(splittingQuestion, pair);
                if (goLeft)
                {
                    if (pair.IsMatch)
                    {
                        matchesInLeft++;
                    }
                    else
                    {
                        noMatchesInLeft++;
                    }
                }
                else
                {
                    if (pair.IsMatch)
                    {
                        matchesInRight++;
                    }
                    else
                    {
                        noMatchesInRight++;
                    }
                }
            }

            double leftEntropy = ComputeShannonEntropy(matchesInLeft, noMatchesInLeft);
            double rightEntropy = ComputeShannonEntropy(matchesInRight, noMatchesInRight);
            double gain = ComputeGain(currentEntropy, leftEntropy, (noMatchesInLeft + matchesInLeft), rightEntropy, (matchesInRight + noMatchesInRight));

            lock (bestGate)
            {
                bool strictlyBetter = gain > highestGain;

                // Exact ties go to the lowest question index so the chosen question does
                // not depend on which worker thread finishes first.
                bool earlierTie = bestQuestion != null
                    && gain == highestGain
                    && questionIndex < bestQuestionIndex;

                if (strictlyBetter || earlierTie)
                {
                    highestGain = gain;
                    bestQuestion = splittingQuestion;
                    bestQuestionIndex = questionIndex;
                }
            }
        });

        reachedLeafNode = highestGain <= minGainToBreak;
        if (reachedLeafNode)
        {
            parentNode.IsLeaf = true;

            // Label the leaf by strict majority; a tie is labeled as no match.
            int matchCount = allPairs.Count(n => n.IsMatch);
            int noMatchCount = allPairs.Count(n => !n.IsMatch);
            parentNode.IsMatch = matchCount > noMatchCount;

            // Reported only for actual leaves; the message describes a leaf label.
            Console.WriteLine("\tGain limit met. Anything reaching this leaf will be labeled as " + (parentNode.IsMatch ? "match." : "no match"));
        }
    }
    else
    {
        // Precomputed node: reuse the stored question and leaf flag.
        bestQuestion = parentNode.Question;
        reachedLeafNode = parentNode.IsLeaf;
        Console.WriteLine("\tPrecomputed. Anything reaching this leaf will be labeled as " + (parentNode.IsMatch ? "match." : "no match"));
    }

    if (reachedLeafNode)
    {
        return;
    }

    Console.WriteLine($"\tBest question at this level is {bestQuestion}");

    List<RecordPair> bestLeftBucket = new List<RecordPair>();
    List<RecordPair> bestRightBucket = new List<RecordPair>();

    // Only counts were kept during the search, so re-run the chosen question to build the
    // two partitions. Each list is guarded by its own lock because List<T>.Add is not
    // safe for concurrent writers.
    Parallel.ForEach(allPairs, pair =>
    {
        bool goLeft = ComputeSplitDirection(bestQuestion, pair);
        if (goLeft)
        {
            lock (bestLeftBucket)
            {
                bestLeftBucket.Add(pair);
            }
        }
        else
        {
            lock (bestRightBucket)
            {
                bestRightBucket.Add(pair);
            }
        }
    });

    parentNode.Question = bestQuestion;
    if (parentNode.LeftBranch == null)
    {
        parentNode.LeftBranch = new DecisionTreeNode();
    }
    if (parentNode.RightBranch == null)
    {
        parentNode.RightBranch = new DecisionTreeNode();
    }

    splittingQuestionsThatGotUsHere.Push(new Tuple<SplittingQuestion, bool>(bestQuestion, true));
    RecurseAndPartition(parentNode.LeftBranch, splittingQuestions, bestLeftBucket.ToArray(), level + 1, subsamplingPercentage, minGainToBreak, splittingQuestionsThatGotUsHere);
    splittingQuestionsThatGotUsHere.Pop();

    splittingQuestionsThatGotUsHere.Push(new Tuple<SplittingQuestion, bool>(bestQuestion, false));
    RecurseAndPartition(parentNode.RightBranch, splittingQuestions, bestRightBucket.ToArray(), level + 1, subsamplingPercentage, minGainToBreak, splittingQuestionsThatGotUsHere);
    splittingQuestionsThatGotUsHere.Pop();
}