コード例 #1
0
        /// <summary>
        /// Builds the list of known-matching record pairs described by an answer key file.
        /// </summary>
        /// <param name="answerKeyPath">Path of a comma-separated text file. Only lines that split into exactly three fields are used.</param>
        /// <param name="finalDataSet">Lookup that resolves the first two fields of each line (parsed as integers) to records.</param>
        /// <returns>
        /// One pair per usable line, each flagged <c>IsMatch = true</c>. Lines are processed in
        /// parallel, so the order of the result is not guaranteed to follow the file.
        /// </returns>
        public static List<RecordPair> LoadAllPositivesFromAnswerKey(string answerKeyPath, Dictionary<int, Record> finalDataSet)
        {
            List<RecordPair> positives = new List<RecordPair>();

            Parallel.ForEach(File.ReadLines(answerKeyPath), row =>
            {
                string[] fields = row.Split(',');
                if (fields.Length != 3)
                {
                    // Anything that is not a three-column row is ignored.
                    return;
                }

                // Resolve the two ids in file order: first column, then second column.
                Record first  = finalDataSet[int.Parse(fields[0])];
                Record second = finalDataSet[int.Parse(fields[1])];

                RecordPair positive = new RecordPair
                {
                    IsMatch = true,
                    Record1 = first,
                    Record2 = second,
                };

                // List<T> is not safe for concurrent writers.
                lock (positives)
                {
                    positives.Add(positive);
                }
            });

            return positives;
        }
コード例 #2
0
        /// <summary>
        /// Computes the binary Shannon entropy (in bits) of the match / no-match labels in
        /// <paramref name="pairs"/>.
        /// </summary>
        /// <param name="pairs">Labelled pairs; only <c>IsMatch</c> is read.</param>
        /// <returns>
        /// A value between 0 and 1. A set containing only one class (or no pairs at all) has
        /// entropy 0.
        /// </returns>
        private static double ComputeShannonEntropy(RecordPair[] pairs)
        {
            if (pairs.Length == 0)
            {
                // Avoids 0/0 (NaN) below; an empty set carries no uncertainty.
                return 0.0;
            }

            int matchCount = 0;
            foreach (RecordPair pair in pairs)
            {
                if (pair.IsMatch)
                {
                    matchCount++;
                }
            }

            double total        = pairs.Length;
            double matchRatio   = matchCount / total;
            double noMatchRatio = (pairs.Length - matchCount) / total;

            double entropy = 0.0;

            // By convention 0 * log(0) contributes 0. Evaluating it directly gives
            // 0 * -Infinity = NaN, so empty classes are skipped.
            if (matchRatio > 0.0)
            {
                entropy -= matchRatio * System.Math.Log(matchRatio, 2);
            }

            if (noMatchRatio > 0.0)
            {
                entropy -= noMatchRatio * System.Math.Log(noMatchRatio, 2);
            }

            return entropy;
        }
コード例 #3
0
        /// <summary>
        /// Decides whether a pair should be kept as training data. The filter is currently
        /// switched off: every pair passes and both arguments are ignored.
        /// </summary>
        /// <param name="pair">Candidate pair (not inspected at present).</param>
        /// <param name="percentageOfBigBucketToAllow">Sampling rate used by the disabled heuristic (not read at present).</param>
        /// <returns>Always <c>true</c>.</returns>
        public static bool PassesBigBucketFilter(RecordPair pair, double percentageOfBigBucketToAllow = .5)
        {
            // Disabled heuristic, summarized for reference:
            //   A pair fell into the "big bucket" when all of the following held:
            //     - the edit distance between the two DOB values was greater than 1,
            //     - the absolute MRN difference was greater than 100,
            //     - the edit distance between the two last names was greater than 1.
            //   Big-bucket pairs were then kept only when
            //   new Random().NextDouble() <= percentageOfBigBucketToAllow;
            //   every other pair was always kept.
            return true;
        }
コード例 #4
0
        /// <summary>
        /// Walks a decision tree from <paramref name="parentNode"/> down to a leaf and returns
        /// that leaf's verdict for <paramref name="pair"/>.
        /// </summary>
        /// <param name="parentNode">Node to start from.</param>
        /// <param name="pair">Pair being classified.</param>
        /// <param name="logger">
        /// Optional. When not null, every question and its answer is written to the console and
        /// appended to <c>SplittingQuestionsToTheBottom</c>, and the leaf verdict is stored in
        /// <c>FinalResultIsMatch</c>.
        /// </param>
        /// <returns>The <c>IsMatch</c> value of the leaf that was reached.</returns>
        private static bool RecurseAndCheckIsMatch(DecisionTreeNode parentNode, RecordPair pair, TreeLogger logger)
        {
            DecisionTreeNode current = parentNode;

            // Descend one level per pass until a leaf is reached.
            while (!current.IsLeaf)
            {
                bool goesLeft = ComputeSplitDirection(current.Question, pair);

                if (logger != null)
                {
                    Console.WriteLine($"Question: {current.Question} Answer: {goesLeft}.");
                    logger.SplittingQuestionsToTheBottom.Add(new Tuple<SplittingQuestion, bool>(current.Question, goesLeft));
                }

                current = goesLeft ? current.LeftBranch : current.RightBranch;
            }

            if (logger != null)
            {
                logger.FinalResultIsMatch = current.IsMatch;
                Console.WriteLine($"Is Match {current.IsMatch}");
            }

            return current.IsMatch;
        }
コード例 #5
0
        /// <summary>
        /// Lazily enumerates non-matching training pairs built from the records of an MRN file.
        /// </summary>
        /// <remarks>
        /// The records are walked two at a time. For every pair of distinct even indexes
        /// <c>c</c> and <c>d</c>, two pairs are produced: <c>(records[c], records[d])</c> and
        /// <c>(records[c + 1], records[d])</c>.
        /// NOTE(review): this presumably relies on records c and c + 1 belonging together and on
        /// the record count being even; with an odd count, <c>records[c + 1]</c> throws for the
        /// last index — confirm against the file format.
        /// </remarks>
        /// <param name="mrnRecordPath">Path handed to <c>GetCleanedRecordsFromMRNFile</c>.</param>
        /// <returns>A deferred sequence of pairs flagged <c>IsMatch = false</c>.</returns>
        public static IEnumerable<RecordPair> GetAllNegativeRecordPairsForMRNData(string mrnRecordPath)
        {
            List<Record> mrnRecords = GetCleanedRecordsFromMRNFile(mrnRecordPath);

            for (int c = 0; c < mrnRecords.Count; c += 2)
            {
                for (int d = 0; d < mrnRecords.Count; d += 2)
                {
                    if (c == d)
                    {
                        // Never pair a group with itself.
                        continue;
                    }

                    yield return new RecordPair
                    {
                        IsMatch = false,
                        Record1 = mrnRecords[c],
                        Record2 = mrnRecords[d]
                    };

                    yield return new RecordPair
                    {
                        IsMatch = false,
                        Record1 = mrnRecords[c + 1],
                        Record2 = mrnRecords[d]
                    };
                }
            }
        }
コード例 #6
0
ファイル: DataCleaner.cs プロジェクト: kwende/MitchMatch
        /// <summary>
        /// Replaces both records of <paramref name="pair"/> with their cleaned versions, in place.
        /// </summary>
        /// <param name="pair">Pair to clean; it is modified and returned.</param>
        /// <param name="streetSuffixesFile">Accepted but never read by this method.</param>
        /// <returns>The same <paramref name="pair"/> instance that was passed in.</returns>
        public static RecordPair CleanRecordPair(RecordPair pair, string streetSuffixesFile)
        {
            var cleanedFirst = CleanRecord(pair.Record1);
            pair.Record1 = cleanedFirst;

            var cleanedSecond = CleanRecord(pair.Record2);
            pair.Record2 = cleanedSecond;

            return pair;
        }
コード例 #7
0
        /// <summary>
        /// Compares every record in <paramref name="records"/> against the rows of the final
        /// data set and collects the rows the decision forest classifies as matches.
        /// </summary>
        /// <param name="records">
        /// Records to find matches for. Each one becomes a dictionary key, so a record that
        /// compares equal to an earlier one makes <c>Dictionary.Add</c> throw.
        /// </param>
        /// <param name="finalDataSetPath">CSV file to scan. Defaults to the previously hard-coded location.</param>
        /// <param name="forestDirectory">Directory the forest is loaded from. Defaults to the previously hard-coded location.</param>
        /// <returns>
        /// A dictionary with one entry per input record. Rows are scanned in parallel, so the
        /// order within each match list is not deterministic.
        /// </returns>
        public static Dictionary<Record, List<Record>> GetMatches(
            List<Record> records,
            string finalDataSetPath = "c:/users/brush/desktop/finaldataset.csv",
            string forestDirectory = "C:/users/brush/desktop/forest")
        {
            // Rows whose enterprise id is at or below this value are skipped.
            // NOTE(review): the meaning of this cut-off is not visible here — confirm.
            const int enterpriseIdCutoff = 15374761;

            Dictionary<Record, List<Record>> ret = new Dictionary<Record, List<Record>>();

            string[]       finalDataSetList = File.ReadAllLines(finalDataSetPath);
            DecisionTree[] forest           = DataLoader.LoadForestFromDirectory(forestDirectory);

            foreach (Record record in records)
            {
                ret.Add(record, new List<Record>());
            }

            Console.WriteLine("Searching for matches...");

            // Index 0 is skipped (presumably a header row — confirm).
            Parallel.For(1, finalDataSetList.Length, c =>
            {
                if (c % 10000 == 0)
                {
                    Console.WriteLine($"{(c / (finalDataSetList.Length * 1.0) * 100)}%");
                }

                string[] bits = finalDataSetList[c].Split(',');

                if (bits[0] == "")
                {
                    return;
                }

                int enterpriseId = int.Parse(bits[0]);
                if (enterpriseId <= enterpriseIdCutoff)
                {
                    return;
                }

                Record comparisonRecord = DataCleaner.CleanRecord(Record.FromFinalDatasetString(bits));

                foreach (Record toMatch in records)
                {
                    if (toMatch.Equals(comparisonRecord))
                    {
                        // A record is never reported as a match for itself.
                        continue;
                    }

                    RecordPair pair = new RecordPair
                    {
                        Record1 = toMatch,
                        Record2 = comparisonRecord,
                    };

                    if (DecisionTreeBuilder.IsMatch(pair, forest, null))
                    {
                        // The per-record lists are shared between worker threads.
                        lock (ret)
                        {
                            ret[toMatch].Add(comparisonRecord);
                        }
                    }
                }
            });

            return ret;
        }
コード例 #8
0
ファイル: MatchTypeMatcher.cs プロジェクト: kwende/MitchMatch
        /// <summary>
        /// Answers an MRN-distance question: are the two MRNs within the question's maximum distance?
        /// </summary>
        /// <param name="question">Supplies <c>MRNMaxDistance</c>.</param>
        /// <param name="pair">Pair whose two MRN values are compared.</param>
        /// <returns>
        /// <c>true</c> only when both MRNs are greater than zero and their absolute difference
        /// does not exceed the maximum distance; otherwise <c>false</c>.
        /// </returns>
        public static bool BasedOnMRNDistance(SplittingQuestion question, RecordPair pair)
        {
            int limit = question.MRNMaxDistance;

            // Non-positive MRNs never count as "close".
            bool bothPresent = pair.Record1.MRN > 0 && pair.Record2.MRN > 0;

            return bothPresent && System.Math.Abs(pair.Record1.MRN - pair.Record2.MRN) <= limit;
        }
コード例 #9
0
        /// <summary>
        /// Interactive console helper. Reads two record lines, classifies them with the forest
        /// loaded from the current directory while logging the decision path, and optionally
        /// appends every training pair that follows the same decision path to a text file.
        /// </summary>
        public static void Test()
        {
            Console.WriteLine("Line 1:");
            string line1 = Console.ReadLine();

            Console.WriteLine("Line 2:");
            string line2 = Console.ReadLine();

            RecordPair pair = new RecordPair();

            pair.Record1 = DataCleaner.CleanRecord(Record.FromString(line1));
            pair.Record2 = DataCleaner.CleanRecord(Record.FromString(line2));

            DecisionTree[] forest = DataLoader.LoadForestFromDirectory(".");

            // The verdict itself is not needed here; the call fills the logger
            // (and prints the decision path) as a side effect.
            TreeLogger logger = new TreeLogger();
            DecisionTreeBuilder.IsMatch(pair, forest, logger);

            Console.WriteLine("Search for those in training data who make it there?");
            string response = Console.ReadLine();

            // Console.ReadLine returns null at end of input; treat that as "no"
            // instead of failing with a NullReferenceException.
            if (!string.Equals(response, "y", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            using (StreamWriter sw = File.AppendText("c:/users/brush/desktop/gothere.txt"))
            {
                List<RecordPair> pairs = new List<RecordPair>();
                Console.Write("Loading training data for this iteration...");
                pairs.AddRange(DataLoader.GetPositivesFromMRNData("mrns.csv"));
                pairs.AddRange(DataLoader.GetHandPassedSets("more.csv"));
                pairs.AddRange(DataLoader.GetRejectedRecordPairs("rejected.txt"));
                Console.WriteLine("...done");

                Parallel.ForEach(pairs, p =>
                {
                    if (DecisionTreeBuilder.ReplayDecision(p, logger.SplittingQuestionsToTheBottom))
                    {
                        // The writer is shared by all worker threads.
                        lock (sw)
                        {
                            sw.WriteLine(p);
                        }
                    }
                });
            }
        }
コード例 #10
0
        /// <summary>
        /// Lazily derives non-matching pairs from a list of known matches by crossing the records
        /// of two different positive pairs.
        /// </summary>
        /// <remarks>
        /// Every unordered combination of two positive pairs is considered once. A combination is
        /// skipped when any record of one pair has the same <c>EnterpriseId</c> as any record of
        /// the other. Otherwise four negatives are produced, in this order:
        /// (A.Record1, B.Record1), (A.Record2, B.Record2), (A.Record1, B.Record2), (A.Record2, B.Record1).
        /// </remarks>
        /// <param name="positives">Known matching pairs.</param>
        /// <returns>A deferred sequence of pairs flagged <c>IsMatch = false</c>.</returns>
        public static IEnumerable<RecordPair> LoadNegativesFromAnswerKey(List<RecordPair> positives)
        {
            for (int a = 0; a < positives.Count; a++)
            {
                RecordPair first = positives[a];

                for (int b = a + 1; b < positives.Count; b++)
                {
                    RecordPair second = positives[b];

                    bool sharesAnId =
                        first.Record1.EnterpriseId == second.Record1.EnterpriseId ||
                        first.Record1.EnterpriseId == second.Record2.EnterpriseId ||
                        first.Record2.EnterpriseId == second.Record1.EnterpriseId ||
                        first.Record2.EnterpriseId == second.Record2.EnterpriseId;

                    if (sharesAnId)
                    {
                        continue;
                    }

                    yield return new RecordPair
                    {
                        IsMatch = false,
                        Record1 = first.Record1,
                        Record2 = second.Record1
                    };

                    yield return new RecordPair
                    {
                        IsMatch = false,
                        Record1 = first.Record2,
                        Record2 = second.Record2,
                    };

                    yield return new RecordPair
                    {
                        IsMatch = false,
                        Record1 = first.Record1,
                        Record2 = second.Record2
                    };

                    yield return new RecordPair
                    {
                        IsMatch = false,
                        Record1 = first.Record2,
                        Record2 = second.Record1
                    };
                }
            }
        }
コード例 #11
0
        /// <summary>
        /// Classifies <paramref name="pair"/> by majority vote over all trees in the forest.
        /// </summary>
        /// <param name="pair">Pair to classify.</param>
        /// <param name="forest">Trees that each cast one vote.</param>
        /// <param name="logger">Optional logger forwarded to every tree walk; may be null.</param>
        /// <returns>
        /// <c>true</c> when strictly more than half of the trees vote "match". An empty forest
        /// yields <c>false</c>.
        /// </returns>
        public static bool IsMatch(RecordPair pair, DecisionTree[] forest, TreeLogger logger)
        {
            int positives = 0;

            foreach (DecisionTree tree in forest)
            {
                if (RecurseAndCheckIsMatch(tree.Root, pair, logger))
                {
                    positives++;
                }
            }

            // The previous expression, (positives / forest.Length * 1.0) > .5, divided two
            // ints first. That truncated the ratio to 0 unless every tree voted "match".
            // Comparing in integers keeps the intended "more than half" rule exact.
            return positives * 2 > forest.Length;
        }
コード例 #12
0
        /// <summary>
        /// Shows the left and right records of <paramref name="rp"/> in their browser controls.
        /// </summary>
        /// <remarks>
        /// Each side is handled independently: a failure on the left side is reported in a message
        /// box and does not prevent the right side from being attempted. A side whose record id is
        /// null navigates to <c>about:blank</c>.
        /// </remarks>
        /// <param name="rp">Selected pair; when null the user is asked to select a record.</param>
        private void ViewRecords(RecordPair rp)
        {
            if (rp == null)
            {
                MessageBox.Show("You need to select a record");
                return;
            }

            var entityLogicalName = rp.Entity.LogicalName;

            // Left side.
            try
            {
                if (rp.LRecord.Id == null)
                {
                    wbLeftRecord.Navigate(new Uri("about:blank"));
                }
                else
                {
                    string leftOrgUrl   = GetUrl(leftProxy.ServiceConfiguration.CurrentServiceEndpoint.Address.ToString());
                    string leftRecordId = rp.LRecord.Id.ToString();
                    wbLeftRecord.Navigate(new Uri($"{leftOrgUrl}main.aspx?etn={entityLogicalName}&pagetype=entityrecord&id=%7B{leftRecordId}%7D"));
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show($"[MainForm.BtnViewRecords]) Coudln't view left record, Message: {ex.Message}");
            }

            // Right side.
            try
            {
                if (rp.RRecord.Id == null)
                {
                    wbRightRecord.Navigate(new Uri("about:blank"));
                }
                else
                {
                    string rightOrgUrl   = GetUrl(rightProxy.ServiceConfiguration.CurrentServiceEndpoint.Address.ToString());
                    string rightRecordId = rp.RRecord.Id.ToString();
                    wbRightRecord.Navigate(new Uri($"{rightOrgUrl}main.aspx?etn={entityLogicalName}&pagetype=entityrecord&id=%7B{rightRecordId}%7D"));
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show($"[MainForm.BtnViewRecords]) Coudln't view right record, Message: {ex.Message}");
            }
        }
コード例 #13
0
        /// <summary>
        /// Evaluates one splitting question against a pair by dispatching on the question's match type.
        /// </summary>
        /// <param name="question">Question to evaluate; its <c>MatchType</c> selects the matcher and its <c>Field</c> selects the cached column.</param>
        /// <param name="pair">Pair the question is asked about.</param>
        /// <returns>
        /// The matcher's answer. A <c>SoftMatch</c> question on a field other than
        /// <c>Address1</c> or <c>DOB</c> answers <c>false</c>.
        /// </returns>
        /// <exception cref="ArgumentException">The match type is not one of the handled values.</exception>
        public static bool ComputeSplitDirection(SplittingQuestion question, RecordPair pair)
        {
            // Both cached column values are fetched up front, even for match types
            // that only look at the pair itself.
            string left  = pair.Record1.Cache[(int)question.Field];
            string right = pair.Record2.Cache[(int)question.Field];

            switch (question.MatchType)
            {
                case MatchTypeEnum.MRNDistance:
                    return MatchTypeMatcher.BasedOnMRNDistance(question, pair);

                case MatchTypeEnum.LivesInMassResidence:
                    return MatchTypeMatcher.BasedOnLivesInMassResidence(question, pair);

                case MatchTypeEnum.IsHomeless:
                    return MatchTypeMatcher.BasedOnIsHomeless(question, pair);

                case MatchTypeEnum.EditDistance:
                    return MatchTypeMatcher.BasedOnEditDistance(question, left, right);

                case MatchTypeEnum.EmptyMatch:
                    return MatchTypeMatcher.BasedOnEmptyFields(question, left, right);

                case MatchTypeEnum.SoftMatch:
                    if (question.Field == FieldEnum.Address1)
                    {
                        return MatchTypeMatcher.BasedOnAddressSoftMatch(question, left, right);
                    }

                    if (question.Field == FieldEnum.DOB)
                    {
                        return MatchTypeMatcher.BasedOnDateSoftMatch(question, left, right);
                    }

                    // No soft matcher exists for any other field.
                    return false;

                case MatchTypeEnum.IsFemale:
                    return MatchTypeMatcher.BasedOnIsFemale(question, pair);

                default:
                    throw new ArgumentException();
            }
        }
コード例 #14
0
        /// <summary>
        /// Checks whether <paramref name="pair"/> answers a recorded sequence of questions the same
        /// way as the recording.
        /// </summary>
        /// <param name="pair">Pair to test.</param>
        /// <param name="rules">Recorded (question, answer) steps, evaluated in order.</param>
        /// <returns>
        /// <c>true</c> when every question yields the recorded answer (including when there are no
        /// rules); <c>false</c> at the first disagreement, after which no further rule is evaluated.
        /// </returns>
        public static bool ReplayDecision(RecordPair pair, List<Tuple<SplittingQuestion, bool>> rules)
        {
            foreach (Tuple<SplittingQuestion, bool> step in rules)
            {
                bool answer = ComputeSplitDirection(step.Item1, pair);

                if (answer != step.Item2)
                {
                    return false;
                }
            }

            return true;
        }
コード例 #15
0
        /// <summary>
        /// Loads labelled training pairs from a text file made of four-line entries.
        /// </summary>
        /// <remarks>
        /// Within each entry, the first line is parsed with <c>bool.Parse</c> as the match label,
        /// the second and third lines are parsed as records and cleaned, and the fourth line is
        /// skipped without being read.
        /// </remarks>
        /// <param name="noHomoFilePath">Path of the file to read.</param>
        /// <returns>One pair per four-line entry, in file order.</returns>
        public static List<RecordPair> LoadTrainingDataFromNoHomoFile(string noHomoFilePath)
        {
            string[] lines = File.ReadAllLines(noHomoFilePath);
            List<RecordPair> loaded = new List<RecordPair>();

            for (int entryStart = 0; entryStart < lines.Length; entryStart += 4)
            {
                loaded.Add(new RecordPair
                {
                    IsMatch = bool.Parse(lines[entryStart]),
                    Record1 = DataCleaner.CleanRecord(Record.FromString(lines[entryStart + 1])),
                    Record2 = DataCleaner.CleanRecord(Record.FromString(lines[entryStart + 2])),
                });
            }

            return loaded;
        }
コード例 #16
0
        /// <summary>
        /// Interactive console helper. Reads one record line, then prints every row of the final
        /// data set that the decision forest classifies as a match for it.
        /// </summary>
        public static void List()
        {
            Console.WriteLine("Line to match:");
            string line1   = Console.ReadLine();
            Record toMatch = Record.FromString(line1);

            string[] finalDataSetList = File.ReadAllLines("c:/users/brush/desktop/finaldataset.csv");

            // Loaded once. It used to be reloaded from disk for every candidate row,
            // although the directory never changes during the scan.
            DecisionTree[] forest = DataLoader.LoadForestFromDirectory("C:/users/brush/desktop/forest");

            Console.WriteLine("Searching for matches...");

            // Index 0 is skipped (presumably a header row — confirm).
            for (int c = 1; c < finalDataSetList.Length; c++)
            {
                string[] bits = finalDataSetList[c].Split(',');

                if (bits[0] == "")
                {
                    continue;
                }

                int enterpriseId = int.Parse(bits[0]);

                // NOTE(review): the meaning of this id cut-off is not visible here — confirm.
                if (enterpriseId <= 15374761)
                {
                    continue;
                }

                Record comparisonRecord = Record.FromFinalDatasetString(bits);

                RecordPair pair = new RecordPair
                {
                    Record1 = toMatch,
                    Record2 = comparisonRecord,
                };

                if (DecisionTreeBuilder.IsMatch(pair, forest, null))
                {
                    Console.WriteLine(comparisonRecord);
                    Console.WriteLine();
                }
            }
        }
コード例 #17
0
        /// <summary>
        /// Loads non-matching training pairs from a text file made of three-line entries.
        /// </summary>
        /// <remarks>
        /// The first two lines of each entry are parsed as records and cleaned; the third line is
        /// skipped without being read. Pairs are kept only when they pass
        /// <c>PassesBigBucketFilter</c>.
        /// </remarks>
        /// <param name="rejectFile">Path of the file to read.</param>
        /// <returns>The accepted pairs, each flagged <c>IsMatch = false</c>, in file order.</returns>
        public static List<RecordPair> GetRejectedRecordPairs(string rejectFile)
        {
            string[] rejectedLines = File.ReadAllLines(rejectFile);
            List<RecordPair> rejectedPairs = new List<RecordPair>();

            for (int entryStart = 0; entryStart < rejectedLines.Length; entryStart += 3)
            {
                // Both lines are fetched before any parsing takes place.
                string firstLine  = rejectedLines[entryStart];
                string secondLine = rejectedLines[entryStart + 1];

                RecordPair rejected = new RecordPair
                {
                    Record1 = DataCleaner.CleanRecord(Record.FromString(firstLine)),
                    Record2 = DataCleaner.CleanRecord(Record.FromString(secondLine)),
                    IsMatch = false,
                };

                if (!PassesBigBucketFilter(rejected))
                {
                    continue;
                }

                rejectedPairs.Add(rejected);
            }

            return rejectedPairs;
        }
コード例 #18
0
        /// <summary>
        /// Builds non-matching pairs from a file of comma-separated enterprise ids.
        /// </summary>
        /// <remarks>
        /// Empty lines are skipped. Every field of a non-empty line is parsed as an integer; the
        /// first two ids are looked up in <paramref name="finalDataSet"/>. Lines are processed in
        /// parallel, so the order of the result is not guaranteed to follow the file.
        /// A duplicate-removal pass that used to follow the loading step is disabled and the
        /// result is returned without de-duplication.
        /// </remarks>
        /// <param name="misfitsFilePath">Path of the file to read.</param>
        /// <param name="finalDataSet">Lookup from enterprise id to record.</param>
        /// <returns>One pair per non-empty line, each flagged <c>IsMatch = false</c>.</returns>
        public static List<RecordPair> GetPairsFromMisfitsFile(string misfitsFilePath, Dictionary<int, Record> finalDataSet)
        {
            string[] lines = File.ReadAllLines(misfitsFilePath);
            List<RecordPair> misfits = new List<RecordPair>();

            Parallel.For(0, lines.Length, index =>
            {
                string line = lines[index];
                if (string.IsNullOrEmpty(line))
                {
                    return;
                }

                int[] enterpriseIds = line.Split(',').Select(n => int.Parse(n)).ToArray();

                RecordPair misfit = new RecordPair
                {
                    IsMatch = false,
                    Record1 = finalDataSet[enterpriseIds[0]],
                    Record2 = finalDataSet[enterpriseIds[1]],
                };

                // List<T> is not safe for concurrent writers.
                lock (misfits)
                {
                    misfits.Add(misfit);
                }
            });

            return misfits;
        }
コード例 #19
0
ファイル: MatchTypeMatcher.cs プロジェクト: kwende/MitchMatch
 /// <summary>
 /// Answers whether both records of the pair carry the gender value "F".
 /// </summary>
 /// <param name="question">Not read by this matcher.</param>
 /// <param name="pair">Pair whose two gender values are compared.</param>
 /// <returns><c>true</c> when the two gender values are equal and that value is "F".</returns>
 public static bool BasedOnIsFemale(SplittingQuestion question, RecordPair pair)
 {
     var firstGender  = pair.Record1.Gender;
     var secondGender = pair.Record2.Gender;

     if (firstGender != secondGender)
     {
         return false;
     }

     return firstGender == "F";
 }
コード例 #20
0
        /// <summary>
        /// Builds training data from an MRN file: one positive pair per two consecutive records,
        /// plus randomly chosen negatives against every other two-record group.
        /// </summary>
        /// <remarks>
        /// For each group starting at even index <c>c</c>, the records at <c>c</c> and
        /// <c>c + 1</c> form the positive pair. For every other even index <c>d</c>, one of those
        /// two records is picked at random and paired with <c>records[d]</c> as a negative, which
        /// is kept only when it passes <c>PassesBigBucketFilter</c>. Groups are processed in
        /// parallel, so the order of the result is not deterministic.
        /// </remarks>
        /// <param name="inputFilePath">Path handed to <c>GetCleanedRecordsFromMRNFile</c>.</param>
        /// <returns>The combined positive and negative pairs.</returns>
        public static List<RecordPair> GetRandomPairsForMRNData(string inputFilePath)
        {
            List<RecordPair> trainingData = new List<RecordPair>();

            List<Record> mrnRecords = GetCleanedRecordsFromMRNFile(inputFilePath);

            Parallel.For(0, mrnRecords.Count / 2, groupIndex =>
            {
                int c = groupIndex * 2;

                // Collected locally so the shared list is locked once per group
                // instead of once per pair.
                List<RecordPair> groupPairs = new List<RecordPair>();

                groupPairs.Add(new RecordPair
                {
                    IsMatch = true,
                    Record1 = mrnRecords[c],
                    Record2 = mrnRecords[c + 1],
                });

                // new Random() is seeded from the clock on .NET Framework, so generators
                // created close together on parallel workers can emit identical sequences.
                // A Guid-derived seed keeps each group's choices independent.
                Random rand = new Random(Guid.NewGuid().GetHashCode());

                for (int d = 0; d < mrnRecords.Count; d += 2)
                {
                    if (c == d)
                    {
                        continue;
                    }

                    // Coin flip: take the first or the second record of this group.
                    int chosen = rand.Next() % 2 == 0 ? c : c + 1;

                    RecordPair pair = new RecordPair
                    {
                        IsMatch = false,
                        Record1 = mrnRecords[chosen],
                        Record2 = mrnRecords[d]
                    };

                    if (PassesBigBucketFilter(pair))
                    {
                        groupPairs.Add(pair);
                    }
                }

                lock (trainingData)
                {
                    trainingData.AddRange(groupPairs);
                }
            });

            return trainingData;
        }
コード例 #21
0
ファイル: MatchTypeMatcher.cs プロジェクト: kwende/MitchMatch
 /// <summary>
 /// Answers whether the first record's address marks it as homeless.
 /// </summary>
 /// <remarks>
 /// Only <c>Record1</c> is inspected: its <c>Address1</c> and then its <c>Address2</c> are
 /// compared with the literal "HOMELESS".
 /// NOTE(review): <c>Record2</c> is never examined here — confirm whether
 /// <c>Record2.Address1</c> was intended instead of <c>Record1.Address2</c>.
 /// </remarks>
 /// <param name="question">Not read by this matcher.</param>
 /// <param name="pair">Pair whose first record is checked.</param>
 /// <returns><c>true</c> when either address line of <c>Record1</c> equals "HOMELESS".</returns>
 public static bool BasedOnIsHomeless(SplittingQuestion question, RecordPair pair)
 {
     if (pair.Record1.Address1 == "HOMELESS")
     {
         return true;
     }

     return pair.Record1.Address2 == "HOMELESS";
 }
コード例 #22
0
ファイル: MatchTypeMatcher.cs プロジェクト: kwende/MitchMatch
 /// <summary>
 /// Answers whether at least one record of the pair is flagged as living in a large residence.
 /// </summary>
 /// <param name="question">Not read by this matcher.</param>
 /// <param name="pair">Pair whose <c>LivesInLargeResidence</c> flags are checked.</param>
 /// <returns><c>true</c> when either record has the flag set.</returns>
 public static bool BasedOnLivesInMassResidence(SplittingQuestion question, RecordPair pair)
 {
     if (pair.Record1.LivesInLargeResidence)
     {
         return true;
     }

     return pair.Record2.LivesInLargeResidence;
 }
コード例 #23
0
        /// <summary>
        /// Builds training pairs from a file of hand-curated record groups separated by empty lines.
        /// </summary>
        /// <remarks>
        /// Records inside the same group are treated as matches; records from different groups
        /// are treated as non-matches.
        /// <list type="bullet">
        /// <item>Positives: every record is paired with itself and with each later record of its group.</item>
        /// <item>Negatives: every record of a group is paired with every record of every other
        /// group, so each cross-group combination appears once in each direction.</item>
        /// </list>
        /// Every candidate is kept only when it passes <c>PassesBigBucketFilter</c>.
        /// </remarks>
        /// <param name="inputMoreFilePath">Path of the group file.</param>
        /// <returns>All accepted positive and negative pairs, group by group.</returns>
        public static List<RecordPair> GetHandPassedSets(string inputMoreFilePath)
        {
            string[] extraLines = File.ReadAllLines(inputMoreFilePath);

            // Split the file into groups. An empty line closes the group being read;
            // an empty line with no group open yields an empty group.
            List<Record[]> groups  = new List<Record[]>();
            List<Record>   current = null;

            foreach (string line in extraLines)
            {
                if (current == null)
                {
                    current = new List<Record>();
                }

                if (line == "")
                {
                    groups.Add(current.ToArray());
                    current = null;
                }
                else
                {
                    current.Add(DataCleaner.CleanRecord(Record.FromString(line)));
                }
            }

            if (current != null)
            {
                // The file ended without a closing empty line.
                groups.Add(current.ToArray());
            }

            List<RecordPair> trainingData = new List<RecordPair>();

            for (int g = 0; g < groups.Count; g++)
            {
                Record[] members = groups[g];

                // Positives: combinations within the group, self-pairs included.
                for (int i = 0; i < members.Length; i++)
                {
                    for (int j = i; j < members.Length; j++)
                    {
                        RecordPair positive = new RecordPair
                        {
                            IsMatch = true,
                            Record1 = members[i],
                            Record2 = members[j]
                        };

                        if (PassesBigBucketFilter(positive))
                        {
                            trainingData.Add(positive);
                        }
                    }
                }

                // Negatives: every member against every record of every other group.
                for (int other = 0; other < groups.Count; other++)
                {
                    if (other == g)
                    {
                        continue;
                    }

                    foreach (Record member in members)
                    {
                        foreach (Record outsider in groups[other])
                        {
                            RecordPair negative = new RecordPair
                            {
                                IsMatch = false,
                                Record1 = member,
                                Record2 = outsider
                            };

                            if (PassesBigBucketFilter(negative))
                            {
                                trainingData.Add(negative);
                            }
                        }
                    }
                }
            }

            return trainingData;
        }