/// <summary>
/// Reads an answer-key file and builds one positive (IsMatch = true) pair per usable line.
/// </summary>
/// <param name="answerKeyPath">Path of the comma-separated answer-key file.</param>
/// <param name="finalDataSet">Lookup used to resolve the first two fields of each line into records.</param>
/// <returns>
/// The pairs built from every line that has exactly three comma-separated fields.
/// Lines are processed in parallel, so the order of the returned list is not guaranteed.
/// </returns>
public static List<RecordPair> LoadAllPositivesFromAnswerKey(string answerKeyPath, Dictionary<int, Record> finalDataSet)
{
    List<RecordPair> positives = new List<RecordPair>();

    Parallel.ForEach(File.ReadLines(answerKeyPath), answerLine =>
    {
        string[] fields = answerLine.Split(',');
        if (fields.Length != 3)
        {
            // Lines with any other field count are skipped without notice.
            return;
        }

        // Only the first two fields are used; the third is never read.
        Record first = finalDataSet[int.Parse(fields[0])];
        Record second = finalDataSet[int.Parse(fields[1])];

        RecordPair positive = new RecordPair
        {
            IsMatch = true,
            Record1 = first,
            Record2 = second,
        };

        // List<T> is not safe for concurrent writers.
        lock (positives)
        {
            positives.Add(positive);
        }
    });

    return positives;
}
/// <summary>
/// Computes the base-2 Shannon entropy of the match / no-match labels in <paramref name="pairs"/>.
/// </summary>
/// <param name="pairs">Labelled pairs; only <c>IsMatch</c> is read.</param>
/// <returns>
/// 0 when <paramref name="pairs"/> is empty or all pairs share the same label;
/// otherwise the sum of -p * log2(p) over the two label proportions.
/// </returns>
private static double ComputeShannonEntropy(RecordPair[] pairs)
{
    if (pairs.Length == 0)
    {
        // No labels at all: avoid 0 / 0 and report zero entropy.
        return 0.0;
    }

    int numberInMatchColumn = 0;
    foreach (RecordPair pair in pairs)
    {
        if (pair.IsMatch)
        {
            numberInMatchColumn++;
        }
    }

    int numberInNoMatchColumn = pairs.Length - numberInMatchColumn;
    double totalLength = pairs.Length;

    double entropy = 0.0;
    foreach (int count in new[] { numberInMatchColumn, numberInNoMatchColumn })
    {
        if (count == 0)
        {
            // p * log2(p) tends to 0 as p -> 0, but evaluating it directly gives 0 * -Infinity = NaN.
            continue;
        }

        double ratio = count / totalLength;
        entropy -= ratio * System.Math.Log(ratio, 2);
    }

    return entropy;
}
/// <summary>
/// Decides whether a pair should be kept as training data. The filter is currently disabled:
/// every pair passes.
/// </summary>
/// <param name="pair">The candidate pair (not inspected at present).</param>
/// <param name="percentageOfBigBucketToAllow">Not used while the filter is disabled.</param>
/// <returns>Always <c>true</c>.</returns>
public static bool PassesBigBucketFilter(RecordPair pair, double percentageOfBigBucketToAllow = .5)
{
    // Disabled logic, kept here as a description only: a pair fell into the "big bucket" when
    // its DOB edit distance was greater than 1, its MRN difference was greater than 100 and its
    // last-name edit distance was greater than 1. Such pairs were then kept only when
    // rand.NextDouble() <= percentageOfBigBucketToAllow.
    return true;
}
/// <summary>
/// Walks a decision tree from <paramref name="parentNode"/> down to a leaf, answering each node's
/// question for <paramref name="pair"/>, and returns the leaf's verdict.
/// </summary>
/// <param name="parentNode">Node at which the descent starts.</param>
/// <param name="pair">The pair being classified.</param>
/// <param name="logger">
/// Optional. When not null, every question/answer is written to the console and appended to
/// <c>SplittingQuestionsToTheBottom</c>, and the final verdict is stored in <c>FinalResultIsMatch</c>.
/// </param>
/// <returns>The <c>IsMatch</c> value of the leaf that is reached.</returns>
private static bool RecurseAndCheckIsMatch(DecisionTreeNode parentNode, RecordPair pair, TreeLogger logger)
{
    DecisionTreeNode current = parentNode;
    bool logging = logger != null;

    // Descend until a leaf is reached: a "true" answer goes left, a "false" answer goes right.
    while (!current.IsLeaf)
    {
        bool goesLeft = ComputeSplitDirection(current.Question, pair);

        if (logging)
        {
            Console.WriteLine($"Question: {current.Question} Answer: {goesLeft}.");
            logger.SplittingQuestionsToTheBottom.Add(new Tuple<SplittingQuestion, bool>(current.Question, goesLeft));
        }

        if (goesLeft)
        {
            current = current.LeftBranch;
        }
        else
        {
            current = current.RightBranch;
        }
    }

    if (logging)
    {
        logger.FinalResultIsMatch = current.IsMatch;
        Console.WriteLine($"Is Match {current.IsMatch}");
    }

    return current.IsMatch;
}
/// <summary>
/// Lazily enumerates non-matching pairs built from the cleaned records of an MRN file.
/// </summary>
/// <remarks>
/// Records are consumed two at a time (indices c and c + 1). Both are paired with the record at
/// every other even index d. NOTE(review): presumably consecutive records form a known matching
/// pair, as GetRandomPairsForMRNData treats them — confirm against the file format.
/// </remarks>
/// <param name="mrnRecordPath">Path handed to <c>GetCleanedRecordsFromMRNFile</c>.</param>
/// <returns>A deferred sequence of pairs with <c>IsMatch = false</c>.</returns>
public static IEnumerable<RecordPair> GetAllNegativeRecordPairsForMRNData(string mrnRecordPath)
{
    List<Record> mrnRecords = GetCleanedRecordsFromMRNFile(mrnRecordPath);

    // "c + 1 < Count" keeps mrnRecords[c + 1] in range when the file holds an odd number of
    // records; a trailing unpaired record is then only ever used as Record2.
    for (int c = 0; c + 1 < mrnRecords.Count; c += 2)
    {
        for (int d = 0; d < mrnRecords.Count; d += 2)
        {
            if (c == d)
            {
                continue;
            }

            yield return new RecordPair
            {
                IsMatch = false,
                Record1 = mrnRecords[c],
                Record2 = mrnRecords[d]
            };

            yield return new RecordPair
            {
                IsMatch = false,
                Record1 = mrnRecords[c + 1],
                Record2 = mrnRecords[d]
            };
        }
    }
}
/// <summary>
/// Replaces both records of <paramref name="pair"/> with their cleaned versions, in place.
/// </summary>
/// <param name="pair">The pair to clean; it is mutated and also returned.</param>
/// <param name="streetSuffixesFile">Not used by this method.</param>
/// <returns>The same <paramref name="pair"/> instance that was passed in.</returns>
public static RecordPair CleanRecordPair(RecordPair pair, string streetSuffixesFile)
{
    var cleanedFirst = CleanRecord(pair.Record1);
    pair.Record1 = cleanedFirst;

    var cleanedSecond = CleanRecord(pair.Record2);
    pair.Record2 = cleanedSecond;

    return pair;
}
/// <summary>
/// Compares every record in <paramref name="records"/> against the rows of the final data set
/// and collects, per input record, the data-set records the forest classifies as a match.
/// </summary>
/// <param name="records">Records to find matches for. Each becomes a key of the result.</param>
/// <param name="finalDataSetPath">
/// CSV file holding the final data set. Defaults to the location that was previously hard-coded.
/// </param>
/// <param name="forestDirectory">
/// Directory passed to <c>DataLoader.LoadForestFromDirectory</c>. Defaults to the location that
/// was previously hard-coded.
/// </param>
/// <returns>
/// A dictionary with one entry per input record. The match lists are filled in parallel, so
/// their order is not guaranteed.
/// </returns>
public static Dictionary<Record, List<Record>> GetMatches(
    List<Record> records,
    string finalDataSetPath = "c:/users/brush/desktop/finaldataset.csv",
    string forestDirectory = "C:/users/brush/desktop/forest")
{
    // Only rows whose first column parses to an id strictly above this value are compared.
    // NOTE(review): the meaning of this cut-off is not visible here — confirm before changing it.
    const int enterpriseIdLowerBoundExclusive = 15374761;

    Dictionary<Record, List<Record>> ret = new Dictionary<Record, List<Record>>();
    string[] finalDataSetList = File.ReadAllLines(finalDataSetPath);
    DecisionTree[] forest = DataLoader.LoadForestFromDirectory(forestDirectory);

    foreach (Record record in records)
    {
        ret.Add(record, new List<Record>());
    }

    Console.WriteLine("Searching for matches...");

    // Row 0 is skipped (presumably a header row). Rows are scanned in parallel.
    Parallel.For(1, finalDataSetList.Length, c =>
    {
        if (c % 10000 == 0)
        {
            Console.WriteLine($"{(c / (finalDataSetList.Length * 1.0) * 100)}%");
        }

        string[] bits = finalDataSetList[c].Split(',');
        if (bits[0] == "")
        {
            return;
        }

        int enterpriseId = int.Parse(bits[0]);
        if (enterpriseId <= enterpriseIdLowerBoundExclusive)
        {
            return;
        }

        Record comparisonRecord = DataCleaner.CleanRecord(Record.FromFinalDatasetString(bits));

        foreach (Record toMatch in records)
        {
            if (toMatch.Equals(comparisonRecord))
            {
                // A record is never reported as a match of itself.
                continue;
            }

            RecordPair pair = new RecordPair
            {
                Record1 = toMatch,
                Record2 = comparisonRecord,
            };

            if (DecisionTreeBuilder.IsMatch(pair, forest, null))
            {
                // The per-record lists are shared between worker threads.
                lock (ret)
                {
                    ret[toMatch].Add(comparisonRecord);
                }
            }
        }
    });

    return ret;
}
/// <summary>
/// Answers whether the two MRNs of <paramref name="pair"/> lie within the question's maximum distance.
/// </summary>
/// <param name="question">Supplies <c>MRNMaxDistance</c>.</param>
/// <param name="pair">The pair whose MRNs are compared.</param>
/// <returns>
/// <c>true</c> when both MRNs are greater than zero and their absolute difference is at most
/// <c>MRNMaxDistance</c>; <c>false</c> otherwise.
/// </returns>
public static bool BasedOnMRNDistance(SplittingQuestion question, RecordPair pair)
{
    int allowedGap = question.MRNMaxDistance;

    bool bothPositive = pair.Record1.MRN > 0 && pair.Record2.MRN > 0;
    if (!bothPositive)
    {
        // A non-positive MRN never counts as "close".
        return false;
    }

    return System.Math.Abs(pair.Record1.MRN - pair.Record2.MRN) <= allowedGap;
}
/// <summary>
/// Interactive console helper: reads two record lines, runs them through the forest found in
/// the current directory with logging enabled, and optionally writes to a file every training
/// pair that gives the same answers to the logged splitting questions.
/// </summary>
public static void Test()
{
    Console.WriteLine("Line 1:");
    string line1 = Console.ReadLine();
    Console.WriteLine("Line 2:");
    string line2 = Console.ReadLine();

    RecordPair pair = new RecordPair();
    pair.Record1 = DataCleaner.CleanRecord(Record.FromString(line1));
    pair.Record2 = DataCleaner.CleanRecord(Record.FromString(line2));

    DecisionTree[] forest = DataLoader.LoadForestFromDirectory(".");
    TreeLogger logger = new TreeLogger();

    // The verdict itself is not needed here. The call is made for its effect on the logger,
    // whose SplittingQuestionsToTheBottom list is replayed below.
    // NOTE(review): presumably IsMatch fills that list and prints the decision path — confirm.
    DecisionTreeBuilder.IsMatch(pair, forest, logger);

    Console.WriteLine("Search for those in training data who make it there?");
    string response = Console.ReadLine();

    // Console.ReadLine() returns null at end of input, so compare null-safely;
    // anything other than "y"/"Y" means "no".
    if (!string.Equals(response, "y", StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    using (StreamWriter sw = File.AppendText("c:/users/brush/desktop/gothere.txt"))
    {
        List<RecordPair> pairs = new List<RecordPair>();

        Console.Write("Loading training data for this iteration...");
        pairs.AddRange(DataLoader.GetPositivesFromMRNData("mrns.csv"));
        pairs.AddRange(DataLoader.GetHandPassedSets("more.csv"));
        pairs.AddRange(DataLoader.GetRejectedRecordPairs("rejected.txt"));
        Console.WriteLine("...done");

        Parallel.ForEach(pairs, p =>
        {
            if (DecisionTreeBuilder.ReplayDecision(p, logger.SplittingQuestionsToTheBottom))
            {
                // The writer is shared between worker threads.
                lock (sw)
                {
                    sw.WriteLine(p);
                }
            }
        });
    }
}
/// <summary>
/// Lazily derives non-matching pairs from a list of known matching pairs by crossing the records
/// of every two positives that have no enterprise id in common.
/// </summary>
/// <param name="positives">Known matching pairs.</param>
/// <returns>
/// For each qualifying combination of two positives, four pairs with <c>IsMatch = false</c>, in
/// this order: (1st.Record1, 2nd.Record1), (1st.Record2, 2nd.Record2), (1st.Record1, 2nd.Record2),
/// (1st.Record2, 2nd.Record1).
/// </returns>
public static IEnumerable<RecordPair> LoadNegativesFromAnswerKey(List<RecordPair> positives)
{
    for (int outer = 0; outer < positives.Count; outer++)
    {
        RecordPair left = positives[outer];

        for (int inner = outer + 1; inner < positives.Count; inner++)
        {
            RecordPair right = positives[inner];

            // The two positives must not share any enterprise id; otherwise a crossed pair could
            // involve the same enterprise id on both sides.
            bool disjoint =
                left.Record1.EnterpriseId != right.Record1.EnterpriseId &&
                left.Record1.EnterpriseId != right.Record2.EnterpriseId &&
                left.Record2.EnterpriseId != right.Record1.EnterpriseId &&
                left.Record2.EnterpriseId != right.Record2.EnterpriseId;

            if (!disjoint)
            {
                continue;
            }

            yield return new RecordPair { IsMatch = false, Record1 = left.Record1, Record2 = right.Record1 };
            yield return new RecordPair { IsMatch = false, Record1 = left.Record2, Record2 = right.Record2 };
            yield return new RecordPair { IsMatch = false, Record1 = left.Record1, Record2 = right.Record2 };
            yield return new RecordPair { IsMatch = false, Record1 = left.Record2, Record2 = right.Record1 };
        }
    }
}
/// <summary>
/// Classifies <paramref name="pair"/> by majority vote over all trees of <paramref name="forest"/>.
/// </summary>
/// <param name="pair">The pair to classify.</param>
/// <param name="forest">The trees that vote.</param>
/// <param name="logger">Optional logger handed to every tree evaluation; may be null.</param>
/// <returns>
/// <c>true</c> when strictly more than half of the trees vote "match"; <c>false</c> otherwise,
/// including for an empty forest.
/// </returns>
public static bool IsMatch(RecordPair pair, DecisionTree[] forest, TreeLogger logger)
{
    int positives = 0;

    foreach (DecisionTree tree in forest)
    {
        if (RecurseAndCheckIsMatch(tree.Root, pair, logger))
        {
            positives++;
        }
    }

    // Compare in integer arithmetic. Dividing the two ints first would truncate the vote share
    // to 0 (or 1 on a unanimous vote) and would divide by zero for an empty forest.
    // positives / forest.Length > 0.5  <=>  2 * positives > forest.Length.
    return positives * 2 > forest.Length;
}
/// <summary>
/// Shows the left and right record of <paramref name="rp"/> in their respective browser controls.
/// </summary>
/// <remarks>
/// Each side is handled independently: a failure on one side is reported in a message box and
/// does not stop the other side from being shown. A side whose record id is null is navigated
/// to about:blank.
/// </remarks>
/// <param name="rp">The selected pair; when null the user is asked to select a record.</param>
private void ViewRecords(RecordPair rp)
{
    if (rp == null)
    {
        MessageBox.Show("You need to select a record");
        return;
    }

    var entityLogicalName = rp.Entity.LogicalName;

    // Left-hand record.
    try
    {
        if (rp.LRecord.Id != null)
        {
            string leftOrgUrl = GetUrl(leftProxy.ServiceConfiguration.CurrentServiceEndpoint.Address.ToString());
            string leftRecordId = rp.LRecord.Id.ToString();
            // %7B / %7D are the URL-encoded braces around the record id.
            Uri leftTarget = new Uri($"{leftOrgUrl}main.aspx?etn={entityLogicalName}&pagetype=entityrecord&id=%7B{leftRecordId}%7D");
            wbLeftRecord.Navigate(leftTarget);
        }
        else
        {
            wbLeftRecord.Navigate(new Uri("about:blank"));
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show($"[MainForm.BtnViewRecords]) Coudln't view left record, Message: {ex.Message}");
    }

    // Right-hand record.
    try
    {
        if (rp.RRecord.Id != null)
        {
            string rightOrgUrl = GetUrl(rightProxy.ServiceConfiguration.CurrentServiceEndpoint.Address.ToString());
            string rightRecordId = rp.RRecord.Id.ToString();
            Uri rightTarget = new Uri($"{rightOrgUrl}main.aspx?etn={entityLogicalName}&pagetype=entityrecord&id=%7B{rightRecordId}%7D");
            wbRightRecord.Navigate(rightTarget);
        }
        else
        {
            wbRightRecord.Navigate(new Uri("about:blank"));
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show($"[MainForm.BtnViewRecords]) Coudln't view right record, Message: {ex.Message}");
    }
}
/// <summary>
/// Answers a splitting question for a pair by dispatching on the question's match type.
/// </summary>
/// <param name="question">The question; its <c>MatchType</c> selects the matcher and its <c>Field</c> selects the cached column.</param>
/// <param name="pair">The pair being tested.</param>
/// <returns>
/// The selected matcher's answer. A <c>SoftMatch</c> question on a field other than
/// <c>Address1</c> or <c>DOB</c> answers <c>false</c>.
/// </returns>
/// <exception cref="ArgumentException">The match type is not one of the handled values.</exception>
public static bool ComputeSplitDirection(SplittingQuestion question, RecordPair pair)
{
    // Both cached column values are fetched up front, even for match types that do not use them.
    string firstValue = pair.Record1.Cache[(int)question.Field];
    string secondValue = pair.Record2.Cache[(int)question.Field];

    switch (question.MatchType)
    {
        case MatchTypeEnum.MRNDistance:
            return MatchTypeMatcher.BasedOnMRNDistance(question, pair);

        case MatchTypeEnum.LivesInMassResidence:
            return MatchTypeMatcher.BasedOnLivesInMassResidence(question, pair);

        case MatchTypeEnum.IsHomeless:
            return MatchTypeMatcher.BasedOnIsHomeless(question, pair);

        case MatchTypeEnum.EditDistance:
            return MatchTypeMatcher.BasedOnEditDistance(question, firstValue, secondValue);

        case MatchTypeEnum.EmptyMatch:
            return MatchTypeMatcher.BasedOnEmptyFields(question, firstValue, secondValue);

        case MatchTypeEnum.SoftMatch:
            // Soft matching exists only for addresses and dates; any other field answers false.
            if (question.Field == FieldEnum.Address1)
            {
                return MatchTypeMatcher.BasedOnAddressSoftMatch(question, firstValue, secondValue);
            }

            if (question.Field == FieldEnum.DOB)
            {
                return MatchTypeMatcher.BasedOnDateSoftMatch(question, firstValue, secondValue);
            }

            return false;

        case MatchTypeEnum.IsFemale:
            return MatchTypeMatcher.BasedOnIsFemale(question, pair);

        default:
            throw new ArgumentException();
    }
}
/// <summary>
/// Checks whether <paramref name="pair"/> gives the recorded answer to every recorded question.
/// </summary>
/// <param name="pair">The pair to test.</param>
/// <param name="rules">Questions paired with the answer each one is expected to produce.</param>
/// <returns>
/// <c>true</c> when every question yields its recorded answer (also when <paramref name="rules"/>
/// is empty); <c>false</c> at the first mismatch, without evaluating the remaining questions.
/// </returns>
public static bool ReplayDecision(RecordPair pair, List<Tuple<SplittingQuestion, bool>> rules)
{
    foreach (Tuple<SplittingQuestion, bool> expected in rules)
    {
        bool actualAnswer = ComputeSplitDirection(expected.Item1, pair);
        if (actualAnswer != expected.Item2)
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Loads labelled training pairs from a file laid out in groups of four lines.
/// </summary>
/// <remarks>
/// Within each group, line 0 is parsed with <c>bool.Parse</c> as the label and lines 1 and 2 are
/// the two records; line 3 is never read. If the last group has fewer than three lines, indexing
/// past the end of the array throws <see cref="IndexOutOfRangeException"/>.
/// </remarks>
/// <param name="noHomoFilePath">Path of the file to read.</param>
/// <returns>One pair per four-line group, in file order, with both records cleaned.</returns>
public static List<RecordPair> LoadTrainingDataFromNoHomoFile(string noHomoFilePath)
{
    List<RecordPair> loaded = new List<RecordPair>();
    string[] lines = File.ReadAllLines(noHomoFilePath);

    int groupStart = 0;
    while (groupStart < lines.Length)
    {
        loaded.Add(new RecordPair
        {
            IsMatch = bool.Parse(lines[groupStart]),
            Record1 = DataCleaner.CleanRecord(Record.FromString(lines[groupStart + 1])),
            Record2 = DataCleaner.CleanRecord(Record.FromString(lines[groupStart + 2])),
        });

        groupStart += 4;
    }

    return loaded;
}
/// <summary>
/// Interactive console helper: reads one record line and prints every row of the final data set
/// that the forest classifies as a match for it.
/// </summary>
public static void List()
{
    Console.WriteLine("Line to match:");
    string line1 = Console.ReadLine();

    // NOTE(review): unlike GetMatches and Test, neither record is passed through
    // DataCleaner.CleanRecord here — confirm whether that is intentional.
    Record toMatch = Record.FromString(line1);

    string[] finalDataSetList = File.ReadAllLines("c:/users/brush/desktop/finaldataset.csv");

    // The forest does not change between rows, so load it once instead of once per candidate row.
    DecisionTree[] forest = DataLoader.LoadForestFromDirectory("C:/users/brush/desktop/forest");

    Console.WriteLine("Searching for matches...");

    // Row 0 is skipped (presumably a header row).
    for (int c = 1; c < finalDataSetList.Length; c++)
    {
        string[] bits = finalDataSetList[c].Split(',');
        if (bits[0] == "")
        {
            continue;
        }

        // NOTE(review): the meaning of this id cut-off is not visible here — confirm.
        int enterpriseId = int.Parse(bits[0]);
        if (enterpriseId <= 15374761)
        {
            continue;
        }

        Record comparisonRecord = Record.FromFinalDatasetString(bits);
        RecordPair pair = new RecordPair
        {
            Record1 = toMatch,
            Record2 = comparisonRecord,
        };

        if (DecisionTreeBuilder.IsMatch(pair, forest, null))
        {
            Console.WriteLine(comparisonRecord);
            Console.WriteLine();
        }
    }
}
/// <summary>
/// Loads non-matching pairs from a file laid out in groups of three lines.
/// </summary>
/// <remarks>
/// Within each group, lines 0 and 1 are the two records; line 2 is never read. If the last group
/// consists of a single line, indexing past the end of the array throws
/// <see cref="IndexOutOfRangeException"/>.
/// </remarks>
/// <param name="rejectFile">Path of the file to read.</param>
/// <returns>
/// The pairs, cleaned and with <c>IsMatch = false</c>, that pass <c>PassesBigBucketFilter</c>, in file order.
/// </returns>
public static List<RecordPair> GetRejectedRecordPairs(string rejectFile)
{
    List<RecordPair> accepted = new List<RecordPair>();
    string[] rejectedLines = File.ReadAllLines(rejectFile);

    for (int groupStart = 0; groupStart < rejectedLines.Length; groupStart += 3)
    {
        string firstLine = rejectedLines[groupStart];
        string secondLine = rejectedLines[groupStart + 1];

        RecordPair candidate = new RecordPair
        {
            Record1 = DataCleaner.CleanRecord(Record.FromString(firstLine)),
            Record2 = DataCleaner.CleanRecord(Record.FromString(secondLine)),
            IsMatch = false,
        };

        if (!PassesBigBucketFilter(candidate))
        {
            continue;
        }

        accepted.Add(candidate);
    }

    return accepted;
}
/// <summary>
/// Builds non-matching pairs from a file of comma-separated integer ids, one pair per non-empty line.
/// </summary>
/// <param name="misfitsFilePath">Path of the file to read.</param>
/// <param name="finalDataSet">Lookup used to resolve the first two ids of each line into records.</param>
/// <returns>
/// One pair with <c>IsMatch = false</c> per non-empty line. Lines are processed in parallel, so
/// the order of the returned list is not guaranteed.
/// </returns>
public static List<RecordPair> GetPairsFromMisfitsFile(string misfitsFilePath, Dictionary<int, Record> finalDataSet)
{
    string[] lines = File.ReadAllLines(misfitsFilePath);
    List<RecordPair> misfits = new List<RecordPair>();

    Parallel.For(0, lines.Length, index =>
    {
        string line = lines[index];
        if (string.IsNullOrEmpty(line))
        {
            return;
        }

        // Every token on the line is parsed, although only the first two ids are used.
        int[] ids = line.Split(',').Select(token => int.Parse(token)).ToArray();

        RecordPair misfit = new RecordPair();
        misfit.IsMatch = false;
        misfit.Record1 = finalDataSet[ids[0]];
        misfit.Record2 = finalDataSet[ids[1]];

        // List<T> is not safe for concurrent writers.
        lock (misfits)
        {
            misfits.Add(misfit);
        }
    });

    // A disabled pass used to follow here that removed duplicate pairs and wrote the survivors
    // to a "cleaned.csv" file; duplicates are currently returned as-is.
    return misfits;
}
/// <summary>
/// Answers whether both records of <paramref name="pair"/> have gender "F".
/// </summary>
/// <param name="question">Not used by this matcher.</param>
/// <param name="pair">The pair whose genders are compared.</param>
/// <returns><c>true</c> when the two genders are equal and equal to "F".</returns>
public static bool BasedOnIsFemale(SplittingQuestion question, RecordPair pair)
{
    var firstGender = pair.Record1.Gender;
    var secondGender = pair.Record2.Gender;

    bool sameGender = firstGender == secondGender;
    return sameGender && firstGender == "F";
}
/// <summary>
/// Builds training pairs from the cleaned records of an MRN file: one positive per consecutive
/// record pair, plus randomly chosen negatives against the first record of every other pair.
/// </summary>
/// <remarks>
/// Records at indices c and c + 1 (c even) are treated as a matching pair. For every other even
/// index d, exactly one negative is produced, using either record c or record c + 1 (chosen at
/// random) against record d. A trailing unpaired record is only ever used on the "d" side.
/// </remarks>
/// <param name="inputFilePath">Path handed to <c>GetCleanedRecordsFromMRNFile</c>.</param>
/// <returns>
/// The generated pairs that pass <c>PassesBigBucketFilter</c> (positives are added unfiltered).
/// Work is done in parallel, so the order of the list is not guaranteed.
/// </returns>
public static List<RecordPair> GetRandomPairsForMRNData(string inputFilePath)
{
    List<RecordPair> trainingData = new List<RecordPair>();
    List<Record> mrnRecords = GetCleanedRecordsFromMRNFile(inputFilePath);

    Parallel.For(0, mrnRecords.Count / 2, pairIndex =>
    {
        int c = pairIndex * 2;

        // Seed explicitly: parameterless Random instances created at almost the same moment
        // can share a time-based seed and would then make identical choices.
        Random rand = new Random(Guid.NewGuid().GetHashCode());

        // Collect locally and publish once, so the shared list is locked once per iteration.
        List<RecordPair> produced = new List<RecordPair>();

        produced.Add(new RecordPair
        {
            IsMatch = true,
            Record1 = mrnRecords[c],
            Record2 = mrnRecords[c + 1],
        });

        for (int d = 0; d < mrnRecords.Count; d += 2)
        {
            if (c == d)
            {
                continue;
            }

            // Pick one side of the matching pair at random for this negative.
            Record chosen = rand.Next() % 2 == 0 ? mrnRecords[c] : mrnRecords[c + 1];

            RecordPair pair = new RecordPair
            {
                IsMatch = false,
                Record1 = chosen,
                Record2 = mrnRecords[d]
            };

            if (PassesBigBucketFilter(pair))
            {
                produced.Add(pair);
            }
        }

        lock (trainingData)
        {
            trainingData.AddRange(produced);
        }
    });

    return trainingData;
}
/// <summary>
/// Answers whether the first record of <paramref name="pair"/> has "HOMELESS" in either address line.
/// </summary>
/// <remarks>
/// NOTE(review): only Record1 is inspected; Record2 is never consulted. Confirm whether the second
/// operand was meant to be Record2.Address1 before changing it, since existing trees may depend on
/// the current behaviour.
/// </remarks>
/// <param name="question">Not used by this matcher.</param>
/// <param name="pair">The pair whose first record is inspected.</param>
/// <returns><c>true</c> when Record1.Address1 or Record1.Address2 equals "HOMELESS".</returns>
public static bool BasedOnIsHomeless(SplittingQuestion question, RecordPair pair)
{
    if (pair.Record1.Address1 == "HOMELESS")
    {
        return true;
    }

    return pair.Record1.Address2 == "HOMELESS";
}
/// <summary>
/// Answers whether at least one record of <paramref name="pair"/> is flagged as living in a large residence.
/// </summary>
/// <param name="question">Not used by this matcher.</param>
/// <param name="pair">The pair whose records are inspected.</param>
/// <returns><c>true</c> when either record has <c>LivesInLargeResidence</c> set.</returns>
public static bool BasedOnLivesInMassResidence(SplittingQuestion question, RecordPair pair)
{
    if (pair.Record1.LivesInLargeResidence)
    {
        return true;
    }

    return pair.Record2.LivesInLargeResidence;
}
/// <summary>
/// Builds training pairs from a file of record groups separated by empty lines.
/// </summary>
/// <remarks>
/// Records within one group are paired as positives, including each record with itself. Every
/// record of a group is paired as a negative with every record of every other group, so each
/// cross-group combination appears once in each direction. All pairs are subject to
/// <c>PassesBigBucketFilter</c>.
/// </remarks>
/// <param name="inputMoreFilePath">Path of the grouped record file.</param>
/// <returns>
/// The accepted pairs. For each group in file order: first its positives, then its negatives
/// against the other groups in file order.
/// </returns>
public static List<RecordPair> GetHandPassedSets(string inputMoreFilePath)
{
    List<RecordPair> trainingData = new List<RecordPair>();
    string[] extraLines = File.ReadAllLines(inputMoreFilePath);

    // Step 1: split the file into groups; an empty line ends the current group.
    List<Record[]> moreGroups = new List<Record[]>();
    int lineIndex = 0;
    while (lineIndex < extraLines.Length)
    {
        List<Record> members = new List<Record>();
        while (lineIndex < extraLines.Length && extraLines[lineIndex] != "")
        {
            members.Add(DataCleaner.CleanRecord(Record.FromString(extraLines[lineIndex])));
            lineIndex++;
        }

        moreGroups.Add(members.ToArray());

        // Step over the separator line (or past the end of the file).
        lineIndex++;
    }

    // Step 2: generate the pairs group by group.
    for (int groupIndex = 0; groupIndex < moreGroups.Count; groupIndex++)
    {
        Record[] currentGroup = moreGroups[groupIndex];

        // Positives: every unordered combination inside the group, self-pairs included.
        for (int first = 0; first < currentGroup.Length; first++)
        {
            for (int second = first; second < currentGroup.Length; second++)
            {
                RecordPair positive = new RecordPair
                {
                    IsMatch = true,
                    Record1 = currentGroup[first],
                    Record2 = currentGroup[second]
                };

                if (PassesBigBucketFilter(positive))
                {
                    trainingData.Add(positive);
                }
            }
        }

        // Negatives: every record of this group against every record of each other group.
        for (int otherIndex = 0; otherIndex < moreGroups.Count; otherIndex++)
        {
            if (otherIndex == groupIndex)
            {
                continue;
            }

            Record[] otherGroup = moreGroups[otherIndex];
            foreach (Record insider in currentGroup)
            {
                foreach (Record outsider in otherGroup)
                {
                    RecordPair negative = new RecordPair
                    {
                        IsMatch = false,
                        Record1 = insider,
                        Record2 = outsider
                    };

                    if (PassesBigBucketFilter(negative))
                    {
                        trainingData.Add(negative);
                    }
                }
            }
        }
    }

    return trainingData;
}