/// <summary>
/// Compares every supplied record against the rows of the final data set CSV and
/// collects, per supplied record, the rows for which
/// <c>DecisionTreeBuilder.IsMatch</c> returns true.
/// </summary>
/// <param name="records">Records to search matches for; each one becomes a key of the result.</param>
/// <returns>
/// A dictionary with one entry per element of <paramref name="records"/>; the value is the
/// list of matching data-set records (empty when nothing matched).
/// </returns>
/// <remarks>
/// The CSV and forest locations are hard-coded. Row 0 of the CSV is never visited
/// (presumably a header row - confirm against the file). Only rows whose first column is
/// non-empty and parses to an id strictly greater than 15374761 are considered.
/// NOTE(review): LoadFinalDataSet uses an inclusive (&gt;=) comparison against the same
/// number - confirm which bound is intended.
/// </remarks>
public static Dictionary<Record, List<Record>> GetMatches(List<Record> records)
{
    const int EnterpriseIdFloor = 15374761;

    string[] rows = File.ReadAllLines("c:/users/brush/desktop/finaldataset.csv");
    DecisionTree[] forest = DataLoader.LoadForestFromDirectory("C:/users/brush/desktop/forest");

    // Seed the result so every requested record has a (possibly empty) match list.
    Dictionary<Record, List<Record>> matchesByRecord = new Dictionary<Record, List<Record>>();
    foreach (Record key in records)
    {
        matchesByRecord.Add(key, new List<Record>());
    }

    Console.WriteLine("Searching for matches...");

    Parallel.For(1, rows.Length, rowIndex =>
    {
        // Progress report every 10000 rows.
        if (rowIndex % 10000 == 0)
        {
            double percentDone = rowIndex / (rows.Length * 1.0) * 100;
            Console.WriteLine($"{percentDone}%");
        }

        string[] fields = rows[rowIndex].Split(',');
        if (fields[0] == "")
        {
            return;
        }

        if (int.Parse(fields[0]) <= EnterpriseIdFloor)
        {
            return;
        }

        Record candidate = DataCleaner.CleanRecord(Record.FromFinalDatasetString(fields));
        foreach (Record toMatch in records)
        {
            // A record is never reported as a match of itself.
            if (toMatch.Equals(candidate))
            {
                continue;
            }

            RecordPair pair = new RecordPair
            {
                Record1 = toMatch,
                Record2 = candidate,
            };

            if (DecisionTreeBuilder.IsMatch(pair, forest, null))
            {
                // Loop bodies may run concurrently; serialize writes to the shared lists.
                lock (matchesByRecord)
                {
                    matchesByRecord[toMatch].Add(candidate);
                }
            }
        }
    });

    return matchesByRecord;
}
/// <summary>
/// Interactive console check. Reads two raw record lines from standard input, evaluates
/// them as a pair against the forest loaded from the current directory, and then
/// optionally replays the logged decision path over the training data, appending every
/// training pair for which <c>DecisionTreeBuilder.ReplayDecision</c> returns true to
/// <c>gothere.txt</c>.
/// </summary>
/// <remarks>
/// The replay step runs only when the answer to the final prompt is "y" (any casing).
/// A missing answer (standard input closed, so <c>Console.ReadLine</c> returns null) is
/// treated as "no" instead of throwing.
/// </remarks>
public static void Test()
{
    Console.WriteLine("Line 1:");
    string line1 = Console.ReadLine();
    Console.WriteLine("Line 2:");
    string line2 = Console.ReadLine();

    RecordPair pair = new RecordPair();
    pair.Record1 = DataCleaner.CleanRecord(Record.FromString(line1));
    pair.Record2 = DataCleaner.CleanRecord(Record.FromString(line2));

    DecisionTree[] forest = DataLoader.LoadForestFromDirectory(".");
    TreeLogger logger = new TreeLogger();

    // The boolean result is not needed here. The call is kept because the logger handed
    // to it is read below (presumably IsMatch fills it - confirm in DecisionTreeBuilder).
    DecisionTreeBuilder.IsMatch(pair, forest, logger);

    Console.WriteLine("Search for those in training data who make it there?");
    string response = Console.ReadLine();

    // Null-safe, culture-independent check; anything other than "y"/"Y" means "no".
    if (!string.Equals(response, "y", StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    using (StreamWriter sw = File.AppendText("c:/users/brush/desktop/gothere.txt"))
    {
        List<RecordPair> pairs = new List<RecordPair>();

        Console.Write("Loading training data for this iteration...");
        pairs.AddRange(DataLoader.GetPositivesFromMRNData("mrns.csv"));
        pairs.AddRange(DataLoader.GetHandPassedSets("more.csv"));
        pairs.AddRange(DataLoader.GetRejectedRecordPairs("rejected.txt"));
        // NOTE: pairs from misfits.txt were disabled in the original code and stay excluded.
        Console.WriteLine("...done");

        Parallel.ForEach(pairs, p =>
        {
            if (DecisionTreeBuilder.ReplayDecision(p, logger.SplittingQuestionsToTheBottom))
            {
                // The writer is shared by all loop bodies; serialize access to it.
                lock (sw)
                {
                    sw.WriteLine(p);
                }
            }
        });
    }
}
/// <summary>
/// Reads labelled record pairs from a text file laid out in entries of four lines:
/// a boolean label, the first record, the second record, and a fourth line that is
/// not read.
/// </summary>
/// <param name="noHomoFilePath">Path of the file to read.</param>
/// <returns>One <c>RecordPair</c> per entry, in file order, with both records cleaned.</returns>
/// <remarks>
/// No bounds check is made on the last entry: if the file ends after the label or the
/// first record of an entry, the array access throws <c>IndexOutOfRangeException</c>.
/// A label line that is not a valid boolean makes <c>bool.Parse</c> throw.
/// </remarks>
public static List<RecordPair> LoadTrainingDataFromNoHomoFile(string noHomoFilePath)
{
    const int LinesPerEntry = 4;

    List<RecordPair> loadedPairs = new List<RecordPair>();
    string[] lines = File.ReadAllLines(noHomoFilePath);

    int offset = 0;
    while (offset < lines.Length)
    {
        loadedPairs.Add(new RecordPair
        {
            IsMatch = bool.Parse(lines[offset]),
            Record1 = DataCleaner.CleanRecord(Record.FromString(lines[offset + 1])),
            Record2 = DataCleaner.CleanRecord(Record.FromString(lines[offset + 2])),
        });

        offset += LinesPerEntry;
    }

    return loadedPairs;
}
/// <summary>
/// Reads rejected (non-matching) record pairs from a text file laid out in entries of
/// three lines: the first record, the second record, and a third line that is not read.
/// </summary>
/// <param name="rejectFile">Path of the file to read.</param>
/// <returns>
/// The pairs, in file order, that pass <c>PassesBigBucketFilter</c>; every pair has
/// <c>IsMatch</c> set to false and both records cleaned.
/// </returns>
/// <remarks>
/// No bounds check is made on the last entry: if the file ends after the first record
/// of an entry, the array access throws <c>IndexOutOfRangeException</c>.
/// </remarks>
public static List<RecordPair> GetRejectedRecordPairs(string rejectFile)
{
    const int LinesPerEntry = 3;

    List<RecordPair> rejectedPairs = new List<RecordPair>();
    string[] rejectedLines = File.ReadAllLines(rejectFile);

    for (int start = 0; start < rejectedLines.Length; start += LinesPerEntry)
    {
        string firstRaw = rejectedLines[start];
        string secondRaw = rejectedLines[start + 1];

        RecordPair candidate = new RecordPair
        {
            Record1 = DataCleaner.CleanRecord(Record.FromString(firstRaw)),
            Record2 = DataCleaner.CleanRecord(Record.FromString(secondRaw)),
            IsMatch = false,
        };

        if (!PassesBigBucketFilter(candidate))
        {
            continue;
        }

        rejectedPairs.Add(candidate);
    }

    return rejectedPairs;
}
/// <summary>
/// Streams the final data set CSV and builds a lookup of cleaned records keyed by
/// their <c>EnterpriseId</c>.
/// </summary>
/// <param name="finalDataSetPath">Path of the CSV file to read.</param>
/// <returns>
/// A dictionary from enterprise id to cleaned record, containing only rows whose first
/// column is non-empty and parses to an id of at least 15374761.
/// </returns>
/// <remarks>
/// The first line of the file is skipped as a header. A first column that is not a valid
/// integer makes <c>int.Parse</c> throw, and two rows yielding the same
/// <c>EnterpriseId</c> make <c>Dictionary.Add</c> throw.
/// </remarks>
public static Dictionary<int, Record> LoadFinalDataSet(string finalDataSetPath)
{
    const int LowestEnterpriseId = 15374761;

    Dictionary<int, Record> recordsById = new Dictionary<int, Record>();
    bool headerConsumed = false;

    foreach (string row in File.ReadLines(finalDataSetPath))
    {
        if (!headerConsumed)
        {
            headerConsumed = true;
            continue;
        }

        string idField = row.Split(',')[0];
        if (idField == "" || int.Parse(idField) < LowestEnterpriseId)
        {
            continue;
        }

        Record cleaned = DataCleaner.CleanRecord(Record.FromFinalDatasetString(row));
        recordsById.Add(cleaned.EnterpriseId, cleaned);
    }

    return recordsById;
}
/// <summary>
/// Builds training pairs from a file of record groups, where groups are separated by
/// empty lines. Records in the same group are paired as matches; records from different
/// groups are paired as non-matches.
/// </summary>
/// <param name="inputMoreFilePath">Path of the grouped-records file to read.</param>
/// <returns>
/// All generated pairs that pass <c>PassesBigBucketFilter</c>. For each group, in file
/// order, its positive pairs come first, followed by its negative pairs against every
/// other group.
/// </returns>
/// <remarks>
/// Positive pairing starts the inner index at the outer index, so every record is also
/// paired with itself. NOTE(review): confirm self-pairs are wanted as positives.
/// Negative pairs are produced in both directions, because each group is paired against
/// every other group and each of those is in turn paired against it.
/// </remarks>
public static List<RecordPair> GetHandPassedSets(string inputMoreFilePath)
{
    List<RecordPair> trainingPairs = new List<RecordPair>();
    string[] rawLines = File.ReadAllLines(inputMoreFilePath);

    // Phase 1: split the file into groups; an empty line closes the current group.
    List<Record[]> groups = new List<Record[]>();
    int cursor = 0;
    while (cursor < rawLines.Length)
    {
        List<Record> members = new List<Record>();
        while (cursor < rawLines.Length && rawLines[cursor] != "")
        {
            Record parsed = Record.FromString(rawLines[cursor]);
            members.Add(DataCleaner.CleanRecord(parsed));
            cursor++;
        }

        groups.Add(members.ToArray());

        // Step over the separator line (or past the end of the file).
        cursor++;
    }

    // Phase 2: emit pairs group by group.
    for (int current = 0; current < groups.Count; current++)
    {
        Record[] ownGroup = groups[current];

        // Positives: every combination inside the group, including a record with itself.
        for (int left = 0; left < ownGroup.Length; left++)
        {
            for (int right = left; right < ownGroup.Length; right++)
            {
                RecordPair positive = new RecordPair
                {
                    IsMatch = true,
                    Record1 = ownGroup[left],
                    Record2 = ownGroup[right]
                };

                if (PassesBigBucketFilter(positive))
                {
                    trainingPairs.Add(positive);
                }
            }
        }

        // Negatives: every record of this group against every record of every other group.
        for (int other = 0; other < groups.Count; other++)
        {
            if (other == current)
            {
                continue;
            }

            Record[] otherGroup = groups[other];
            foreach (Record mine in ownGroup)
            {
                foreach (Record theirs in otherGroup)
                {
                    RecordPair negative = new RecordPair
                    {
                        IsMatch = false,
                        Record1 = mine,
                        Record2 = theirs
                    };

                    if (PassesBigBucketFilter(negative))
                    {
                        trainingPairs.Add(negative);
                    }
                }
            }
        }
    }

    return trainingPairs;
}