/// <summary>
/// Prints how many rows of <paramref name="data"/> have a closed row set containing exactly one entry.
/// </summary>
/// <param name="data">The rows being examined; only the array length is used, as the index range.</param>
/// <param name="matches">
/// The closed sets whose <c>RowToClosedRowSet</c> is indexed by row position.
/// NOTE(review): a set of size one presumably means the row has not been matched to any other row — confirm against ClosedSets.
/// </param>
public static void PrintRemainingRowCount(Row[] data, ClosedSets matches)
{
    int singletonCount = 0;
    int rowIndex = 0;

    while (rowIndex < data.Length)
    {
        bool isSingleton = matches.RowToClosedRowSet[rowIndex].Count == 1;
        if (isSingleton)
        {
            singletonCount += 1;
        }

        rowIndex++;
    }

    Console.WriteLine($"Remaining: {singletonCount}");
}
/// <summary>
/// Builds a <see cref="ClosedSets"/> over <paramref name="allData"/> and seeds it with the match
/// groups stored in <c>submission1.csv</c>, one group per line.
/// </summary>
/// <param name="allData">The rows handed to the <see cref="ClosedSets"/> constructor.</param>
/// <returns>The closed sets after every line of the file has been passed to <c>AddMatch</c>.</returns>
/// <remarks>
/// Each line is split on commas and every field except the last is read as an integer id.
/// NOTE(review): the last field is deliberately left out, as in the original loop bound — presumably
/// a trailing delimiter or a non-id column; confirm against the file format.
/// Fields that are not valid integers are skipped. Previously the result of <c>int.TryParse</c> was
/// ignored, so a blank or malformed field silently contributed the id 0 to the group.
/// </remarks>
public static ClosedSets LoadOriginalMatches(Row[] allData)
{
    IEnumerable<string> lines = File.ReadLines(@"..\..\..\..\submission1.csv");
    ClosedSets closedSets = new ClosedSets(allData);

    foreach (string line in lines)
    {
        string[] fields = line.Split(',');
        List<int> eids = new List<int>();

        // The final field is intentionally not read (see remarks).
        for (int i = 0; i < fields.Length - 1; i++)
        {
            int eid;
            if (int.TryParse(fields[i], out eid))
            {
                eids.Add(eid);
            }
            // else: not a number; adding the default 0 would fabricate an id, so the field is dropped.
        }

        closedSets.AddMatch(eids);
    }

    return closedSets;
}
/// <summary>
/// Runs the fixed, ordered list of matching rules over <paramref name="allData"/>, recording the
/// results in <paramref name="newMatches"/>.
/// </summary>
/// <param name="allData">The rows every rule is applied to.</param>
/// <param name="newMatches">
/// The closed sets that receive the matches. Its <c>NumberOfMatches</c> at entry is captured once and
/// passed to every rule as the baseline for the cumulative count that AddMatches prints.
/// </param>
/// <remarks>
/// Every rule is one AddMatches call with: a label, a grouping key produced by <c>HardSelector</c> from
/// the "hard" fields, a threshold, and a pair scorer (<c>SoftMatchCount</c> over the "soft" fields, or a
/// constant). AddMatches accepts a pair from the same group when the scorer's result is at least the threshold.
/// After the last rule, "Done!" is printed and, when <c>_printActuals</c> is set, <c>_newMatchingRows</c>
/// is written to <c>newMatches.txt</c>.
/// </remarks>
public void FindAllMatches(Row[] allData, ref ClosedSets newMatches)
{
    int baseline = newMatches.NumberOfMatches;

    // Scorer for rules where sharing the hard key is sufficient: it always returns 1, and those
    // rules use a threshold of 0, so every pair in a group passes.
    Func<Row, Row, int> alwaysMatch = (a, b) => 1;

    //****************** SOLID MATCHES ******************//

    AddMatches(
        "SSN + LAST",
        allData,
        row => HardSelector(row, new FieldInclusions { SSN = true, Last = true }),
        0,
        alwaysMatch,
        ref newMatches,
        baseline);

    AddMatches(
        "SSN + DOB",
        allData,
        row => HardSelector(row, new FieldInclusions { SSN = true, DOB = true }),
        0,
        alwaysMatch,
        ref newMatches,
        baseline);

    AddMatches(
        "SSN + PHONE",
        allData,
        row => HardSelector(row, new FieldInclusions { SSN = true, Phone = true }),
        0,
        alwaysMatch,
        ref newMatches,
        baseline);

    AddMatches(
        "SSN + ADDRESS",
        allData,
        row => HardSelector(row, new FieldInclusions { SSN = true, Address = true }),
        0,
        alwaysMatch,
        ref newMatches,
        baseline);

    AddMatches(
        "NAME + DOB strong",
        allData,
        row => HardSelector(row, new FieldInclusions { Name = true, DOB = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, Phone = true, Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "NAME + PHONE (no sr/jr)",
        allData,
        row => HardSelector(row, new FieldInclusions { Name = true, Phone = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, DOB = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "NAME + ADDRESS (no sr/jr)",
        allData,
        row => HardSelector(row, new FieldInclusions { Name = true, Address = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, DOB = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "DOB + PHONE (no twin)",
        allData,
        row => HardSelector(row, new FieldInclusions { First = true, DOB = true, Phone = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, Last = true, Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "DOB + ADDRESS (no twin)",
        allData,
        row => HardSelector(row, new FieldInclusions { DOB = true, Address = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, First = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "PHONE + ADDRESS (no twin)",
        allData,
        row => HardSelector(row, new FieldInclusions { Phone = true, Address = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, First = true }),
        ref newMatches,
        baseline);

    // Code review (Josh): this rule makes many of the SSN rules above redundant.
    AddMatches(
        "SSN + soft match",
        allData,
        row => HardSelector(row, new FieldInclusions { SSN = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { Name = true, Phone = true, DOB = true, Address = true }),
        ref newMatches,
        baseline);

    //****************** PROBABLY SOLID MATCHES ******************//

    AddMatches(
        "NAME + PHONE (sr/jr)",
        allData,
        row => HardSelector(row, new FieldInclusions { Name = true, Phone = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "NAME + ADDRESS (sr/jr)",
        allData,
        row => HardSelector(row, new FieldInclusions { Name = true, Address = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { Phone = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "DOB + PHONE (twin)",
        allData,
        row => HardSelector(row, new FieldInclusions { DOB = true, Phone = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { Last = true, Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "DOB + ADDRESS (twin)",
        allData,
        row => HardSelector(row, new FieldInclusions { Address = true, DOB = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { Last = true, Phone = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "PHONE + ADDRESS (twin)",
        allData,
        row => HardSelector(row, new FieldInclusions { Phone = true, Address = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { Last = true }),
        ref newMatches,
        baseline);

    // Disabled rules (previously present here as commented-out code, threshold 2):
    //   "Name + 2 soft" - hard: Name; soft: SSN, DOB, Phone, Address
    //   "DOB + 2 soft"  - hard: DOB;  soft: SSN, First, Phone, Address

    AddMatches(
        "Phone + 2 soft",
        allData,
        row => HardSelector(row, new FieldInclusions { Phone = true }),
        2,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, First = true, DOB = true, Address = true }),
        ref newMatches,
        baseline);

    // Code review (Josh): this can match on address, first name and DOB alone.
    // Maybe it belongs in the weaker-matches category?
    AddMatches(
        "Address + 2 soft",
        allData,
        row => HardSelector(row, new FieldInclusions { Address = true }),
        2,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, First = true, DOB = true, Phone = true }),
        ref newMatches,
        baseline);

    //****************** WEAKER MATCHES (disabled) ******************//
    // Previously present as commented-out code. Each rule used threshold 1 and appended the list
    // returned by AddMatches to a local "weakerMatchedIDs" list:
    //   "PHONE + soft SSN"       - hard: Phone;         soft: SSN
    //   "NAME + DOB weaker"      - hard: Name, DOB;     soft: SSNSoft
    //   "NAME + PHONE weaker"    - hard: Name, Phone;   soft: SSNSoft
    //   "NAME + ADDRESS weaker"  - hard: Name, Address; soft: SSNSoft
    //   "PHONE + soft NAME"      - hard: Phone;         soft: Name
    //   "PHONE + soft FIRST/DOB" - hard: Phone;         soft: First, DOB

    //****************** NEW SUPER-WEAK MATCHES ******************//

    AddMatches(
        "FIRST + EMAIL + soft",
        allData,
        row => HardSelector(row, new FieldInclusions { Email = true, First = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, DOB = true, Phone = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "EMAIL + soft",
        allData,
        row => HardSelector(row, new FieldInclusions { Email = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, DOB = true, Phone = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "SSN + soft",
        allData,
        row => HardSelector(row, new FieldInclusions { SSN = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { First = true, DOB = true, Phone = true, Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "NAME + soft",
        allData,
        row => HardSelector(row, new FieldInclusions { Name = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, DOB = true, Phone = true, Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "Phone + soft",
        allData,
        row => HardSelector(row, new FieldInclusions { Phone = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, Name = true, DOB = true, Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "DOB + soft",
        allData,
        row => HardSelector(row, new FieldInclusions { DOB = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, Name = true, Phone = true, Address = true }),
        ref newMatches,
        baseline);

    AddMatches(
        "Address + soft",
        allData,
        row => HardSelector(row, new FieldInclusions { Address = true }),
        1,
        (a, b) => SoftMatchCount(a, b, new FieldInclusions { SSN = true, Name = true, DOB = true, Phone = true }),
        ref newMatches,
        baseline);

    Console.WriteLine("Done!");

    if (_printActuals)
    {
        File.WriteAllLines("newMatches.txt", _newMatchingRows);
    }
}
/// <summary>
/// Applies one matching rule: groups <paramref name="data"/> by <paramref name="groupingValue"/>, then
/// offers every ordered pair of distinct rows within a group to <paramref name="matches"/> when the
/// pair's <paramref name="softEquals"/> score is at least <paramref name="softEqualsCount"/>.
/// </summary>
/// <param name="label">Rule name; printed to the console and written into <c>_newMatchingRows</c> as a section marker.</param>
/// <param name="data">The rows to group and compare.</param>
/// <param name="groupingValue">Produces the grouping key for a row. Groups whose key is "BADFORMAT" are skipped.</param>
/// <param name="softEqualsCount">Minimum score a pair must reach to be passed to <c>matches.AddMatch</c>.</param>
/// <param name="softEquals">Scores a pair of rows from the same group.</param>
/// <param name="matches">Receives accepted pairs; a pair counts as "added" only when <c>AddMatch</c> returns true.</param>
/// <param name="originalNumberOfMatches">Baseline subtracted from <c>matches.NumberOfMatches</c> for the cumulative total printed at the end.</param>
/// <returns>
/// A list that is created empty and never added to in this method, so callers currently always receive an empty list.
/// </returns>
/// <remarks>
/// Groups with more than 100 rows or fewer than 2 rows are skipped. Each unordered pair is visited twice
/// (once in each order), so the "thrown out" total counts rejected ordered pairs. The "modified" counter
/// is never incremented and is therefore always printed as 0.
/// </remarks>
public List<int> AddMatches(string label, Row[] data, Func<Row, string> groupingValue, int softEqualsCount, Func<Row, Row, int> softEquals, ref ClosedSets matches, int originalNumberOfMatches)
{
    _newMatchingRows.Add($"STARTING {label} MATCHES");
    List<int> matchedIds = new List<int>();

    Console.WriteLine();
    Console.WriteLine(label);

    int rejectedPairs = 0;
    int acceptedPairs = 0;
    int modifiedPairs = 0;

    foreach (var bucket in data.GroupBy(groupingValue))
    {
        int bucketSize = bucket.Count();

        // Oversized groups are skipped; the key is optionally logged for inspection.
        if (bucketSize > 100)
        {
            if (_printLargeGroupValues)
            {
                Console.WriteLine(bucket.Key);
            }

            continue;
        }

        // Nothing to pair up, or the key marks unentered / cleaned data.
        if (bucketSize < 2 || bucket.Key == "BADFORMAT")
        {
            continue;
        }

        // Test every ordered pair of distinct rows in the group.
        foreach (Row row1 in bucket)
        {
            foreach (Row row2 in bucket.Where(candidate => candidate != row1))
            {
                if (softEquals(row1, row2) < softEqualsCount)
                {
                    rejectedPairs++;
                    if (_printErrors)
                    {
                        PrintingLibrary.PrintPair(row1, row2);
                    }

                    continue;
                }

                if (!matches.AddMatch(row1, row2))
                {
                    continue;
                }

                acceptedPairs++;
                if (_printActuals)
                {
                    _newMatchingRows.Add(row1.ToString());
                    _newMatchingRows.Add(row2.ToString());
                    _newMatchingRows.Add("");
                }
            }
        }
    }

    Console.WriteLine($"Groups thrown out: {rejectedPairs}");
    Console.WriteLine($"Match added: {acceptedPairs}");
    Console.WriteLine($"Match modified: {modifiedPairs}");
    Console.WriteLine($"Cumulative Matches Found: {matches.NumberOfMatches - originalNumberOfMatches}");
    PrintingLibrary.PrintRemainingRowCount(data, matches);

    return matchedIds;
}