Ejemplo n.º 1
0
        /// <summary>
        /// Writes to the console how many positions in <paramref name="data"/> have a
        /// closed row set containing exactly one entry.
        /// </summary>
        /// <param name="data">Rows being examined; only the array length is used here.</param>
        /// <param name="matches">
        /// Source of <c>RowToClosedRowSet</c>, which is indexed with the same positions as
        /// <paramref name="data"/>.
        /// NOTE(review): a set of size one presumably means "row not matched to anything else" - confirm.
        /// </param>
        public static void PrintRemainingRowCount(Row[] data, ClosedSets matches)
        {
            int singletonCount = 0;
            int rowCount       = data.Length;

            for (int position = 0; position < rowCount; position++)
            {
                // Count this position only when its closed set holds a single entry.
                singletonCount += (matches.RowToClosedRowSet[position].Count == 1) ? 1 : 0;
            }

            Console.WriteLine($"Remaining: {singletonCount}");
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a <see cref="ClosedSets"/> over <paramref name="allData"/> and feeds it one
        /// match per line of <c>submission1.csv</c> (resolved relative to the working directory).
        /// </summary>
        /// <remarks>
        /// Each line is split on commas; every token except the last one is parsed as an
        /// integer and the resulting list is handed to <c>AddMatch</c>.
        /// NOTE(review): the parse result is ignored, so a token that is not a valid integer is
        /// added as 0 - confirm that is intended.
        /// NOTE(review): the last token of every line is skipped - presumably the file has a
        /// trailing comma or a trailing non-id column; verify against the file format.
        /// </remarks>
        /// <param name="allData">Rows passed to the <see cref="ClosedSets"/> constructor.</param>
        /// <returns>The populated <see cref="ClosedSets"/> instance.</returns>
        public static ClosedSets LoadOriginalMatches(Row[] allData)
        {
            // The file is opened first and the sets are constructed second, as before.
            IEnumerable <string> submissionLines = File.ReadLines(@"..\..\..\..\submission1.csv");
            ClosedSets           result          = new ClosedSets(allData);

            foreach (string submissionLine in submissionLines)
            {
                string[]   tokens           = submissionLine.Split(',');
                int        usableTokenCount = tokens.Length - 1; // the final token is never parsed
                List <int> parsedIds        = new List <int>();

                int position = 0;
                while (position < usableTokenCount)
                {
                    int parsed;
                    int.TryParse(tokens[position], out parsed); // leaves 0 in 'parsed' on failure
                    parsedIds.Add(parsed);
                    position++;
                }

                result.AddMatch(parsedIds);
            }

            return result;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Runs a fixed, ordered sequence of <c>AddMatches</c> passes over <paramref name="allData"/>,
        /// accumulating every accepted pair into <paramref name="newMatches"/>.
        /// </summary>
        /// <remarks>
        /// Every pass supplies: a label for the console/log output, a grouping key produced by
        /// <c>HardSelector</c> from the fields flagged in a <c>FieldInclusions</c>, a minimum
        /// soft-match count, and a pair scorer (usually <c>SoftMatchCount</c> over a second
        /// <c>FieldInclusions</c>). The section banners below order the passes from the strongest
        /// evidence to the weakest.
        /// NOTE(review): <c>HardSelector</c> and <c>SoftMatchCount</c> are defined elsewhere;
        /// presumably the former builds an exact-match key and the latter counts how many of the
        /// flagged fields agree between two rows - confirm.
        /// After the last pass the method prints "Done!" and, when <c>_printActuals</c> is set,
        /// writes <c>_newMatchingRows</c> to <c>newMatches.txt</c> in the working directory.
        /// </remarks>
        /// <param name="allData">All rows to be grouped and compared in every pass.</param>
        /// <param name="newMatches">
        /// Match accumulator, forwarded by reference to each pass. It is not reassigned directly
        /// in this method.
        /// </param>
        public void FindAllMatches(Row[] allData, ref ClosedSets newMatches)
        {
            // Baseline passed to every pass so it can report the cumulative number of matches found by this run.
            int originalNumberOfMatches = newMatches.NumberOfMatches;

            //******************  SOLID MATCHES   ******************//
            // The four SSN rules below use a threshold of 0 with a scorer that always returns 1,
            // so the soft check always passes and sharing the hard key is the only condition.
            AddMatches("SSN + LAST", allData, r => HardSelector(r, new FieldInclusions
            {
                SSN  = true,
                Last = true,
            }), 0, (r1, r2) =>
                       1, ref newMatches, originalNumberOfMatches);
            AddMatches("SSN + DOB", allData, r => HardSelector(r, new FieldInclusions
            {
                SSN = true,
                DOB = true,
            }), 0, (r1, r2) =>
                       1, ref newMatches, originalNumberOfMatches);
            AddMatches("SSN + PHONE", allData, r => HardSelector(r, new FieldInclusions
            {
                SSN   = true,
                Phone = true,
            }), 0, (r1, r2) =>
                       1, ref newMatches, originalNumberOfMatches);
            AddMatches("SSN + ADDRESS", allData, r => HardSelector(r, new FieldInclusions
            {
                SSN     = true,
                Address = true,
            }), 0, (r1, r2) =>
                       1, ref newMatches, originalNumberOfMatches);

            // From here on, a pair sharing the hard key must also reach the given soft-match
            // count (1 unless stated otherwise) on the secondary fields.
            AddMatches("NAME + DOB strong", allData, r => HardSelector(r, new FieldInclusions
            {
                Name = true,
                DOB  = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN     = true,
                Phone   = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);
            AddMatches("NAME + PHONE (no sr/jr)", allData, r => HardSelector(r, new FieldInclusions
            {
                Name  = true,
                Phone = true
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN = true,
                DOB = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("NAME + ADDRESS (no sr/jr)", allData, r => HardSelector(r, new FieldInclusions
            {
                Name    = true,
                Address = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN = true,
                DOB = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("DOB + PHONE (no twin)", allData, r => HardSelector(r, new FieldInclusions
            {
                First = true,
                DOB   = true,
                Phone = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN     = true,
                Last    = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("DOB + ADDRESS (no twin)", allData, r => HardSelector(r, new FieldInclusions
            {
                DOB     = true,
                Address = true
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN   = true,
                First = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("PHONE + ADDRESS (no twin)", allData, r => HardSelector(r, new FieldInclusions
            {
                Phone   = true,
                Address = true
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN   = true,
                First = true,
            }), ref newMatches, originalNumberOfMatches);


            AddMatches("SSN + soft match", allData, r => HardSelector(r, new FieldInclusions
            {
                SSN = true
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                Name    = true,
                Phone   = true,
                DOB     = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches); // Josh Code Review : Makes many of the SSN matches above redundant.

            //******************  PROBABLY SOLID MATCHES   ******************//

            AddMatches("NAME + PHONE (sr/jr)", allData, r => HardSelector(r, new FieldInclusions
            {
                Name  = true,
                Phone = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                Address = true,
            }), ref newMatches, originalNumberOfMatches);
            AddMatches("NAME + ADDRESS (sr/jr)", allData, r => HardSelector(r, new FieldInclusions
            {
                Name    = true,
                Address = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                Phone = true,
            }), ref newMatches, originalNumberOfMatches);
            AddMatches("DOB + PHONE (twin)", allData, r => HardSelector(r, new FieldInclusions
            {
                DOB   = true,
                Phone = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                Last    = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);
            AddMatches("DOB + ADDRESS (twin)", allData, r => HardSelector(r, new FieldInclusions
            {
                Address = true,
                DOB     = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                Last  = true,
                Phone = true,
            }), ref newMatches, originalNumberOfMatches);
            AddMatches("PHONE + ADDRESS (twin)", allData, r => HardSelector(r, new FieldInclusions
            {
                Phone   = true,
                Address = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                Last = true,
            }), ref newMatches, originalNumberOfMatches);


            // The two rules below ("Name + 2 soft", "DOB + 2 soft") are disabled and kept for reference only.
            //AddMatches("Name + 2 soft", allData, r => HardSelector(r, new FieldInclusions
            //{
            //    Name = true,
            //}), 2, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            //{
            //    SSN = true,
            //    DOB = true,
            //    Phone = true,
            //    Address = true,
            //}), ref newMatches, originalNumberOfMatches);
            //AddMatches("DOB + 2 soft", allData, r => HardSelector(r, new FieldInclusions
            //{
            //    DOB = true,
            //}), 2, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            //{
            //    SSN = true,
            //    First = true,
            //    Phone = true,
            //    Address = true,
            //}), ref newMatches, originalNumberOfMatches);
            // These two rules are the only active ones that require a soft-match count of 2.
            AddMatches("Phone + 2 soft", allData, r => HardSelector(r, new FieldInclusions
            {
                Phone = true,
            }), 2, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN     = true,
                First   = true,
                DOB     = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);
            AddMatches("Address + 2 soft", allData, r => HardSelector(r, new FieldInclusions
            {
                Address = true,
            }), 2, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN   = true,
                First = true,
                DOB   = true,
                Phone = true,
            }), ref newMatches, originalNumberOfMatches);  //Josh code review : This could match on address, first name, and DOB.  Maybe it should go in the weaker matches category?

            ////******************  WEAKER MATCHES   ******************//

            // The commented-out "weak" / weakerMatchedIDs lines in this section are remnants of
            // collecting the list returned by AddMatches; the return value is currently discarded.
            //List<int> weakerMatchedIDs = new List<int>();
            //var weak =
            AddMatches("PHONE + soft SSN", allData, r => HardSelector(r, new FieldInclusions
            {
                Phone = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN = true,
            }), ref newMatches, originalNumberOfMatches);
            //weakerMatchedIDs.AddRange(weak);

            // NOTE(review): the three "weaker" NAME rules score on the SSNSoft flag rather than SSN;
            // how SSNSoft is compared is defined elsewhere - confirm.
            //weak =
            AddMatches("NAME + DOB weaker", allData, r => HardSelector(r, new FieldInclusions
            {
                Name = true,
                DOB  = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSNSoft = true,
            }), ref newMatches, originalNumberOfMatches);
            //weakerMatchedIDs.AddRange(weak);

            //weak =
            AddMatches("NAME + PHONE weaker", allData, r => HardSelector(r, new FieldInclusions
            {
                Name  = true,
                Phone = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSNSoft = true,
            }), ref newMatches, originalNumberOfMatches);
            //weakerMatchedIDs.AddRange(weak);

            //weak =
            AddMatches("NAME + ADDRESS weaker", allData, r => HardSelector(r, new FieldInclusions
            {
                Name    = true,
                Address = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSNSoft = true,
            }), ref newMatches, originalNumberOfMatches);
            //weakerMatchedIDs.AddRange(weak);

            //weak =
            AddMatches("PHONE + soft NAME", allData, r => HardSelector(r, new FieldInclusions
            {
                Phone = true
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                Name = true,
            }), ref newMatches, originalNumberOfMatches);
            //weakerMatchedIDs.AddRange(weak);

            //weak =
            AddMatches("PHONE + soft FIRST/DOB", allData, r => HardSelector(r, new FieldInclusions
            {
                Phone = true
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                First = true,
                DOB   = true,
            }), ref newMatches, originalNumberOfMatches);
            //weakerMatchedIDs.AddRange(weak);

            ////******************  NEW SUPER-WEAK MATCHES   ******************//

            AddMatches("FIRST + EMAIL + soft", allData, r => HardSelector(r, new FieldInclusions
            {
                Email = true,
                First = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN   = true,
                DOB   = true,
                Phone = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("EMAIL + soft", allData, r => HardSelector(r, new FieldInclusions
            {
                Email = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN   = true,
                DOB   = true,
                Phone = true,
            }), ref newMatches, originalNumberOfMatches);

            // Same hard key as the earlier "SSN + soft match" rule; the soft set here uses First where that rule uses Name.
            AddMatches("SSN + soft", allData, r => HardSelector(r, new FieldInclusions
            {
                SSN = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                First   = true,
                DOB     = true,
                Phone   = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("NAME + soft", allData, r => HardSelector(r, new FieldInclusions
            {
                Name = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN     = true,
                DOB     = true,
                Phone   = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("Phone + soft", allData, r => HardSelector(r, new FieldInclusions
            {
                Phone = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN     = true,
                Name    = true,
                DOB     = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("DOB + soft", allData, r => HardSelector(r, new FieldInclusions
            {
                DOB = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN     = true,
                Name    = true,
                Phone   = true,
                Address = true,
            }), ref newMatches, originalNumberOfMatches);

            AddMatches("Address + soft", allData, r => HardSelector(r, new FieldInclusions
            {
                Address = true,
            }), 1, (r1, r2) => SoftMatchCount(r1, r2, new FieldInclusions
            {
                SSN   = true,
                Name  = true,
                DOB   = true,
                Phone = true,
            }), ref newMatches, originalNumberOfMatches);


            Console.WriteLine("Done!");
            // Dump the accumulated log lines (pass headers plus any printed pairs) only when printing is enabled.
            if (_printActuals)
            {
                File.WriteAllLines("newMatches.txt", _newMatchingRows);
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Groups <paramref name="data"/> by <paramref name="groupingValue"/> and, inside every
        /// eligible group, offers each ordered pair of distinct rows to <paramref name="matches"/>
        /// when the pair's soft score reaches <paramref name="softEqualsCount"/>.
        /// </summary>
        /// <remarks>
        /// Groups are skipped when they hold more than 100 rows (the key is printed if
        /// <c>_printLargeGroupValues</c> is set), fewer than 2 rows, or have the key "BADFORMAT".
        /// Each unordered pair is visited twice, once per ordering, so the rejected-pair counter
        /// (printed as "Groups thrown out") counts orderings rather than groups.
        /// The "Match modified" figure is always 0 because its counter is never incremented.
        /// A summary is written to the console and the remaining-row count is printed at the end.
        /// </remarks>
        /// <param name="label">Name of the pass; echoed to the console and to <c>_newMatchingRows</c>.</param>
        /// <param name="data">Rows to group and compare.</param>
        /// <param name="groupingValue">Produces the grouping key for a row.</param>
        /// <param name="softEqualsCount">Minimum score a pair needs to be accepted.</param>
        /// <param name="softEquals">Scores a pair of rows.</param>
        /// <param name="matches">Accumulator receiving accepted pairs via <c>AddMatch</c>.</param>
        /// <param name="originalNumberOfMatches">Baseline subtracted when printing the cumulative total.</param>
        /// <returns>A list that is never populated, i.e. always empty.</returns>
        public List <int> AddMatches(string label, Row[] data, Func <Row, string> groupingValue, int softEqualsCount, Func <Row, Row, int> softEquals, ref ClosedSets matches, int originalNumberOfMatches)
        {
            _newMatchingRows.Add($"STARTING {label} MATCHES");

            // Nothing is ever added to this list; it is returned empty.
            List <int> collectedIds = new List <int>();

            Console.WriteLine();
            Console.WriteLine(label);

            int rejectedPairs = 0;
            int acceptedPairs = 0;
            int modifiedPairs = 0; // never incremented; printed as-is below

            foreach (var bucket in data.GroupBy(groupingValue))
            {
                int bucketSize = bucket.Count();

                // Oversized groups are skipped, optionally reporting their key.
                if (bucketSize > 100)
                {
                    if (_printLargeGroupValues)
                    {
                        Console.WriteLine(bucket.Key);
                    }
                    continue;
                }

                // Nothing to pair up, or the key marks unentered / cleaned data.
                if (bucketSize < 2 || bucket.Key == "BADFORMAT")
                {
                    continue;
                }

                // Walk every ordered pair of distinct rows within the group.
                foreach (Row left in bucket)
                {
                    foreach (Row right in bucket)
                    {
                        if (right != left)
                        {
                            if (softEquals(left, right) < softEqualsCount)
                            {
                                // Score too low: count the rejection and optionally show the pair.
                                rejectedPairs++;

                                if (_printErrors)
                                {
                                    PrintingLibrary.PrintPair(left, right);
                                }
                            }
                            else if (matches.AddMatch(left, right))
                            {
                                // Counted only when AddMatch returns true.
                                acceptedPairs++;

                                if (_printActuals)
                                {
                                    _newMatchingRows.Add(left.ToString());
                                    _newMatchingRows.Add(right.ToString());
                                    _newMatchingRows.Add("");
                                }
                            }
                        }
                    }
                }
            }

            Console.WriteLine($"Groups thrown out: {rejectedPairs}");
            Console.WriteLine($"Match added: {acceptedPairs}");
            Console.WriteLine($"Match modified: {modifiedPairs}");
            Console.WriteLine($"Cumulative Matches Found: {matches.NumberOfMatches - originalNumberOfMatches}");

            PrintingLibrary.PrintRemainingRowCount(data, matches);

            return collectedIds;
        }