Esempio n. 1
0
        /// <summary>
        /// Finds the pairs of input strings whose edit distance is at most <paramref name="n"/>.
        /// </summary>
        /// <remarks>
        /// Candidate pairs are found by bucketing rather than comparing all pairs: every string
        /// contributes the entries returned by <c>DeleteN</c>, the entries are grouped by their
        /// <c>Substring</c>, and only entries that share a group are passed to <c>EditDistance</c>.
        /// Progress is reported on the console.
        /// </remarks>
        /// <param name="strings">Strings to compare; a string's position in this array is the index recorded in the result.</param>
        /// <param name="n">Largest edit distance that is still recorded as a match.</param>
        /// <returns>The recorded matches, including a distance-0 match of every index with itself.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="strings"/> is null.</exception>
        public override Matches DistanceAtMostN(string[] strings, int n)
        {
            // Explicit guard keeps the exception type the LINQ Count() call used to raise for null input.
            if (strings == null)
            {
                throw new ArgumentNullException(nameof(strings));
            }

            Matches toReturn = MatchesEngine.NewMatches(strings.Length);

            //Every string matches itself
            for (int i = 0; i < strings.Length; i++)
            {
                toReturn.AddMatch(i, i, 0);
            }

            Console.WriteLine("Creating the neighborhoods");
            List<EditDistanceMatchObject> neighborHood = new List<EditDistanceMatchObject>();

            for (int i = 0; i < strings.Length; i++)
            {
                neighborHood.AddRange(DeleteN(strings[i], i, n));

                // Report the number of completed items so the final line reads N/N.
                Console.Write($"\r{i + 1}/{strings.Length} neighborhoods computed");
            }
            Console.WriteLine();

            Console.WriteLine("Grouping by neighborhood");
            var grouped = neighborHood.GroupBy(edmo => edmo.Substring).ToArray();

            Console.WriteLine("Checking edit distance");
            int checkedGroups = 0;

            foreach (var group in grouped)
            {
                var candidates = group.ToArray();

                // Only entries that share a group are compared with each other.
                for (int i = 0; i < candidates.Length; i++)
                {
                    for (int j = i + 1; j < candidates.Length; j++)
                    {
                        int ed = EditDistance(candidates[i], candidates[j]);
                        if (ed <= n)
                        {
                            toReturn.AddMatch(candidates[i].Index, candidates[j].Index, ed);
                        }
                    }
                }

                Console.Write($"\r{++checkedGroups}/{grouped.Length} edit distance groups checked");
            }
            Console.WriteLine();

            // The same pair can be reached through several groups, so AddMatch may have been
            // called more than once for it before this point.
            Console.WriteLine("Cleaning string match object");
            toReturn.Clean();

            return toReturn;
        }
Esempio n. 2
0
        /// <summary>
        /// Records a match between two eids when the second is reached as a neighbor of the first
        /// at least <paramref name="n"/> times across the supplied per-field match objects.
        /// </summary>
        /// <remarks>
        /// Each pair is evaluated once, from the smaller eid's side. The value stored with a match
        /// is the hit count itself, not a distance. Progress is reported on the console.
        /// </remarks>
        /// <param name="eids">The eids to process. An empty array yields an empty result.</param>
        /// <param name="matchObjectsForFields">One match object per field, providing eid-to-index and index-to-eids lookups plus the field's matches.</param>
        /// <param name="n">Minimum number of neighbor hits required to record a match.</param>
        /// <param name="allData">Not used by the current implementation; kept so existing callers still compile.</param>
        /// <returns>The recorded matches, sized to hold eids from 0 through the largest eid in <paramref name="eids"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="eids"/> is null.</exception>
        public static Matches FuzzyMatchOnNImportantFields(int[] eids, List <RowMatchObject> matchObjectsForFields, int n, Row[] allData)
        {
            // Explicit guard keeps the exception type Enumerable.Max used to raise for null input.
            if (eids == null)
            {
                throw new ArgumentNullException(nameof(eids));
            }

            // Enumerable.Max throws on an empty sequence; -1 makes every eid-indexed structure below empty instead.
            int     maxEid   = eids.Length == 0 ? -1 : eids.Max();
            Matches toReturn = MatchesEngine.NewMatches(maxEid + 1);

            // Scratch counters indexed by eid. Only the slots listed in touchedEids are non-zero,
            // so resetting after each eid costs time proportional to the neighbors actually seen.
            int[]     eidToMatchCount = new int[maxEid + 1];
            List<int> touchedEids     = new List<int>();

            int processed = 0;

            foreach (int eid in eids)
            {
                touchedEids.Clear();

                foreach (var matchObject in matchObjectsForFields)
                {
                    // NOTE(review): -1 appears to mean this eid has no entry for the field - confirm against RowMatchObject.
                    int index = matchObject.EidToIndex[eid];
                    if (index == -1)
                    {
                        continue;
                    }

                    foreach (var neighbor in matchObject.Matches.Neighbors(index))
                    {
                        foreach (var neighborEid in matchObject.IndexToEids[neighbor.Index])
                        {
                            if (eid > neighborEid)  //We will do the exact same computation when we find all the matches for neighborEID.
                            {
                                continue;
                            }

                            // Remember each neighbor only the first time it is hit, so it is evaluated exactly once below.
                            if (eidToMatchCount[neighborEid]++ == 0)
                            {
                                touchedEids.Add(neighborEid);
                            }
                        }
                    }
                }

                foreach (int touchedEid in touchedEids)
                {
                    int hits = eidToMatchCount[touchedEid];
                    if (hits >= n)
                    {
                        toReturn.AddMatch(eid, touchedEid, hits);  //One way of recording the NUMBER of fuzzy matches
                    }

                    // Leave the scratch array all-zero for the next eid.
                    eidToMatchCount[touchedEid] = 0;
                }

                // Report the number of completed items so the final line reads N/N.
                Console.Write($"\r{++processed}/{eids.Length} Final Row Matches");
            }

            Console.WriteLine("\nCleaning Two Field Fuzzy Match Object");
            toReturn.Clean();  //I think I've actually staged things in a way that makes this unnecessary
            Console.WriteLine("Done Cleaning");
            return toReturn;
        }
Esempio n. 3
0
        /// <summary>
        /// Matches dates that are identical or that differ by a single typo-like change in one component.
        /// </summary>
        /// <remarks>
        /// Recorded matches: every date with itself and with equal dates (distance 0), and, with
        /// distance 1, pairs whose day and month are swapped, or that agree on two of day/month/year
        /// while the third passes the <c>MatchingManager</c> checks used below.
        /// </remarks>
        /// <param name="strings">Date strings, parsed with <see cref="DateTime.Parse(string)"/>; a string's position in this array is the index recorded in the result.</param>
        /// <param name="n">
        /// Not read by this implementation.
        /// NOTE(review): distance-1 matches are recorded even when n is 0 - confirm callers expect that.
        /// </param>
        /// <returns>The recorded matches after <c>Clean()</c> has been applied.</returns>
        public override Matches DistanceAtMostN(string[] strings, int n)
        {
            // NOTE(review): DateTime.Parse(string) uses the current culture, so how day and month
            // are read depends on the machine's settings - confirm this is intended.
            var dateIndices = strings.Select((s, i) => new DateIndex {
                Date = DateTime.Parse(s), Index = i
            }).ToArray();

            Matches toReturn = MatchesEngine.NewMatches(dateIndices.Length);

            //Day month transpositions
            //The key orders day and month, so a date and its day/month-swapped twin share a group.
            var groupedByYearAndNormalizedDateTime = dateIndices.GroupBy(d => System.Math.Min(d.Date.Day, d.Date.Month) + "/" + System.Math.Max(d.Date.Day, d.Date.Month) + "/" + d.Date.Year);

            foreach (var group in groupedByYearAndNormalizedDateTime)
            {
                var members = group.ToArray();

                for (int i = 0; i < members.Length; i++)
                {
                    //j starts at i so every date is also matched with itself at distance 0.
                    for (int j = i; j < members.Length; j++)
                    {
                        int distance = members[i].Date == members[j].Date ? 0 : 1;
                        toReturn.AddMatch(members[i].Index, members[j].Index, distance);
                    }
                }
            }

            //Day transposed, or off by one, or off by 1 digit
            AddDistanceOnePairs(
                toReturn,
                dateIndices,
                d => d.Date.Month + "/" + d.Date.Year,
                (a, b) => MatchingManager.OneOrOneDigit(a.Date.Day, b.Date.Day) || MatchingManager.TransposedDigit(a.Date.Day, b.Date.Day));

            //Month transposed, or off by one, or off by 1 digit
            AddDistanceOnePairs(
                toReturn,
                dateIndices,
                d => d.Date.Day + "/" + d.Date.Year,
                (a, b) => MatchingManager.OneOrOneDigit(a.Date.Month, b.Date.Month) || MatchingManager.TransposedDigit(a.Date.Month, b.Date.Month));

            //Year has transposed digit or is off by one or one digit, or is off by 100
            AddDistanceOnePairs(
                toReturn,
                dateIndices,
                d => d.Date.Day + "/" + d.Date.Month,
                (a, b) => MatchingManager.OneOrOneDigit(a.Date.Year, b.Date.Year) || MatchingManager.TransposedDigit(a.Date.Year, b.Date.Year) || MatchingManager.OffBy100(a.Date.Year, b.Date.Year));

            toReturn.Clean();

            return toReturn;
        }

        /// <summary>
        /// Groups the dates by <paramref name="groupKey"/> and records a distance-1 match for every
        /// pair within a group that satisfies <paramref name="isNearMiss"/>.
        /// </summary>
        private static void AddDistanceOnePairs(Matches matches, DateIndex[] dateIndices, Func<DateIndex, string> groupKey, Func<DateIndex, DateIndex, bool> isNearMiss)
        {
            foreach (var group in dateIndices.GroupBy(groupKey))
            {
                var members = group.ToArray();
                for (int i = 0; i < members.Length; i++)
                {
                    for (int j = i + 1; j < members.Length; j++)
                    {
                        if (isNearMiss(members[i], members[j]))
                        {
                            matches.AddMatch(members[i].Index, members[j].Index, 1);
                        }
                    }
                }
            }
        }