Пример #1
0
        /// <summary>
        /// Creates a <c>Matches</c> object whose <c>MatchArray</c> has <paramref name="n"/> slots,
        /// each one initialised with its own empty list.
        /// </summary>
        /// <param name="n">Number of slots to allocate in <c>MatchArray</c>.</param>
        /// <returns>The freshly created object.</returns>
        public static Matches NewMatches(int n)
        {
            var result = new Matches();
            result.MatchArray = new List<IndexDistancePair>[n];

            // Give every slot its own list so callers can append without null checks.
            var slots = result.MatchArray;
            int slot = 0;
            while (slot < n)
            {
                slots[slot] = new List<IndexDistancePair>();
                slot++;
            }

            return result;
        }
Пример #2
0
        /// <summary>
        /// Collects pairs of input strings whose edit distance is at most <paramref name="n"/>.
        /// Entries produced by <c>DeleteN</c> for every string are grouped by their <c>Substring</c>;
        /// only entries that land in the same group are compared with <c>EditDistance</c>.
        /// </summary>
        /// <param name="strings">Strings to compare. The array position of each string is passed to <c>DeleteN</c>.</param>
        /// <param name="n">Largest distance that is still recorded as a match.</param>
        /// <returns>The collected matches, after <c>Clean</c> has been run on them.</returns>
        public override Matches DistanceAtMostN(string[] strings, int n)
        {
            int total = strings.Length;
            Matches result = MatchesEngine.NewMatches(total);

            // Seed the result: each string matches itself at distance 0.
            for (int self = 0; self < total; self++)
            {
                result.AddMatch(self, self, 0);
            }

            Console.WriteLine("Creating the neighborhoods");
            var candidates = new List<EditDistanceMatchObject>();
            for (int source = 0; source < total; source++)
            {
                Console.Write($"\r{source}/{strings.Length} neighborhoods computed");
                candidates.AddRange(DeleteN(strings[source], source, n));
            }
            Console.WriteLine();

            Console.WriteLine("Grouping by neighborhood");
            var buckets = candidates.GroupBy(candidate => candidate.Substring).ToArray();

            Console.WriteLine("Checking edit distance");
            for (int bucketNo = 0; bucketNo < buckets.Length; bucketNo++)
            {
                // The progress figure is the zero-based number of the bucket being processed.
                Console.Write($"\r{bucketNo}/{buckets.Length} edit distance groups checked");
                var members = buckets[bucketNo].ToArray();

                // Compare every unordered pair inside the bucket exactly once.
                for (int a = 0; a < members.Length; a++)
                {
                    for (int b = a + 1; b < members.Length; b++)
                    {
                        int ed = EditDistance(members[a], members[b]);
                        if (ed > n)
                        {
                            continue;
                        }

                        result.AddMatch(members[a].Index, members[b].Index, ed);
                    }
                }
            }
            Console.WriteLine();

            Console.WriteLine("Cleaning string match object");
            result.Clean();

            return result;
        }
Пример #3
0
        /// <summary>
        /// Records a match between entries <paramref name="i"/> and <paramref name="j"/> in both directions.
        /// </summary>
        /// <param name="matches">Object whose <c>MatchArray</c> receives the new entries.</param>
        /// <param name="i">Slot of the first entry.</param>
        /// <param name="j">Slot of the second entry.</param>
        /// <param name="distance">Distance stored with both entries.</param>
        public static void AddMatch(this Matches matches, int i, int j, int distance)
        {
            var lists = matches.MatchArray;

            // First the two directed registrations...
            matches.AddDirectedMatch(i, j, distance);
            matches.AddDirectedMatch(j, i, distance);

            // ...then one explicit entry per direction appended straight to the lists.
            // NOTE(review): if AddDirectedMatch already appends to MatchArray, each direction is
            // recorded twice here (Clean would collapse the duplicates later) - confirm intent.
            var towardJ = new IndexDistancePair { Index = j, Distance = distance };
            lists[i].Add(towardJ);

            var towardI = new IndexDistancePair { Index = i, Distance = distance };
            lists[j].Add(towardI);
        }
Пример #4
0
        /// <summary>
        /// For every string in <paramref name="T"/>, queries a BK-tree built over <paramref name="S"/>
        /// for strings within distance <paramref name="n"/> and records each hit as a directed match
        /// from the index in <paramref name="S"/> to the index in <paramref name="T"/>.
        /// </summary>
        /// <param name="S">Strings the BK-tree is built from; the result has one slot per element.</param>
        /// <param name="T">Strings to look up; processed in parallel.</param>
        /// <param name="n">Distance limit passed to the BK-tree query.</param>
        /// <returns>The matches object populated through <c>AddDirectedMatch</c>.</returns>
        public static Matches EditDistanceAtMostN(string[] S, string[] T, int n)
        {
            Matches toReturn = MatchesEngine.NewMatches(S.Length);

            //Create BKTree
            var bkTree = BKTreeEngine.CreateBKTree(S.ToList());

            //Create lookup table. If a string occurs more than once in S, the last position wins.
            Dictionary<string, int> stringToInt = new Dictionary<string, int>();
            for (int i = 0; i < S.Length; i++)
            {
                stringToInt[S[i]] = i;
            }

            int    c         = 0;
            object cLock     = new object();  // guards the progress counter and its console output
            object matchLock = new object();  // guards mutation of toReturn

            Parallel.For(0, T.Length, j =>
            {
                lock (cLock)
                {
                    Console.Write($"\r{c++}/{T.Length}");
                }

                var neighbors = BKTreeEngine.EditDistanceAtMostN(T[j], bkTree, n);
                foreach (var neighbor in neighbors)
                {
                    int i = stringToInt[neighbor];

                    // Match insertion uses its own lock so it no longer contends with progress reporting.
                    // NOTE(review): the distance computation could move outside the lock if
                    // EditDistanceEngine.Compute is safe to call concurrently - confirm before hoisting.
                    lock (matchLock)
                    {
                        toReturn.AddDirectedMatch(i, j, EditDistanceEngine.Compute(T[j], neighbor));
                    }
                }
            });

            return toReturn;
        }
Пример #5
0
        /// <summary>
        /// Replaces every list in <c>MatchArray</c> with a list that holds at most one entry per
        /// <c>Index</c>. When an index occurs once, the existing entry is kept; when it occurs
        /// several times, a new entry carrying the smallest <c>Distance</c> of the group is created.
        /// </summary>
        /// <param name="matches">Object whose <c>MatchArray</c> lists are rewritten in place.</param>
        public static void Clean(this Matches matches)
        {
            var lists = matches.MatchArray;

            for (int slot = 0; slot < lists.Length; slot++)
            {
                lists[slot] = lists[slot]
                    .GroupBy(entry => entry.Index)
                    .Select(sameIndex => sameIndex.Count() == 1
                        ? sameIndex.First()
                        : new IndexDistancePair
                          {
                              Index = sameIndex.Key,
                              Distance = sameIndex.Min(entry => entry.Distance)
                          })
                    .ToList();
            }
        }
Пример #6
0
        /// <summary>
        /// Returns the list stored in slot <paramref name="i"/> of <c>MatchArray</c>.
        /// The list itself is returned, not a copy.
        /// </summary>
        /// <param name="matches">Object to read from.</param>
        /// <param name="i">Slot to look up.</param>
        /// <returns>The list held in that slot.</returns>
        public static List<IndexDistancePair> Neighbors(this Matches matches, int i)
            => matches.MatchArray[i];
Пример #7
0
        /// <summary>
        /// Tells whether slot <paramref name="i"/> of <c>MatchArray</c> contains an entry whose
        /// <c>Index</c> equals <paramref name="j"/>.
        /// </summary>
        /// <param name="matches">Object to search.</param>
        /// <param name="i">Slot whose list is scanned.</param>
        /// <param name="j">Index value to look for.</param>
        /// <returns><c>true</c> if such an entry exists; otherwise <c>false</c>.</returns>
        public static bool HasMatch(this Matches matches, int i, int j)
        {
            foreach (var candidate in matches.MatchArray[i])
            {
                if (candidate.Index == j)
                {
                    return true;
                }
            }

            return false;
        }
Пример #8
0
 /// <summary>
 /// Extension-method shorthand that hands <paramref name="matches"/> and <paramref name="path"/>
 /// to <c>Serializer.Serialize</c>.
 /// </summary>
 /// <param name="matches">Object to pass to the serializer.</param>
 /// <param name="path">Path argument forwarded unchanged.</param>
 public static void Serialize(this Matches matches, string path)
     => Serializer.Serialize(matches, path);
Пример #9
0
        /// <summary>
        /// For every eid, counts across all supplied per-field match objects how many times each
        /// other eid shows up as a neighbor, and records a match whenever that count reaches
        /// <paramref name="n"/>. The count itself is stored as the match's distance value.
        /// </summary>
        /// <param name="eids">Eids to process; the result has <c>eids.Max() + 1</c> slots.</param>
        /// <param name="matchObjectsForFields">One match object per field, each mapping eids to indices and back.</param>
        /// <param name="n">Minimum neighbor count required for a pair to be recorded.</param>
        /// <param name="allData">Not read by this method; kept so existing callers keep compiling.</param>
        /// <returns>The collected matches, after <c>Clean</c> has been run on them.</returns>
        public static Matches FuzzyMatchOnNImportantFields(int[] eids, List<RowMatchObject> matchObjectsForFields, int n, Row[] allData)
        {
            int     maxEid   = eids.Max();
            Matches toReturn = MatchesEngine.NewMatches(maxEid + 1);

            int c = 0;

            // Scratch counters indexed by eid; every touched entry is reset to 0 before the next eid.
            int[]      eidToMatchCount = new int[maxEid + 1];
            List<int>  usedEids        = new List<int>();

            foreach (int eid in eids)
            {
                usedEids.Clear();
                Console.Write($"\r{c++}/{eids.Length} Final Row Matches");
                foreach (var matchObject in matchObjectsForFields)
                {
                    int index = matchObject.EidToIndex[eid];
                    if (index == -1)
                    {
                        continue;
                    }

                    var neighborIndices = matchObject.Matches.Neighbors(index);
                    foreach (var neighborIndex in neighborIndices)
                    {
                        var neighborEids = matchObject.IndexToEids[neighborIndex.Index];
                        foreach (var neighborEid in neighborEids)
                        {
                            if (eid > neighborEid)  //We will do the exact same computation when we find all the matches for neighborEID.
                            {
                                continue;
                            }

                            // Remember each neighbor only once. A repeated entry would be visited again
                            // after its counter had been reset and, for n <= 0, be re-added with count 0.
                            if (eidToMatchCount[neighborEid]++ == 0)
                            {
                                usedEids.Add(neighborEid);
                            }
                        }
                    }
                }

                foreach (var usedEid in usedEids)
                {
                    if (eidToMatchCount[usedEid] >= n)
                    {
                        toReturn.AddMatch(eid, usedEid, eidToMatchCount[usedEid]);  //One way of recording the NUMBER of fuzzy matches
                    }

                    eidToMatchCount[usedEid] = 0;
                }
            }

            Console.WriteLine("\nCleaning Two Field Fuzzy Match Object");
            toReturn.Clean();  //Possibly redundant given how matches are staged above; kept as a safety net.
            Console.WriteLine("Done Cleaning");
            return toReturn;
        }
Пример #10
0
        /// <summary>
        /// Bipartite variant: records directed matches from entries of <paramref name="S"/> to entries of
        /// <paramref name="T"/> whose edit distance is at most <paramref name="n"/>. Entries produced by
        /// <c>DeleteN</c> for both inputs are grouped by <c>Substring</c>, and only S/T pairs that share a
        /// group are compared with <c>EditDistance</c>.
        /// NOTE(review): an earlier comment said the indices of T are offset by S.Length, but no offset
        /// is applied anywhere in this method - confirm which is intended.
        /// </summary>
        /// <param name="S">First part; the result has one slot per element.</param>
        /// <param name="T">Second part.</param>
        /// <param name="n">Largest distance that is still recorded as a match.</param>
        /// <returns>The collected matches, after <c>Clean</c> has been run on them.</returns>
        public static Matches EditDistanceAtMostN(string[] S, string[] T, int n)
        {
            Matches result = MatchesEngine.NewMatches(S.Length);

            Console.WriteLine("Creating the neighborhoods");
            var tagged = new List<BipartiteEditDistanceMatchObject>();

            // Part 0: entries derived from S.
            for (int sPos = 0; sPos < S.Length; sPos++)
            {
                Console.Write($"\r{sPos}/{S.Length} S neighborhoods computed");
                foreach (var entry in DeleteN(S[sPos], sPos, n))
                {
                    var wrapped = new BipartiteEditDistanceMatchObject { EditDistanceMatchObject = entry, Part = 0 };
                    tagged.Add(wrapped);
                }
            }

            // Part 1: entries derived from T.
            for (int tPos = 0; tPos < T.Length; tPos++)
            {
                Console.Write($"\r{tPos}/{T.Length} T neighborhoods computed");
                foreach (var entry in DeleteN(T[tPos], tPos, n))
                {
                    var wrapped = new BipartiteEditDistanceMatchObject { EditDistanceMatchObject = entry, Part = 1 };
                    tagged.Add(wrapped);
                }
            }

            Console.WriteLine();

            Console.WriteLine("Grouping by neighborhood");
            var buckets = tagged.GroupBy(item => item.EditDistanceMatchObject.Substring).ToArray();

            Console.WriteLine("Checking edit distance");
            for (int bucketNo = 0; bucketNo < buckets.Length; bucketNo++)
            {
                var bucket = buckets[bucketNo];
                var fromS = bucket.Where(item => item.Part == 0).Select(item => item.EditDistanceMatchObject).ToArray();
                var fromT = bucket.Where(item => item.Part == 1).Select(item => item.EditDistanceMatchObject).ToArray();

                Console.Write($"\r{bucketNo}/{buckets.Length} edit distance groups checked");

                // Only cross-part pairs are compared; S-S and T-T pairs are never considered.
                for (int a = 0; a < fromS.Length; a++)
                {
                    for (int b = 0; b < fromT.Length; b++)
                    {
                        int ed = EditDistance(fromS[a], fromT[b]);
                        if (ed > n)
                        {
                            continue;
                        }

                        result.AddDirectedMatch(fromS[a].Index, fromT[b].Index, ed);
                    }
                }
            }
            Console.WriteLine();

            Console.WriteLine("Cleaning string match object");
            result.Clean();

            return result;
        }
Пример #11
0
        /// <summary>
        /// Parses every input string as a date and records pairs of dates that are identical or that
        /// differ by one of the recognised slips: day and month swapped, or exactly one of day, month
        /// or year differing in a way accepted by the <c>MatchingManager</c> checks.
        /// Identical dates are recorded with distance 0, every other recorded pair with distance 1.
        /// NOTE(review): <paramref name="n"/> is never consulted, so distance-1 pairs are recorded
        /// even when n is 0 - confirm whether that is intended.
        /// </summary>
        /// <param name="strings">Date strings; each is parsed with <c>DateTime.Parse</c> using the current culture.</param>
        /// <param name="n">Currently unused.</param>
        /// <returns>The collected matches, after <c>Clean</c> has been run on them.</returns>
        public override Matches DistanceAtMostN(string[] strings, int n)
        {
            DateTime[] dates = strings.Select(s => DateTime.Parse(s)).ToArray();

            Matches toReturn = MatchesEngine.NewMatches(dates.Length);

            var dateIndices = dates.Select((d, i) => new DateIndex {
                Date = d, Index = i
            }).ToArray();

            //Day month transpositions. Dates in one group share the year and the unordered {day, month}
            //pair, so two members are either the same date or each other's day/month swap.
            var groupedByYearAndNormalizedDateTime = dateIndices.GroupBy(d => System.Math.Min(d.Date.Day, d.Date.Month) + "/" + System.Math.Max(d.Date.Day, d.Date.Month) + "/" + d.Date.Year);

            foreach (var group in groupedByYearAndNormalizedDateTime)
            {
                var groupArray = group.ToArray();

                for (int i = 0; i < groupArray.Length; i++)
                {
                    // j starts at i so that every date is also matched with itself at distance 0.
                    for (int j = i; j < groupArray.Length; j++)
                    {
                        int distance = groupArray[i].Date == groupArray[j].Date ? 0 : 1;
                        toReturn.AddMatch(groupArray[i].Index, groupArray[j].Index, distance);
                    }
                }
            }

            //Day transposed, or off by one, or off by 1 digit
            AddNearMissMatches(
                toReturn,
                dateIndices,
                d => d.Date.Month + "/" + d.Date.Year,
                d => d.Date.Day,
                (day1, day2) => MatchingManager.OneOrOneDigit(day1, day2) || MatchingManager.TransposedDigit(day1, day2));

            //Month transposed, or off by one, or off by 1 digit
            AddNearMissMatches(
                toReturn,
                dateIndices,
                d => d.Date.Day + "/" + d.Date.Year,
                d => d.Date.Month,
                (month1, month2) => MatchingManager.OneOrOneDigit(month1, month2) || MatchingManager.TransposedDigit(month1, month2));

            //Year has transposed digit or is off by one or one digit, or is off by 100
            AddNearMissMatches(
                toReturn,
                dateIndices,
                d => d.Date.Day + "/" + d.Date.Month,
                d => d.Date.Year,
                (year1, year2) => MatchingManager.OneOrOneDigit(year1, year2) || MatchingManager.TransposedDigit(year1, year2) || MatchingManager.OffBy100(year1, year2));

            toReturn.Clean();

            return toReturn;
        }

        /// <summary>
        /// Groups the dates by <paramref name="groupKey"/> and, inside each group, records a distance-1
        /// match for every pair whose <paramref name="component"/> values satisfy <paramref name="isNearMiss"/>.
        /// </summary>
        private static void AddNearMissMatches(
            Matches matches,
            DateIndex[] dateIndices,
            Func<DateIndex, string> groupKey,
            Func<DateIndex, int> component,
            Func<int, int, bool> isNearMiss)
        {
            foreach (var group in dateIndices.GroupBy(groupKey))
            {
                var groupArray = group.ToArray();
                for (int i = 0; i < groupArray.Length; i++)
                {
                    for (int j = i + 1; j < groupArray.Length; j++)
                    {
                        if (isNearMiss(component(groupArray[i]), component(groupArray[j])))
                        {
                            matches.AddMatch(groupArray[i].Index, groupArray[j].Index, 1);
                        }
                    }
                }
            }
        }