/// <summary>
/// Builds a <see cref="Matches"/> whose <c>MatchArray</c> has <paramref name="n"/> slots,
/// each initialised with its own empty list.
/// </summary>
/// <param name="n">Number of slots to allocate in <c>MatchArray</c>.</param>
/// <returns>The newly created <see cref="Matches"/> instance.</returns>
public static Matches NewMatches(int n)
{
    var slots = new List<IndexDistancePair>[n];
    for (int slot = 0; slot < slots.Length; slot++)
    {
        // Every slot gets a distinct list so entries can be appended independently.
        slots[slot] = new List<IndexDistancePair>();
    }

    return new Matches { MatchArray = slots };
}
/// <summary>
/// Records every pair of input strings whose <c>EditDistance</c> is at most <paramref name="n"/>.
/// Candidate pairs are limited to entries produced by <c>DeleteN</c> that share the same
/// <c>Substring</c> value; each candidate pair is then checked with <c>EditDistance</c>.
/// Progress is written to the console.
/// </summary>
/// <param name="strings">The strings to compare; positions in this array are the indices recorded in the result.</param>
/// <param name="n">Maximum distance (inclusive) for a pair to be recorded; also forwarded to <c>DeleteN</c>.</param>
/// <returns>The match object after <c>Clean()</c> has been applied.</returns>
public override Matches DistanceAtMostN(string[] strings, int n)
{
    Matches result = MatchesEngine.NewMatches(strings.Count());
    int total = strings.Length;

    // Each string is recorded as a distance-0 match of itself.
    for (int self = 0; self < total; self++)
    {
        result.AddMatch(self, self, 0);
    }

    Console.WriteLine("Creating the neighborhoods");
    var candidates = new List<EditDistanceMatchObject>();
    int position = 0;
    foreach (string text in strings)
    {
        Console.Write($"\r{position}/{total} neighborhoods computed");
        candidates.AddRange(DeleteN(text, position, n));
        position++;
    }
    Console.WriteLine();

    Console.WriteLine("Grouping by neighborhood");
    var buckets = candidates.GroupBy(candidate => candidate.Substring).ToArray();

    Console.WriteLine("Checking edit distance");
    for (int bucketNumber = 0; bucketNumber < buckets.Length; bucketNumber++)
    {
        Console.Write($"\r{bucketNumber}/{buckets.Length} edit distance groups checked");
        var members = buckets[bucketNumber].ToArray();

        // Compare each unordered pair inside the bucket exactly once.
        for (int first = 0; first < members.Length; first++)
        {
            for (int second = first + 1; second < members.Length; second++)
            {
                int distance = EditDistance(members[first], members[second]);
                if (distance > n)
                {
                    continue;
                }

                result.AddMatch(members[first].Index, members[second].Index, distance);
            }
        }
    }
    Console.WriteLine();

    Console.WriteLine("Cleaning string match object");
    result.Clean();
    return result;
}
/// <summary>
/// Records a match between <paramref name="i"/> and <paramref name="j"/> in both directions:
/// calls <c>AddDirectedMatch</c> for (i, j) and (j, i), then appends an entry for j to slot i
/// and an entry for i to slot j of <c>MatchArray</c>.
/// </summary>
/// <param name="matches">The match object to update.</param>
/// <param name="i">Index of the first element.</param>
/// <param name="j">Index of the second element.</param>
/// <param name="distance">Distance value stored with both entries.</param>
public static void AddMatch(this Matches matches, int i, int j, int distance)
{
    // The array reference is read before the directed calls, as in the original ordering.
    var lists = matches.MatchArray;

    matches.AddDirectedMatch(i, j, distance);
    matches.AddDirectedMatch(j, i, distance);

    // NOTE(review): if AddDirectedMatch already appends to MatchArray, the two Adds below
    // store each direction a second time (Clean() would later collapse them) - confirm.
    var towardJ = new IndexDistancePair { Index = j, Distance = distance };
    lists[i].Add(towardJ);

    var towardI = new IndexDistancePair { Index = i, Distance = distance };
    lists[j].Add(towardI);
}
/// <summary>
/// For every string in <paramref name="T"/>, queries a BK-tree built over <paramref name="S"/>
/// for neighbours within distance <paramref name="n"/> and records a directed match from the
/// S index to the T index. The T strings are processed in parallel and a progress counter is
/// written to the console.
/// </summary>
/// <param name="S">Strings the BK-tree is built from; the result has one slot per element.</param>
/// <param name="T">Strings to look up against the tree.</param>
/// <param name="n">Maximum distance passed to the BK-tree query.</param>
/// <returns>A match object sized to <c>S.Length</c> holding the directed matches.</returns>
public static Matches EditDistanceAtMostN(string[] S, string[] T, int n)
{
    Matches toReturn = MatchesEngine.NewMatches(S.Length);

    // Create BKTree
    var bkTree = BKTreeEngine.CreateBKTree(S.ToList());

    // Create lookup table from string to its position in S.
    // NOTE(review): when S contains duplicate strings only the last position is kept,
    // so earlier duplicates never receive matches - confirm this is intended.
    Dictionary<string, int> stringToInt = new Dictionary<string, int>();
    for (int i = 0; i < S.Length; i++)
    {
        stringToInt[S[i]] = i;
    }

    int c = 0;
    object cLock = new object();
    object matchLock = new object();

    Parallel.For(0, T.Length, j =>
    {
        // Progress counter and console output are guarded by their own lock.
        lock (cLock)
        {
            Console.Write($"\r{c++}/{T.Length}");
        }

        var neighbors = BKTreeEngine.EditDistanceAtMostN(T[j], bkTree, n);
        foreach (var neighbor in neighbors)
        {
            int i = stringToInt[neighbor];

            // Result mutation uses matchLock (previously declared but unused) so it no
            // longer contends with the progress output on cLock.
            // NOTE(review): Compute could be hoisted out of the lock if it is thread-safe.
            lock (matchLock)
            {
                toReturn.AddDirectedMatch(i, j, EditDistanceEngine.Compute(T[j], neighbor));
            }
        }
    });

    return toReturn;
}
/// <summary>
/// Collapses duplicate entries in every slot of <c>MatchArray</c>: entries are grouped by
/// <c>Index</c>, and each group is reduced to a single entry carrying the smallest
/// <c>Distance</c> in the group. Each slot's list is replaced with the reduced list.
/// </summary>
/// <param name="matches">The match object whose slots are rewritten in place.</param>
public static void Clean(this Matches matches)
{
    var lists = matches.MatchArray;
    for (int slot = 0; slot < lists.Length; slot++)
    {
        lists[slot] = lists[slot]
            .GroupBy(entry => entry.Index)
            .Select(sameTarget => sameTarget.Count() == 1
                // A lone entry is kept as-is rather than copied.
                ? sameTarget.First()
                : new IndexDistancePair
                {
                    Index = sameTarget.Key,
                    Distance = sameTarget.Min(entry => entry.Distance)
                })
            .ToList();
    }
}
/// <summary>
/// Returns the list stored in slot <paramref name="i"/> of <c>MatchArray</c>.
/// The list itself is returned, not a copy.
/// </summary>
/// <param name="matches">The match object to read from.</param>
/// <param name="i">Slot index.</param>
/// <returns>The entries recorded for <paramref name="i"/>.</returns>
public static List<IndexDistancePair> Neighbors(this Matches matches, int i) => matches.MatchArray[i];
/// <summary>
/// Reports whether slot <paramref name="i"/> of <c>MatchArray</c> contains an entry whose
/// <c>Index</c> equals <paramref name="j"/>.
/// </summary>
/// <param name="matches">The match object to search.</param>
/// <param name="i">Slot to inspect.</param>
/// <param name="j">Index value to look for.</param>
/// <returns><c>true</c> if such an entry exists; otherwise <c>false</c>.</returns>
public static bool HasMatch(this Matches matches, int i, int j)
{
    foreach (var entry in matches.MatchArray[i])
    {
        if (entry.Index == j)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Passes <paramref name="matches"/> and <paramref name="path"/> to <c>Serializer.Serialize</c>.
/// </summary>
/// <param name="matches">The match object to serialize.</param>
/// <param name="path">Destination forwarded to the serializer.</param>
public static void Serialize(this Matches matches, string path) => Serializer.Serialize(matches, path);
/// <summary>
/// For each eid, counts across all field match objects how many times every other eid
/// (with a value not smaller than the current one) shows up as a neighbour, and records a
/// match for each pair whose count reaches <paramref name="n"/>. The count itself is stored
/// as the match's distance value. Progress is written to the console.
/// </summary>
/// <param name="eids">The ids to process; the result is sized to the largest id plus one. May be empty.</param>
/// <param name="matchObjectsForFields">One match object per field, each providing <c>EidToIndex</c>, <c>Matches</c> and <c>IndexToEids</c>.</param>
/// <param name="n">Minimum neighbour count for a pair to be recorded.</param>
/// <param name="allData">Not used by this method; kept so existing callers continue to compile.</param>
/// <returns>The match object after <c>Clean()</c> has been applied.</returns>
public static Matches FuzzyMatchOnNImportantFields(int[] eids, List<RowMatchObject> matchObjectsForFields, int n, Row[] allData)
{
    // Max() throws on an empty sequence; -1 produces zero-length structures instead.
    int maxEid = eids.Length == 0 ? -1 : eids.Max();
    Matches toReturn = MatchesEngine.NewMatches(maxEid + 1);

    int c = 0;
    // Scratch counters indexed by eid; reset to zero after each outer iteration.
    // NOTE(review): assumes every eid found in IndexToEids is <= maxEid - confirm.
    int[] eidToMatchCount = new int[maxEid + 1];
    List<int> usedEids = new List<int>();

    foreach (int eid in eids)
    {
        usedEids.Clear();
        Console.Write($"\r{c++}/{eids.Length} Final Row Matches");

        foreach (var matchObject in matchObjectsForFields)
        {
            int index = matchObject.EidToIndex[eid];
            if (index == -1)
            {
                continue;
            }

            foreach (var neighborIndex in matchObject.Matches.Neighbors(index))
            {
                foreach (var neighborEid in matchObject.IndexToEids[neighborIndex.Index])
                {
                    // We will do the exact same computation when we find all the matches for neighborEid.
                    if (eid > neighborEid)
                    {
                        continue;
                    }

                    usedEids.Add(neighborEid);
                    eidToMatchCount[neighborEid]++;
                }
            }
        }

        foreach (var usedEid in usedEids)
        {
            if (eidToMatchCount[usedEid] >= n)
            {
                // One way of recording the NUMBER of fuzzy matches.
                toReturn.AddMatch(eid, usedEid, eidToMatchCount[usedEid]);
            }

            // Resetting here also makes repeated entries in usedEids see a zero count.
            eidToMatchCount[usedEid] = 0;
        }
    }

    Console.WriteLine("\nCleaning Two Field Fuzzy Match Object");
    // NOTE(review): the original author suspected this Clean() may be unnecessary - confirm before removing.
    toReturn.Clean();
    Console.WriteLine("Done Cleaning");
    return toReturn;
}
/// <summary>
/// Records directed matches from elements of <paramref name="S"/> to elements of
/// <paramref name="T"/> whose <c>EditDistance</c> is at most <paramref name="n"/>.
/// Candidates are entries produced by <c>DeleteN</c> for both arrays that share the same
/// <c>Substring</c>; only S-to-T pairs inside such a group are checked.
/// The T side is recorded with the <c>Index</c> carried by its match object; no
/// <c>S.Length</c> offset is applied in this method.
/// </summary>
/// <param name="S">First part; the result has one slot per element of this array.</param>
/// <param name="T">Second part.</param>
/// <param name="n">Maximum distance (inclusive); also forwarded to <c>DeleteN</c>.</param>
/// <returns>The match object after <c>Clean()</c> has been applied.</returns>
public static Matches EditDistanceAtMostN(string[] S, string[] T, int n)
{
    Matches result = MatchesEngine.NewMatches(S.Length);

    Console.WriteLine("Creating the neighborhoods");
    var tagged = new List<BipartiteEditDistanceMatchObject>();

    // Part 0 marks entries that came from S.
    for (int sPos = 0; sPos < S.Length; sPos++)
    {
        Console.Write($"\r{sPos}/{S.Length} S neighborhoods computed");
        foreach (var entry in DeleteN(S[sPos], sPos, n))
        {
            tagged.Add(new BipartiteEditDistanceMatchObject { EditDistanceMatchObject = entry, Part = 0 });
        }
    }

    // Part 1 marks entries that came from T.
    for (int tPos = 0; tPos < T.Length; tPos++)
    {
        Console.Write($"\r{tPos}/{T.Length} T neighborhoods computed");
        foreach (var entry in DeleteN(T[tPos], tPos, n))
        {
            tagged.Add(new BipartiteEditDistanceMatchObject { EditDistanceMatchObject = entry, Part = 1 });
        }
    }
    Console.WriteLine();

    Console.WriteLine("Grouping by neighborhood");
    var buckets = tagged.GroupBy(item => item.EditDistanceMatchObject.Substring).ToArray();

    Console.WriteLine("Checking edit distance");
    for (int bucketNumber = 0; bucketNumber < buckets.Length; bucketNumber++)
    {
        // Split the bucket by origin while keeping the original relative order.
        var byPart = buckets[bucketNumber].ToLookup(item => item.Part, item => item.EditDistanceMatchObject);
        var fromS = byPart[0].ToArray();
        var fromT = byPart[1].ToArray();

        Console.Write($"\r{bucketNumber}/{buckets.Length} edit distance groups checked");

        foreach (var s in fromS)
        {
            foreach (var t in fromT)
            {
                int distance = EditDistance(s, t);
                if (distance > n)
                {
                    continue;
                }

                result.AddDirectedMatch(s.Index, t.Index, distance);
            }
        }
    }
    Console.WriteLine();

    Console.WriteLine("Cleaning string match object");
    result.Clean();
    return result;
}
/// <summary>
/// Parses every input with <c>DateTime.Parse</c> and records matches between dates that are
/// equal (distance 0) or differ by one of the near-miss patterns checked below (distance 1):
/// day and month swapped, or exactly one of day / month / year differing in a way accepted
/// by the <c>MatchingManager</c> predicates while the other two components are equal.
/// </summary>
/// <param name="strings">Date strings; positions in this array are the indices recorded in the result.</param>
/// <param name="n">Not read by this implementation.</param>
/// <returns>The match object after <c>Clean()</c> has been applied.</returns>
public override Matches DistanceAtMostN(string[] strings, int n)
{
    DateTime[] parsed = strings.Select(s => DateTime.Parse(s)).ToArray();
    Matches result = MatchesEngine.NewMatches(parsed.Length);
    DateIndex[] entries = parsed.Select((date, position) => new DateIndex { Date = date, Index = position }).ToArray();

    // Day/month transpositions: dates sharing the same year and the same unordered {day, month} pair.
    var swappedBuckets = entries.GroupBy(e => new
    {
        Low = System.Math.Min(e.Date.Day, e.Date.Month),
        High = System.Math.Max(e.Date.Day, e.Date.Month),
        e.Date.Year
    });
    foreach (var bucket in swappedBuckets)
    {
        DateIndex[] members = bucket.ToArray();
        for (int a = 0; a < members.Length; a++)
        {
            // b starts at a, so every date is also recorded against itself with distance 0.
            for (int b = a; b < members.Length; b++)
            {
                int distance = members[a].Date == members[b].Date ? 0 : 1;
                result.AddMatch(members[a].Index, members[b].Index, distance);
            }
        }
    }

    // Same month and year; the day passes OneOrOneDigit or TransposedDigit.
    AddNearMissPairs(
        result,
        entries,
        e => new { e.Date.Month, e.Date.Year },
        e => e.Date.Day,
        (x, y) => MatchingManager.OneOrOneDigit(x, y) || MatchingManager.TransposedDigit(x, y));

    // Same day and year; the month passes OneOrOneDigit or TransposedDigit.
    AddNearMissPairs(
        result,
        entries,
        e => new { e.Date.Day, e.Date.Year },
        e => e.Date.Month,
        (x, y) => MatchingManager.OneOrOneDigit(x, y) || MatchingManager.TransposedDigit(x, y));

    // Same day and month; the year passes OneOrOneDigit, TransposedDigit or OffBy100.
    AddNearMissPairs(
        result,
        entries,
        e => new { e.Date.Day, e.Date.Month },
        e => e.Date.Year,
        (x, y) => MatchingManager.OneOrOneDigit(x, y) || MatchingManager.TransposedDigit(x, y) || MatchingManager.OffBy100(x, y));

    result.Clean();
    return result;
}

/// <summary>
/// Groups <paramref name="entries"/> by <paramref name="bucketKey"/> and, for every unordered
/// pair of distinct positions inside a group, adds a distance-1 match when
/// <paramref name="isNearMiss"/> accepts the pair's <paramref name="component"/> values.
/// </summary>
private static void AddNearMissPairs<TKey>(
    Matches target,
    DateIndex[] entries,
    Func<DateIndex, TKey> bucketKey,
    Func<DateIndex, int> component,
    Func<int, int, bool> isNearMiss)
{
    foreach (var bucket in entries.GroupBy(bucketKey))
    {
        DateIndex[] members = bucket.ToArray();
        for (int a = 0; a < members.Length; a++)
        {
            for (int b = a + 1; b < members.Length; b++)
            {
                int left = component(members[a]);
                int right = component(members[b]);
                if (isNearMiss(left, right))
                {
                    target.AddMatch(members[a].Index, members[b].Index, 1);
                }
            }
        }
    }
}