/// <summary>
/// Tallies unordered row pairs by the distance between their field values.
/// </summary>
/// <param name="fieldMatches">
/// Match data: <c>Matches.MatchArray[i]</c> lists the (index, distance) neighbours of
/// field-value group <c>i</c>, and <c>IndexToEids[i]</c> holds the rows in that group.
/// </param>
/// <param name="totalRowCount">Not used by this method.</param>
/// <param name="largestDistance">Largest distance that appears in the match array.</param>
/// <returns>
/// An array of length <c>largestDistance + 3</c>. Slot <c>d</c> holds the halved pair tally
/// for distance <c>d</c>; slot <c>largestDistance + 1</c> holds the halved tally of all
/// remaining non-blank pairs; the final slot is never written and stays zero.
/// </returns>
public static long[] DistanceCounts(RowMatchObject fieldMatches, long totalRowCount, int largestDistance)
{
    long[] pairCounts = new long[largestDistance + 3];
    var neighboursByGroup = fieldMatches.Matches.MatchArray;
    var eidsByGroup = fieldMatches.IndexToEids;

    // Pairs whose field values are non-blank and within the distance threshold.
    for (int group = 0; group < neighboursByGroup.Length; group++)
    {
        foreach (var neighbour in neighboursByGroup[group])
        {
            int other = neighbour.Index;
            long rowsHere = eidsByGroup[group].Count;

            // Within a single group a row cannot be paired with itself.
            long rowsThere = (group != other)
                ? eidsByGroup[other].Count
                : eidsByGroup[other].Count - 1;

            pairCounts[neighbour.Distance] += rowsHere * rowsThere;
        }
    }

    // Pairs over the threshold: every ordered non-blank pair minus those tallied above.
    long nonBlankRows = eidsByGroup.Sum(eids => eids.Count);
    long pairsWithinThreshold = pairCounts.Sum();
    pairCounts[largestDistance + 1] = nonBlankRows * (nonBlankRows - 1) - pairsWithinThreshold;

    // Halve every bucket (presumably because each pair was tallied in both orders;
    // that relies on the match array being symmetric - TODO confirm).
    for (int bucket = 0; bucket < pairCounts.Length; bucket++)
    {
        pairCounts[bucket] = pairCounts[bucket] / 2;
    }

    return pairCounts;
}
/// <summary>
/// Groups rows by a selected field value, matches the distinct values against each other
/// via the string-array overload of <c>DistanceAtMostN</c>, and returns those matches
/// together with maps between EnterpriseIDs and group indices.
/// </summary>
/// <param name="data">Rows to process.</param>
/// <param name="fieldSelector">Extracts the field value to compare from a row.</param>
/// <param name="n">Threshold passed through to the string-array overload.</param>
/// <returns>
/// A <c>RowMatchObject</c> where <c>EidToIndex[eid]</c> is the group index of the row with
/// that EnterpriseID (or -1 when its field is blank or the ID is absent), and
/// <c>IndexToEids[g]</c> lists the EnterpriseIDs of the rows in group <c>g</c>.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="data"/> is empty, so no maximum EnterpriseID can be computed.
/// </exception>
public RowMatchObject DistanceAtMostN(Row[] data, Func<Row, string> fieldSelector, int n)
{
    // Start by grouping the data into fields; blank (null or empty) values are left out.
    Console.WriteLine("Grouping By Field Value");
    Dictionary<string, List<Row>> rowsByFieldValue = new Dictionary<string, List<Row>>();
    foreach (var d in data)
    {
        string field = fieldSelector(d);
        if (string.IsNullOrEmpty(field))
        {
            continue;
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        List<Row> rowsForField;
        if (!rowsByFieldValue.TryGetValue(field, out rowsForField))
        {
            rowsForField = new List<Row>();
            rowsByFieldValue[field] = rowsForField;
        }
        rowsForField.Add(d);
    }

    Console.WriteLine("Creating EID <=> Index Maps");

    // -1 marks EnterpriseIDs that belong to no group.
    int[] eidToIndex = new int[data.Max(d => d.EnterpriseID) + 1];
    for (int i = 0; i < eidToIndex.Length; i++)
    {
        eidToIndex[i] = -1;
    }

    // Fill the keys and both maps in one pass so that strings[g], indexToEids[g] and
    // eidToIndex all refer to the same group index g.
    int groupCount = rowsByFieldValue.Count;
    List<int>[] indexToEids = new List<int>[groupCount];
    string[] strings = new string[groupCount];
    int groupIndex = 0;
    foreach (var pair in rowsByFieldValue)
    {
        List<int> eids = new List<int>(pair.Value.Count);
        foreach (var row in pair.Value)
        {
            eidToIndex[row.EnterpriseID] = groupIndex;
            eids.Add(row.EnterpriseID);
        }

        indexToEids[groupIndex] = eids;
        strings[groupIndex] = pair.Key;
        groupIndex++;
    }

    var stringMatches = DistanceAtMostN(strings, n);

    return new RowMatchObject
    {
        Matches = stringMatches,
        EidToIndex = eidToIndex,
        IndexToEids = indexToEids
    };
}
/// <summary>
/// Tallies, per field distance, the row pairs that are known to match.
/// </summary>
/// <param name="knownMatches">
/// Sets of EnterpriseIDs; every pair of IDs within one set is treated as a known match.
/// </param>
/// <param name="fieldMatches">
/// Match data: <c>EidToIndex</c> maps an EnterpriseID to its field-value group (or -1),
/// and <c>Matches.MatchArray[g]</c> lists the (index, distance) neighbours of group <c>g</c>.
/// </param>
/// <param name="largestDistance">Largest distance that appears in the match array.</param>
/// <returns>
/// An array of length <c>largestDistance + 2</c>. Slot <c>d</c> counts known-match pairs
/// whose groups are neighbours at distance <c>d</c>; the last slot counts pairs whose
/// groups are not listed as neighbours. Pairs where either row maps to -1 are skipped.
/// </returns>
public static long[] DistanceCountsGivenMatched(List<List<int>> knownMatches, RowMatchObject fieldMatches, int largestDistance)
{
    long[] toReturn = new long[largestDistance + 2];
    var fieldMatchArray = fieldMatches.Matches.MatchArray;
    var eidToIndex = fieldMatches.EidToIndex;

    foreach (var set in knownMatches)
    {
        // The last element has no later partner, so the outer loop stops one short.
        for (int i = 0; i < set.Count - 1; i++)
        {
            // Resolved once per i rather than once per (i, j) pair.
            int ii = eidToIndex[set[i]];
            if (ii == -1)
            {
                continue;
            }

            for (int j = i + 1; j < set.Count; j++)
            {
                int ij = eidToIndex[set[j]];
                if (ij == -1)
                {
                    continue;
                }

                // Default to the over-threshold bucket; overwritten when the pair is listed.
                int bucket = largestDistance + 1;
                foreach (var pair in fieldMatchArray[ii])
                {
                    if (pair.Index == ij)
                    {
                        bucket = pair.Distance;
                        break;
                    }
                }

                toReturn[bucket]++;
            }
        }
    }

    return toReturn;
}
/// <summary>
/// Builds the distance distributions conditioned on whether a row pair is a known match.
/// </summary>
/// <param name="knownMatches">Sets of EnterpriseIDs whose members are known to match.</param>
/// <param name="fieldMatches">Match data for the field being analysed.</param>
/// <param name="totalRowCount">Row count forwarded to <c>DistanceCounts</c>.</param>
/// <returns>
/// A <c>ConditionedDistribution</c> whose <c>GivenMatch</c> comes from the known-match pair
/// counts and whose <c>GivenNoMatch</c> comes from the all-pair counts minus the known-match
/// counts, each passed through <c>CountsToDistribution</c>.
/// </returns>
public static ConditionedDistribution ComputeConditionedDistribution(List<List<int>> knownMatches, RowMatchObject fieldMatches, int totalRowCount)
{
    // The largest distance present in any match list fixes the number of buckets.
    int largestDistance = fieldMatches.Matches.MatchArray.Max(matches => matches.Max(m => m.Distance));

    long[] allPairs = DistanceCounts(fieldMatches, totalRowCount, largestDistance);
    long[] matchedPairs = DistanceCountsGivenMatched(knownMatches, fieldMatches, largestDistance);

    // Unmatched pairs = all pairs minus known-match pairs, bucket by bucket.
    long[] unmatchedPairs = new long[largestDistance + 2];
    for (int bucket = 0; bucket < largestDistance + 2; bucket++)
    {
        unmatchedPairs[bucket] = allPairs[bucket] - matchedPairs[bucket];
    }

    return new ConditionedDistribution
    {
        GivenMatch = CountsToDistribution(matchedPairs),
        GivenNoMatch = CountsToDistribution(unmatchedPairs)
    };
}