Beispiel #1
0
        /// <summary>
        /// Tallies, per field distance, the number of unordered pairs of non-blank rows.
        /// </summary>
        /// <remarks>
        /// Layout of the returned array (length <c>largestDistance + 3</c>):
        /// slots indexed by a match's <c>Distance</c> hold the pairs recorded in the match array;
        /// slot <c>largestDistance + 1</c> holds every remaining non-blank pair (those with no
        /// recorded match); slot <c>largestDistance + 2</c> is never written here and stays 0.
        /// NOTE(review): halving the totals presumably relies on the match array being symmetric
        /// (each pair listed from both sides) - confirm against the code that builds it.
        /// </remarks>
        /// <param name="fieldMatches">Match structure supplying <c>Matches.MatchArray</c> and <c>IndexToEids</c>.</param>
        /// <param name="totalRowCount">Not used by this method.</param>
        /// <param name="largestDistance">Largest distance value expected in the match array; sizes the result.</param>
        /// <returns>The per-distance pair counts described above.</returns>
        public static long[] DistanceCounts(RowMatchObject fieldMatches, long totalRowCount, int largestDistance)
        {
            long[] tally = new long[largestDistance + 3];

            var neighbours = fieldMatches.Matches.MatchArray;
            var eidGroups  = fieldMatches.IndexToEids;

            // Recorded matches: every row of the source group pairs with every row of the target group.
            for (int source = 0; source < neighbours.Length; source++)
            {
                foreach (var match in neighbours[source])
                {
                    int target = match.Index;

                    // Within a single group a row cannot be paired with itself, hence the "- 1".
                    tally[match.Distance] += (long)eidGroups[source].Count
                                             * (source == target ? eidGroups[target].Count - 1 : eidGroups[target].Count);
                }
            }

            long nonBlankRows = eidGroups.Sum(group => group.Count);

            // All ordered non-blank pairs that were not accounted for above lie beyond the threshold.
            long accountedFor = tally.Sum();
            tally[largestDistance + 1] = nonBlankRows * (nonBlankRows - 1) - accountedFor;

            // Every pair has been counted in both directions, so halve each slot.
            return tally.Select(count => count / 2).ToArray();
        }
Beispiel #2
0
        /// <summary>
        /// Groups rows by the selected string field, builds the EID-to-group and group-to-EIDs
        /// lookup tables, and pairs them with the result of matching the distinct field values
        /// via the <c>DistanceAtMostN(strings, n)</c> overload.
        /// </summary>
        /// <param name="data">Rows to process. Each row's <c>EnterpriseID</c> is used as an array index.</param>
        /// <param name="fieldSelector">Extracts the field value from a row; null or empty values are treated as blank and skipped.</param>
        /// <param name="n">Distance threshold forwarded unchanged to the string overload.</param>
        /// <returns>
        /// A <c>RowMatchObject</c> whose <c>EidToIndex</c> maps an EnterpriseID to its group index
        /// (-1 when the row's field is blank or the EID does not occur), whose <c>IndexToEids</c>
        /// lists the EnterpriseIDs in each group, and whose <c>Matches</c> is the string-overload result.
        /// </returns>
        public RowMatchObject DistanceAtMostN(Row[] data, Func <Row, string> fieldSelector, int n)
        {
            //Start by grouping the data into fields
            Console.WriteLine("Grouping By Field Value");
            Dictionary<string, List<Row>> rowsByFieldValue = new Dictionary<string, List<Row>>();

            foreach (var row in data)
            {
                string field = fieldSelector(row);

                // A null value is treated like an empty one; using it as a dictionary key would throw.
                if (string.IsNullOrEmpty(field))
                {
                    continue;
                }

                // Single lookup instead of ContainsKey followed by the indexer.
                List<Row> group;
                if (!rowsByFieldValue.TryGetValue(field, out group))
                {
                    group = new List<Row>();
                    rowsByFieldValue.Add(field, group);
                }

                group.Add(row);
            }

            Console.WriteLine("Creating EID <=> Index Maps");

            // Max() throws on an empty sequence, so an empty input gets an empty lookup table.
            int eidSlots = data.Length == 0 ? 0 : data.Max(d => d.EnterpriseID) + 1;
            int[] eidToIndex = new int[eidSlots];
            for (int i = 0; i < eidToIndex.Length; i++)
            {
                // -1 marks an EID with no group (blank field or EID not present in data).
                eidToIndex[i] = -1;
            }

            // Fill the distinct strings and both lookup tables in one pass over the dictionary
            // so that index k refers to the same group in all three structures.
            string[] strings = new string[rowsByFieldValue.Count];
            List<int>[] indexToEids = new List<int>[rowsByFieldValue.Count];
            int groupIndex = 0;

            foreach (var pair in rowsByFieldValue)
            {
                strings[groupIndex] = pair.Key;

                List<int> eids = new List<int>(pair.Value.Count);
                foreach (var row in pair.Value)
                {
                    eidToIndex[row.EnterpriseID] = groupIndex;
                    eids.Add(row.EnterpriseID);
                }

                indexToEids[groupIndex] = eids;
                groupIndex++;
            }

            var stringMatches = DistanceAtMostN(strings, n);

            return new RowMatchObject
            {
                Matches     = stringMatches,
                EidToIndex  = eidToIndex,
                IndexToEids = indexToEids
            };
        }
Beispiel #3
0
        /// <summary>
        /// Counts, per field distance, the pairs of rows that are known to be matches.
        /// </summary>
        /// <remarks>
        /// Every unordered pair inside each known-match set is examined. Pairs in which either row
        /// has no group index (blank field, or an EID outside the <c>EidToIndex</c> table) are skipped.
        /// A pair whose groups appear in the match array is counted at the recorded distance;
        /// any other pair is counted in slot <c>largestDistance + 1</c>.
        /// </remarks>
        /// <param name="knownMatches">Sets of EnterpriseIDs; all rows within a set are treated as matching each other.</param>
        /// <param name="fieldMatches">Match structure supplying <c>Matches.MatchArray</c> and <c>EidToIndex</c>.</param>
        /// <param name="largestDistance">Largest distance value expected in the match array; sizes the result.</param>
        /// <returns>An array of length <c>largestDistance + 2</c> with the counts described above.</returns>
        public static long[] DistanceCountsGivenMatched(List <List <int> > knownMatches, RowMatchObject fieldMatches, int largestDistance)
        {
            long[] toReturn = new long[largestDistance + 2];

            var fieldMatchArray = fieldMatches.Matches.MatchArray;
            var eidToIndex      = fieldMatches.EidToIndex;

            foreach (var set in knownMatches)
            {
                for (int i = 0; i < set.Count; i++)
                {
                    // Resolve the first row once per i rather than once per (i, j) pair.
                    int firstEid = set[i];
                    if (firstEid < 0 || firstEid >= eidToIndex.Length)
                    {
                        // EID is not covered by the lookup table: handle it like a blank row.
                        continue;
                    }

                    int ii = eidToIndex[firstEid];
                    if (ii == -1)
                    {
                        // No pair involving a blank row is counted, so skip the inner loop entirely.
                        continue;
                    }

                    for (int j = i + 1; j < set.Count; j++)
                    {
                        int secondEid = set[j];
                        if (secondEid < 0 || secondEid >= eidToIndex.Length)
                        {
                            continue;
                        }

                        int ij = eidToIndex[secondEid];
                        if (ij == -1)
                        {
                            continue;
                        }

                        //In the absence of a recorded match, we give the maximum distance;
                        int bucket = largestDistance + 1;

                        foreach (var pair in fieldMatchArray[ii])
                        {
                            if (pair.Index == ij)
                            {
                                bucket = pair.Distance;
                                break;
                            }
                        }

                        toReturn[bucket]++;
                    }
                }
            }

            return toReturn;
        }
Beispiel #4
0
        /// <summary>
        /// Builds the distance distributions of a field conditioned on rows being a known match
        /// and on rows not being a known match.
        /// </summary>
        /// <remarks>
        /// The unmatched counts are obtained by subtracting the known-match counts from the overall
        /// counts for each of the first <c>largestDistance + 2</c> slots; both count arrays are then
        /// passed through <c>CountsToDistribution</c>.
        /// </remarks>
        /// <param name="knownMatches">Sets of EnterpriseIDs known to match each other.</param>
        /// <param name="fieldMatches">Match structure for the field under consideration.</param>
        /// <param name="totalRowCount">Total number of rows; forwarded to <c>DistanceCounts</c>.</param>
        /// <returns>A <c>ConditionedDistribution</c> with <c>GivenMatch</c> and <c>GivenNoMatch</c> populated.</returns>
        public static ConditionedDistribution ComputeConditionedDistribution(List <List <int> > knownMatches, RowMatchObject fieldMatches, int totalRowCount)
        {
            // Explicit scan instead of nested Max(): Max() throws InvalidOperationException when the
            // match array or any inner match list is empty. With no matches at all the result is 0.
            int largestDistance = 0;
            foreach (var matchList in fieldMatches.Matches.MatchArray)
            {
                foreach (var match in matchList)
                {
                    if (match.Distance > largestDistance)
                    {
                        largestDistance = match.Distance;
                    }
                }
            }

            var distanceCounts             = DistanceCounts(fieldMatches, totalRowCount, largestDistance);
            var distanceCountsGivenMatched = DistanceCountsGivenMatched(knownMatches, fieldMatches, largestDistance);

            // distanceCounts carries one extra trailing slot that has no counterpart in the
            // known-match counts, so only the first largestDistance + 2 slots are compared.
            long[] distanceCountsGivenUnmatched = new long[largestDistance + 2];
            for (int i = 0; i < distanceCountsGivenUnmatched.Length; i++)
            {
                distanceCountsGivenUnmatched[i] = distanceCounts[i] - distanceCountsGivenMatched[i];
            }

            return new ConditionedDistribution
            {
                GivenMatch   = CountsToDistribution(distanceCountsGivenMatched),
                GivenNoMatch = CountsToDistribution(distanceCountsGivenUnmatched)
            };
        }