예제 #1
0
        /// <summary>
        /// Enumerates the nodes of the queue, starting at <c>head</c> and following
        /// each node's <c>prev</c> link until a null link is reached.
        /// </summary>
        /// <returns>An iterator yielding every node visited on that walk, in order.</returns>
        public IEnumerator <ListPQNode <Cluster> > GetEnumerator()
        {
            // The step expression runs only when the consumer asks for the next
            // element, so the prev link is read lazily, after each yield resumes.
            for (ListPQNode <Cluster> node = head; node != null; node = node.prev)
            {
                yield return node;
            }
        }
예제 #2
0
 /// <summary>
 /// Removes the node at <c>tail</c> from the queue and decrements <c>N</c>.
 /// Calling this on an empty queue (<c>N == 0</c>) is a no-op.
 /// </summary>
 public void removeMin()
 {
     // Empty queue: there is no tail node to unlink. Without this guard the
     // code below would dereference tail, which is expected to be null here.
     if (N <= 0)
     {
         return;
     }

     // Single element: removing it leaves the queue empty.
     if (N == 1)
     {
         N--;
         head = null;
         tail = null;
         return;
     }

     // Two or more elements: the old tail's successor becomes the new tail
     // and loses its back-link to the removed node.
     tail.next.prev = null;
     tail           = tail.next;
     N--;
 }
예제 #3
0
        /// <summary>
        /// Wraps a cluster in a new node and makes that node the new <c>head</c>.
        /// When the queue already held elements and the count then exceeds <c>M</c>,
        /// <c>removeMin()</c> is called once.
        /// </summary>
        /// <param name="c">Cluster to store in the newly created node.</param>
        public void insertMax(Cluster c)
        {
            ListPQNode <Cluster> fresh = new ListPQNode <Cluster>(c);

            if (N != 0)
            {
                // Link the new node after the current head, then promote it.
                head.next  = fresh;
                fresh.prev = head;
                head       = fresh;
                N++;

                if (N > M)
                {
                    removeMin();
                }
                return;
            }

            // First element: it is both head and tail. The capacity check is
            // not performed on this path.
            head = fresh;
            tail = fresh;
            N++;
        }
예제 #4
0
        /// <summary>
        /// Moves a node that is already linked into the queue to the <c>head</c>
        /// position. Nothing happens when the queue holds fewer than two nodes or
        /// when the node is already the head.
        /// </summary>
        /// <param name="n">Node to promote; expected to be a member of this queue.</param>
        public void setMax(ListPQNode <Cluster> n)
        {
            // With zero or one element there is nothing to reorder. N == 0 is
            // included so an empty queue is left untouched instead of having the
            // node's neighbours relinked before failing on a null head.
            if (N <= 1 || head == n)
            {
                return;
            }

            if (tail == n)
            {
                // Unlink from the tail end: the successor becomes the new tail.
                tail      = n.next;
                tail.prev = null;
            }
            else
            {
                // Unlink from the middle: join the two neighbours directly.
                ListPQNode <Cluster> left  = n.prev;
                ListPQNode <Cluster> right = n.next;
                left.next  = right;
                right.prev = left;
            }

            // Re-link the node after the current head and promote it.
            head.next = n;
            n.prev    = head;
            // Drop the link to the node's former successor; otherwise the new head
            // would still point back into the list and a walk along next would cycle.
            n.next    = null;
            head      = n;
        }
예제 #5
0
        /// <summary>
        /// Checks whether a record belongs in a cluster using string comparison.
        /// The query record is compared against the cluster's records one at a time.
        /// On the first record for which every enabled measure reaches its precision
        /// threshold, the query record is added to the cluster and the priority
        /// queue is updated. The scan stops early when the average of the enabled
        /// measures falls below a fixed cut-off.
        /// </summary>
        /// <param name="queryRecord">Record being tested for membership.</param>
        /// <param name="cluster">Cluster whose records are compared against.</param>
        /// <param name="tolerance">Not used by this method.</param>
        /// <param name="node">Queue node passed to <c>updatePQ</c> on a match.</param>
        /// <param name="scanDates">Whether the date measure takes part in the decision.</param>
        /// <param name="scanDescriptions">Whether the description measure takes part in the decision.</param>
        /// <param name="namePrecision">Minimum name similarity required for a match.</param>
        /// <param name="datePrecision">Minimum date score required when dates are scanned.</param>
        /// <param name="descriptionPrecision">Minimum description score required when descriptions are scanned.</param>
        /// <returns>True when the record was added to the cluster, false otherwise.</returns>
        private bool compareRecordToCluster(Record queryRecord, Cluster cluster, double tolerance, ListPQNode <Cluster> node, bool scanDates, bool scanDescriptions, double namePrecision, double datePrecision, double descriptionPrecision)
        {
            // Below this average similarity the rest of the cluster is not checked.
            const double discardThreshold = 0.4;

            foreach (Record clusterRecord in cluster.getRecords())
            {
                bool similarityFail = false;

                // The name check is mandatory, so it always counts as one measure.
                double nameSimilarity = strComp.jaroWinklerCompare(queryRecord, clusterRecord);
                double similaritySum  = nameSimilarity;
                int    measureCount   = 1;
                if (nameSimilarity < namePrecision)
                {
                    similarityFail = true;
                }

                // Optional measures are computed and averaged only when enabled.
                if (scanDates)
                {
                    // NOTE(review): compareDates is treated as "higher is better" here — confirm
                    // the scale it returns matches datePrecision.
                    double dateSimilarity = compareDates(queryRecord, clusterRecord);
                    similaritySum += dateSimilarity;
                    measureCount++;
                    if (dateSimilarity < datePrecision)
                    {
                        similarityFail = true;
                    }
                }

                if (scanDescriptions)
                {
                    double descriptionSimilarity = compareDescriptions(queryRecord, clusterRecord);
                    similaritySum += descriptionSimilarity;
                    measureCount++;
                    if (descriptionSimilarity < descriptionPrecision)
                    {
                        similarityFail = true;
                    }
                }

                // If every enabled check succeeded, it's a match.
                if (!similarityFail)
                {
                    addRecordToCluster(queryRecord, cluster);
                    updatePQ(node); // and update the priority queue
                    return(true);
                }

                // Plain average of the measures that were evaluated.
                // TODO smarter weighting
                double totalSimilarity = similaritySum / measureCount;
                if (totalSimilarity < discardThreshold)
                {
                    // The similarity is way off; don't bother checking the rest of the cluster.
                    break;
                }
            }
            return(false);
        }
예제 #6
0
 /// <summary>
 /// Updates the priority queue for the given cluster node by forwarding it to
 /// <c>listPQ.setMax</c>.
 /// </summary>
 /// <param name="node">Queue node passed through unchanged to <c>listPQ.setMax</c>.</param>
 private void updatePQ(ListPQNode <Cluster> node)
 {
     // NOTE(review): presumably promotes the node to the max end of the queue so that
     // recently matched clusters are found first — confirm against the queue implementation.
     listPQ.setMax(node);
 }
예제 #7
0
        /// <summary>
        /// Checks whether record is in cluster, using pre-defined tolerance.
        /// For each record of the cluster a weighted similarity is built from the
        /// normalized name, date and description scores. The first record whose
        /// similarity reaches <paramref name="tolerance"/> causes the query record
        /// to be added to the cluster and the priority queue to be updated. The
        /// scan stops early when a similarity falls below
        /// <c>tolerance / TOLERANCE_DISCARD_FACTOR</c>.
        /// </summary>
        /// <param name="queryRecord">Record being tested for membership.</param>
        /// <param name="cluster">Cluster whose records are compared against.</param>
        /// <param name="tolerance">Minimum weighted similarity that counts as a match.</param>
        /// <param name="node">Queue node passed to <c>updatePQ</c> on a match.</param>
        /// <param name="scanDates">Whether dates may contribute to the similarity.</param>
        /// <param name="scanDescriptions">Whether descriptions may contribute to the similarity.</param>
        /// <param name="ignoreList">Passed to <c>IgnoreDescriptions</c> to decide whether a pair's descriptions are skipped.</param>
        /// <returns>True when the record was added to the cluster, false otherwise.</returns>
        private bool compareRecordToClusterAuto(Record queryRecord, Cluster cluster, double tolerance, ListPQNode <Cluster> node, bool scanDates, bool scanDescriptions, List <String> ignoreList)
        {
            // Sentinel value that marks a record without a date.
            DateTime missingDate = new DateTime(1900, 1, 1);

            foreach (Record r in cluster.getRecords())
            {
                // Will be used to determine how much weight to give to each field.
                double nameWeight = 0; double dateWeight = 0; double descriptionWeight = 0;
                double dateSimilarity = 0; double descriptionSimilarity = 0;

                // Per-record copies of the flags: a field ignored for this pair must
                // not stay disabled for the remaining records of the cluster.
                bool useDates        = scanDates;
                bool useDescriptions = scanDescriptions;

                double nameSimilarity = normalize(MIN_NAME_SIM, 1, strComp.jaroWinklerCompare(queryRecord, r));

                // Account for cases where one or both of the records are missing a
                // date - ignore the field in the calculation for this pair.
                if (queryRecord.getDate().Equals(missingDate) || r.getDate().Equals(missingDate))
                {
                    useDates = false;
                }
                else
                {
                    dateSimilarity = normalize(0, MAX_DAYS, (MAX_DAYS - compareDates(queryRecord, r)));
                }

                // Do the same for descriptions.
                if ((useDescriptions == false) || IgnoreDescriptions(queryRecord, r, ignoreList))
                {
                    useDescriptions = false;
                }
                else
                {
                    descriptionSimilarity = normalize(MIN_DESCRIPTION_SIM, MAX_DESCRIPTION_SIM, compareDescriptions(queryRecord, r));
                }

                calculateWeights(ref nameWeight, ref dateWeight, ref descriptionWeight, useDates, useDescriptions);

                double similarity = (nameSimilarity * nameWeight) + (dateSimilarity * dateWeight) + (descriptionSimilarity * descriptionWeight);

                if (similarity >= tolerance)
                {
                    // Match: update the cluster and the priority queue.
                    addRecordToCluster(queryRecord, cluster);
                    updatePQ(node);
                    return(true);
                }

                if (similarity < tolerance / TOLERANCE_DISCARD_FACTOR)
                {
                    // The similarity is way off; don't bother checking the rest of the cluster.
                    break;
                }
            }
            return(false);
        }