/// <summary>
/// Lazily enumerates the nodes of the queue, starting at <c>head</c> and
/// following each node's <c>prev</c> link until a null link is reached.
/// </summary>
/// <returns>An enumerator that yields every node from <c>head</c> onwards.</returns>
public IEnumerator<ListPQNode<Cluster>> GetEnumerator()
{
    for (ListPQNode<Cluster> node = head; node != null; node = node.prev)
    {
        yield return node;
    }
}
/// <summary>
/// Removes the node currently referenced by <c>tail</c>; its <c>next</c>
/// neighbour becomes the new tail and the element count <c>N</c> is decremented.
/// </summary>
/// <remarks>
/// Calling this on an empty queue is a no-op. When the queue holds a single
/// node, both <c>head</c> and <c>tail</c> are reset to null.
/// </remarks>
public void removeMin()
{
    // Nothing to remove: without this guard the code below dereferences a null tail.
    if (N <= 0)
    {
        return;
    }

    if (N == 1)
    {
        N--;
        head = null;
        tail = null;
        return;
    }

    // Promote the tail's successor and cut its backward link to the removed node.
    ListPQNode<Cluster> newTail = tail.next;
    newTail.prev = null;
    tail = newTail;
    N--;
}
/// <summary>
/// Wraps <paramref name="c"/> in a new node and makes that node the new <c>head</c>.
/// </summary>
/// <remarks>
/// When the queue was empty the new node also becomes <c>tail</c> and the
/// method returns immediately. Otherwise, if the count <c>N</c> exceeds <c>M</c>
/// after the insertion, <c>removeMin</c> is called once.
/// </remarks>
/// <param name="c">Cluster to store in the new node.</param>
public void insertMax(Cluster c)
{
    ListPQNode<Cluster> fresh = new ListPQNode<Cluster>(c);

    // First element: it is both ends of the list; the capacity check is skipped.
    if (N == 0)
    {
        head = fresh;
        tail = fresh;
        N++;
        return;
    }

    // Link the new node in front of the current head.
    fresh.prev = head;
    head.next = fresh;
    head = fresh;
    N++;

    if (N > M)
    {
        removeMin();
    }
}
/// <summary>
/// Moves an existing node to the <c>head</c> position of the queue.
/// </summary>
/// <remarks>
/// The node is first unlinked from its current position (with a dedicated
/// branch for the case where it is the <c>tail</c>) and then re-linked in
/// front of the current head. Nothing happens when the node is null, when it
/// is already the head, or when the queue holds at most one node.
/// </remarks>
/// <param name="n">Node to promote. NOTE(review): assumed to already be linked into this queue — this is not verified here.</param>
public void setMax(ListPQNode<Cluster> n)
{
    if (n == null || N <= 1 || head == n)
    {
        return;
    }

    if (tail == n)
    {
        // Unlinking the tail: its successor becomes the new tail.
        tail = n.next;
        tail.prev = null;
    }
    else
    {
        // Unlinking an interior node: splice its two neighbours together.
        ListPQNode<Cluster> left = n.prev;
        ListPQNode<Cluster> right = n.next;
        left.next = right;
        right.prev = left;
    }

    // Set the selected node to max.
    head.next = n;
    n.prev = head;
    // The new head has no successor; clear the link left over from its old position.
    n.next = null;
    head = n;
}
/// <summary>
/// Checks whether a record belongs in a cluster using string comparison.
/// </summary>
/// <remarks>
/// The query record is compared with the cluster's records one at a time.
/// The name check is always applied; the date and description checks are
/// applied only when the corresponding scan flag is set. On the first record
/// for which no check fails, the query record is added to the cluster, the
/// priority queue is updated and the search stops. If a check fails and the
/// combined similarity is below 0.4, the rest of the cluster is skipped.
/// </remarks>
/// <param name="queryRecord">Record being tested against the cluster.</param>
/// <param name="cluster">Cluster whose records are compared with the query record.</param>
/// <param name="tolerance">Not used by this method.</param>
/// <param name="node">Priority-queue node passed to <c>updatePQ</c> on a match.</param>
/// <param name="scanDates">Whether a date similarity below <paramref name="datePrecision"/> counts as a failure.</param>
/// <param name="scanDescriptions">Whether a description similarity below <paramref name="descriptionPrecision"/> counts as a failure.</param>
/// <param name="namePrecision">Minimum name similarity required for a match.</param>
/// <param name="datePrecision">Minimum date similarity required when dates are scanned.</param>
/// <param name="descriptionPrecision">Minimum description similarity required when descriptions are scanned.</param>
/// <returns>True if the query record was added to the cluster; otherwise false.</returns>
private bool compareRecordToCluster(Record queryRecord, Cluster cluster, double tolerance,
                                    ListPQNode<Cluster> node, bool scanDates, bool scanDescriptions,
                                    double namePrecision, double datePrecision, double descriptionPrecision)
{
    foreach (Record clusterRecord in cluster.getRecords())
    {
        // Check if the record is similar enough to a record in the cluster to be added.
        // TODO change logic so that it only calculates all three measures if search enhance is on
        bool similarityFail = false;
        int divisor = 1;

        // First perform the mandatory name check.
        double nameSimilarity = strComp.jaroWinklerCompare(queryRecord, clusterRecord);
        if (nameSimilarity < namePrecision)
        {
            similarityFail = true;
        }

        double dateSimilarity = compareDates(queryRecord, clusterRecord);
        if (scanDates && (dateSimilarity < datePrecision))
        {
            similarityFail = true;
            divisor++;
        }

        double descriptionSimilarity = compareDescriptions(queryRecord, clusterRecord);
        if (scanDescriptions && (descriptionSimilarity < descriptionPrecision))
        {
            similarityFail = true;
            divisor++;
        }

        // Calculate total similarity from the measured values (not the thresholds).
        // TODO smarter weighting
        // NOTE(review): the divisor only grows when an optional check fails, while the
        // date and description values are always added to the sum — confirm this
        // weighting is intended.
        double totalSimilarity = (nameSimilarity + dateSimilarity + descriptionSimilarity) / divisor;

        // If none of the similarity checks failed, it's a match.
        if (!similarityFail)
        {
            addRecordToCluster(queryRecord, cluster);
            updatePQ(node); // and update the priority queue
            return true;
        }

        // If the similarity is way off, don't bother checking the rest of the cluster.
        if (totalSimilarity < 0.4)
        {
            break;
        }
    }

    return false;
}
/// <summary>
/// Forwards the given node to <c>listPQ.setMax</c>.
/// </summary>
/// <param name="node">Priority-queue node handed to <c>listPQ.setMax</c>.</param>
private void updatePQ(ListPQNode<Cluster> node)
{
    listPQ.setMax(node);
}
/// <summary>
/// Checks whether a record is in a cluster, using a pre-defined tolerance.
/// </summary>
/// <remarks>
/// For each record in the cluster a weighted similarity is built from the
/// normalized name, date and description similarities, with the weights
/// supplied by <c>calculateWeights</c>. The date field is ignored for a pair
/// when either record carries the date 1900-01-01, and the description field
/// is ignored when description scanning is off or <c>IgnoreDescriptions</c>
/// returns true for the pair. These decisions are made per pair and do not
/// carry over to the next record of the cluster. On the first pair whose
/// similarity reaches <paramref name="tolerance"/> the query record is added
/// to the cluster and the priority queue is updated. If the similarity falls
/// below <c>tolerance / TOLERANCE_DISCARD_FACTOR</c>, the rest of the cluster
/// is skipped.
/// </remarks>
/// <param name="queryRecord">Record being tested against the cluster.</param>
/// <param name="cluster">Cluster whose records are compared with the query record.</param>
/// <param name="tolerance">Minimum weighted similarity required for a match.</param>
/// <param name="node">Priority-queue node passed to <c>updatePQ</c> on a match.</param>
/// <param name="scanDates">Whether dates may contribute to the similarity.</param>
/// <param name="scanDescriptions">Whether descriptions may contribute to the similarity.</param>
/// <param name="ignoreList">List passed through to <c>IgnoreDescriptions</c>.</param>
/// <returns>True if the query record was added to the cluster; otherwise false.</returns>
private bool compareRecordToClusterAuto(Record queryRecord, Cluster cluster, double tolerance,
                                        ListPQNode<Cluster> node, bool scanDates, bool scanDescriptions,
                                        List<String> ignoreList)
{
    // Date value the code treats as "no date available".
    DateTime missingDate = new DateTime(1900, 1, 1);
    double discardThreshold = tolerance / TOLERANCE_DISCARD_FACTOR;

    foreach (Record r in cluster.getRecords())
    {
        // Will be used to determine how much weight to give to each field.
        double nameWeight = 0;
        double dateWeight = 0;
        double descriptionWeight = 0;

        double dateSimilarity = 0;
        double descriptionSimilarity = 0;

        double nameSimilarity = normalize(MIN_NAME_SIM, 1, strComp.jaroWinklerCompare(queryRecord, r));

        // Per-pair flags: a record with a missing field must not disable that
        // field for the remaining records of the cluster.
        bool useDates = scanDates;
        bool useDescriptions = scanDescriptions;

        // Account for cases where one or both of the records are missing a date -
        // ignore the field in the calculation.
        if (queryRecord.getDate().Equals(missingDate) || r.getDate().Equals(missingDate))
        {
            useDates = false;
        }
        else
        {
            dateSimilarity = normalize(0, MAX_DAYS, (MAX_DAYS - compareDates(queryRecord, r)));
        }

        // Do the same for descriptions.
        if (!scanDescriptions || IgnoreDescriptions(queryRecord, r, ignoreList))
        {
            useDescriptions = false;
        }
        else
        {
            descriptionSimilarity = normalize(MIN_DESCRIPTION_SIM, MAX_DESCRIPTION_SIM, compareDescriptions(queryRecord, r));
        }

        calculateWeights(ref nameWeight, ref dateWeight, ref descriptionWeight, useDates, useDescriptions);

        double similarity = (nameSimilarity * nameWeight)
                            + (dateSimilarity * dateWeight)
                            + (descriptionSimilarity * descriptionWeight);

        if (similarity >= tolerance)
        {
            // Similar enough: update the cluster and the priority queue.
            addRecordToCluster(queryRecord, cluster);
            updatePQ(node);
            return true;
        }

        // If the similarity is way off, don't bother checking the rest of the cluster.
        if (similarity < discardThreshold)
        {
            break;
        }
    }

    return false;
}