/// <summary>
/// Builds the cluster that will hold <paramref name="item"/> as its first member.
/// The default implementation creates a <see cref="ZipLineCluster"/>; override this
/// method to plug in a different clustering algorithm.
/// </summary>
/// <param name="item">The item that seeds the new cluster.</param>
/// <returns>A new cluster that already contains <paramref name="item"/>.</returns>
protected virtual ClusterBase CreateCluster(ClusterItem item)
{
    ClusterBase seeded = new ZipLineCluster(this.Config, this.Vocabulary);

    // The seeding item is added with an affinity of 1.0.
    seeded.AddToCluster(item, 1.0f);

    return seeded;
}
/// <summary>
/// Adds <paramref name="item"/> to this cluster unless an item with the same ID is
/// already present. Updates the text-length, token-count and affinity statistics,
/// records the item as the last one added, maintains the cluster's set of content
/// hashes and finally notifies <c>OnAddToCluster</c> and the item itself.
/// </summary>
/// <param name="item">The item to add.</param>
/// <param name="affinity">The affinity recorded for the item in this cluster.</param>
internal void AddToCluster(ClusterItem item, float affinity)
{
    // 100 hashes at most. The buckets where it helps most have very few hashes.
    const int MaxTrackedHashes = 100;

    if (this.Items.ContainsKey(item.Id))
    {
        return;
    }

    this.StatsTextLength.Push(item.TextLength);
    this.StatsTokenCount.Push(item.TokenIndex.TotalCount);
    this.StatsAffinity.Push(affinity);
    this.Items.Add(item.Id, item);
    this.LastItem = item;

    // Content hashes are only tracked for items that have a hash, and only once the
    // cluster is "sharp" enough: mean affinity minus one standard deviation must
    // exceed the configured minimum.
    if (!string.IsNullOrEmpty(item.Hash)
        && this.StatsAffinity.Mean - this.StatsAffinity.StandardDeviation > this.Config.MinClusterAffinity)
    {
        if (this.ItemContentHashes.Count == 0)
        {
            // Nothing tracked yet: seed the hash collection from the current members
            // (which already include the item that was just added).
            foreach (ClusterItem member in this.Items.Values)
            {
                if (string.IsNullOrEmpty(member.Hash))
                {
                    continue;
                }

                if (this.ItemContentHashes.Count >= MaxTrackedHashes)
                {
                    break;
                }

                this.ItemContentHashes.Add(member.Hash);
            }
        }

        if (this.ItemContentHashes.Contains(item.Hash))
        {
            // The hash is already tracked by this cluster.
            item.PreserveHash = false;
        }
        else if (this.ItemContentHashes.Count < MaxTrackedHashes)
        {
            this.ItemContentHashes.Add(item.Hash);
        }
    }

    item.Affinity = affinity;
    this.OnAddToCluster(item, affinity);
    item.OnAdded();
}
/// <summary>
/// Adds a new clustering item. The item is placed into the existing cluster with the
/// best affinity, or into a newly created cluster when no existing cluster reaches
/// <c>Config.MinClusterAffinity</c>. Items whose ID is already known are ignored.
/// </summary>
/// <param name="id">ID to track this item</param>
/// <param name="content">The item's contents - this is what the clustering algorithm parses</param>
public void AddItem(string id, string content)
{
    if (this.IsKnownItem(id))
    {
        return;
    }

    this.InitIfPending();
    var item = new ClusterItem(id, content, this.Vocabulary);

    // Shortlist clusters by content hash: either the cluster's last item has the same
    // hash, or the cluster is "sharp" (mean affinity minus one standard deviation is
    // above the minimum) and already tracks this hash.
    List<ClusterBase> checkClusters = null;
    float bestAffinity = 0.0f;
    for (int c = 0; c < this.Clusters.Count; c++)
    {
        ClusterBase candidate = this.Clusters[c];
        if ((candidate.LastItemHash != null && candidate.LastItemHash == item.Hash) ||
            (candidate.StatsAffinity.Mean - candidate.StatsAffinity.StandardDeviation > this.Config.MinClusterAffinity &&
             !string.IsNullOrEmpty(item.Hash) &&
             candidate.ItemContentHashes.Contains(item.Hash)))
        {
            checkClusters = checkClusters ?? new List<ClusterBase>();
            checkClusters.Add(candidate);
            bestAffinity = 1.0f;
        }
    }

    // Without any hash match every existing cluster is a candidate.
    int bestCluster = -1;
    checkClusters = checkClusters ?? this.Clusters;

    if (checkClusters.Count == 1)
    {
        bestCluster = 0;

        // A hash match has already forced the affinity to 1.0; otherwise measure it.
        if (bestAffinity < 0.99999f)
        {
            bestAffinity = checkClusters[0].GetAffinity(item, this.Config.MinClusterAffinity);
        }
    }
    else if (checkClusters.Count > 1)
    {
        // With more than five candidates, stop scheduling further work as soon as
        // one candidate is a (near) perfect match.
        bool breakOnMax = checkClusters.Count > 5;

        if (this.Config.LogDebug && checkClusters.Count < this.Clusters.Count)
        {
            Debug.WriteLine($"Clusters ({checkClusters.Count} of {this.Clusters.Count}) found by hash. Item {item.Id}, Hash: {item.Hash} Content:{content.Replace('\r', ' ').Replace('\n', ' ')}");
        }

        float[] clusterAffinity = new float[checkClusters.Count];
        var parallelOptions = new ParallelOptions
        {
            MaxDegreeOfParallelism = Math.Max(1, this.Config.MaxDegreeOfParallelism)
        };

        // Each iteration writes only its own slot of clusterAffinity. The winner is
        // selected after the loop, so no shared local is written concurrently and the
        // chosen index always matches the chosen affinity.
        Parallel.For(0, checkClusters.Count, parallelOptions, (ci, state) =>
        {
            float caff = checkClusters[ci].GetAffinity(item, this.Config.MinClusterAffinity);
            clusterAffinity[ci] = caff;
            if (breakOnMax && caff > 0.99999)
            {
                state.Stop();
            }
        });

        // Slots that were never evaluated because of an early stop remain 0 and
        // therefore cannot win. Ties go to the lowest index.
        bestAffinity = 0;
        for (int ci = 0; ci < clusterAffinity.Length; ci++)
        {
            if (clusterAffinity[ci] > bestAffinity)
            {
                bestAffinity = clusterAffinity[ci];
                bestCluster = ci;
            }
        }
    }

    if (bestCluster >= 0 && bestAffinity < this.Config.MinClusterAffinity)
    {
        bestCluster = -1;
    }

    if (bestCluster >= 0)
    {
        ClusterBase cluster = checkClusters[bestCluster];

        // The affinity is measured again here rather than reusing bestAffinity, so a
        // hash match (bestAffinity forced to 1.0) still records the measured value.
        cluster.AddToCluster(item, cluster.GetAffinity(item, this.Config.MinClusterAffinity));
    }
    else
    {
        var newCluster = this.CreateCluster(item);
        this.Clusters.Add(newCluster);
    }
}
/// <summary>
/// Tells whether an item counts as "small": its total token count is below
/// seven times <c>TokenParts</c>.
/// </summary>
/// <param name="item">The item to inspect.</param>
/// <returns><c>true</c> when the item's token count is below the threshold.</returns>
public static bool IsSmallItem(ClusterItem item)
{
    var smallItemLimit = 7 * TokenParts;
    return item.TokenIndex.TotalCount < smallItemLimit;
}
/// <summary>
/// Extension hook invoked by <c>AddToCluster</c> after the item has been recorded in
/// the cluster and its <c>Affinity</c> has been set, and before the item's own
/// <c>OnAdded</c> callback runs. The base implementation does nothing.
/// </summary>
/// <param name="item">The item that was just added.</param>
/// <param name="affinity">The affinity the item was added with.</param>
protected virtual void OnAddToCluster(ClusterItem item, float affinity)
{
    // Intentionally empty: derived clusters override this to update their own state.
}
/// <summary>
/// Algorithm-specific affinity calculation, implemented by derived clusters.
/// <c>GetAffinity</c> calls this only after its trivial cases and its text-length and
/// token-count exclusion checks have not already produced a result.
/// </summary>
/// <param name="item">The item to score against this cluster.</param>
/// <param name="minAffinity">The minimum affinity passed through from the caller of <c>GetAffinity</c>.</param>
/// <returns>The affinity of <paramref name="item"/> to this cluster.</returns>
protected abstract float OnGetAffinity(ClusterItem item, float minAffinity);
/// <summary>
/// Scores how well <paramref name="item"/> fits into this cluster.
/// Trivial cases return 1.0 straight away; items rejected by the text-length or
/// token-count confidence checks return 0. Otherwise the algorithm-specific
/// <c>OnGetAffinity</c> result is returned, optionally re-evaluated against the
/// cluster's affinity statistics.
/// </summary>
/// <param name="item">The item to score.</param>
/// <param name="minAffinity">
/// Passed through to <c>OnGetAffinity</c>. The thresholds applied in this method
/// itself come from <c>Config.MinClusterAffinity</c>.
/// </param>
/// <returns>The affinity of the item to this cluster.</returns>
public float GetAffinity(ClusterItem item, float minAffinity)
{
    // Trivial cases: the item is already a member, or the cluster has no members yet.
    if (this.Items.ContainsKey(item.Id) || this.Items.Count == 0)
    {
        return 1.0f;
    }

    // An item without text matches a cluster whose mean text length is below 1.
    bool itemHasNoText = item.TextLength == 0 || item.Content == null;
    if (itemHasNoText && this.StatsTextLength.Mean < 1)
    {
        return 1.0f;
    }

    // Text-length check, enabled by a positive StDevFactorTextLength.
    if (this.Config.StDevFactorTextLength > 0
        && this.CalculateConfidence(this.StatsTextLength, item.TextLength, this.Config.StDevFactorTextLength, 5) == null)
    {
        if (this.Config.LogDebug)
        {
            Debug.WriteLine($"[TSCBSGIETUJN] Item {item.Id} excluded for text length");
        }

        return 0;
    }

    // Token-count check, enabled by a positive StDevFactorTokenCount.
    if (this.Config.StDevFactorTokenCount > 0
        && this.CalculateConfidence(this.StatsTokenCount, item.TokenIndex.TotalCount, this.Config.StDevFactorTokenCount, 2) == null)
    {
        if (this.Config.LogDebug)
        {
            Debug.WriteLine($"Item {item.Id} excluded for token count");
        }

        return 0;
    }

    float rawAffinity = this.OnGetAffinity(item, minAffinity);

    // The raw score is final unless it passes the minimum, the affinity check is
    // enabled and the cluster has reached MinClusterSizeToSplit members.
    bool recheckAgainstStats = rawAffinity > this.Config.MinClusterAffinity
        && this.Config.StDevFactorAffinity > 0
        && this.Items.Count >= this.Config.MinClusterSizeToSplit;
    if (!recheckAgainstStats)
    {
        return rawAffinity;
    }

    // A null confidence is treated as an affinity of 0.
    float adjustedAffinity = (float)(this.CalculateConfidence(this.StatsAffinity, rawAffinity, this.Config.StDevFactorAffinity, 0.015, false) ?? 0.0);
    if (this.Config.LogDebug && adjustedAffinity < this.Config.MinClusterAffinity)
    {
        Debug.WriteLine($"Item {item.Id} excluded for affinity");
    }

    return adjustedAffinity;
}