Exemple #1
0
        /// <summary>
        /// Builds the cluster that will hold <paramref name="item"/> as its first member.
        /// Override this method to substitute a different clustering algorithm.
        /// </summary>
        /// <param name="item">Item that seeds the new cluster; it is added with an affinity of 1.0.</param>
        /// <returns>A new <see cref="ZipLineCluster"/> that already contains <paramref name="item"/>.</returns>
        protected virtual ClusterBase CreateCluster(ClusterItem item)
        {
            ClusterBase seededCluster = new ZipLineCluster(this.Config, this.Vocabulary);

            // The seeding item is added with an affinity of 1.0.
            seededCluster.AddToCluster(item, 1.0f);

            return seededCluster;
        }
Exemple #2
0
        /// <summary>
        /// Registers an item with this cluster: updates the running statistics, stores the item,
        /// maintains the bounded set of content hashes and finally notifies the cluster and the item.
        /// Does nothing when an item with the same ID is already present.
        /// </summary>
        /// <param name="item">The item to add.</param>
        /// <param name="affinity">Affinity to record for the item and to push into the affinity statistics.</param>
        internal void AddToCluster(ClusterItem item, float affinity)
        {
            // 100 hashes at most. The buckets where it helps most have very few hashes
            const int maxTrackedHashes = 100;

            // Items that are already tracked are ignored entirely.
            if (this.Items.ContainsKey(item.Id))
            {
                return;
            }

            this.StatsTextLength.Push(item.TextLength);
            this.StatsTokenCount.Push(item.TokenIndex.TotalCount);
            this.StatsAffinity.Push(affinity);

            this.Items.Add(item.Id, item);
            this.LastItem = item;

            // Content hashes are only tracked once the cluster is "sharp" enough, i.e. its
            // mean affinity minus one standard deviation exceeds the configured minimum.
            bool trackHashes = !string.IsNullOrEmpty(item.Hash) &&
                               this.StatsAffinity.Mean - this.StatsAffinity.StandardDeviation > this.Config.MinClusterAffinity;

            if (trackHashes)
            {
                if (this.ItemContentHashes.Count == 0)
                {
                    // First time the cluster qualifies: seed the hash collection from the
                    // items stored so far (the current item is already among them).
                    foreach (ClusterItem member in this.Items.Values.Where(m => !string.IsNullOrEmpty(m.Hash)))
                    {
                        if (this.ItemContentHashes.Count >= maxTrackedHashes)
                        {
                            break;
                        }

                        this.ItemContentHashes.Add(member.Hash);
                    }
                }

                if (this.ItemContentHashes.Contains(item.Hash))
                {
                    item.PreserveHash = false;
                }
                else if (this.ItemContentHashes.Count < maxTrackedHashes)
                {
                    this.ItemContentHashes.Add(item.Hash);
                }
            }

            item.Affinity = affinity;
            this.OnAddToCluster(item, affinity);
            item.OnAdded();
        }
Exemple #3
0
        /// <summary>
        /// Adds a new clustering item. The item is placed into the best matching existing
        /// cluster, or into a newly created cluster when no existing cluster reaches
        /// <c>Config.MinClusterAffinity</c>. Items whose ID is already known are ignored.
        /// </summary>
        /// <param name="id">ID to track this item</param>
        /// <param name="content">The item's contents - this is what the clustering algorithm parses</param>
        public void AddItem(string id, string content)
        {
            if (!this.IsKnownItem(id))
            {
                this.InitIfPending();

                var item = new ClusterItem(id, content, this.Vocabulary);
                List <ClusterBase> checkClusters = null;

                // Pass 1: shortlist clusters by content hash. A cluster qualifies when its last
                // item carries the same hash, or when it is "sharp" (mean affinity minus one
                // standard deviation above the minimum) and already tracks the item's hash.
                float bestAffinity = 0.0f;
                for (int c = 0; c < this.Clusters.Count; c++)
                {
                    if ((this.Clusters[c].LastItemHash != null && this.Clusters[c].LastItemHash == item.Hash) ||
                        (this.Clusters[c].StatsAffinity.Mean - this.Clusters[c].StatsAffinity.StandardDeviation > this.Config.MinClusterAffinity &&
                         !string.IsNullOrEmpty(item.Hash) && this.Clusters[c].ItemContentHashes.Contains(item.Hash)))
                    {
                        checkClusters = checkClusters ?? new List <ClusterBase>();
                        checkClusters.Add(this.Clusters[c]);
                        bestAffinity = 1.0f;
                    }
                }

                // Without a hash shortlist every cluster is a candidate.
                int bestCluster = -1;
                checkClusters = checkClusters ?? this.Clusters;

                if (checkClusters.Count <= 1)
                {
                    if (checkClusters.Count == 1)
                    {
                        bestCluster = 0;

                        // A single hash match keeps the 1.0 placeholder; otherwise measure the affinity.
                        if (bestAffinity < 0.99999f)
                        {
                            bestAffinity = checkClusters[0].GetAffinity(item, this.Config.MinClusterAffinity);
                        }
                    }
                }
                else
                {
                    // With more than five candidates, stop as soon as a (near) perfect match is found.
                    bool breakOnMax = checkClusters.Count > 5;

                    if (this.Config.LogDebug && checkClusters.Count < this.Clusters.Count)
                    {
                        // content may be null; the null-conditional keeps the debug log from throwing.
                        string flatContent = content?.Replace('\r', ' ').Replace('\n', ' ');
                        Debug.WriteLine($"Clusters ({checkClusters.Count} of {this.Clusters.Count}) found by hash. Item {item.Id}, Hash: {item.Hash} Content:{flatContent}");
                    }

                    float[] clusterAffinity = new float[checkClusters.Count];

                    Parallel.For(0, checkClusters.Count, new ParallelOptions {
                        MaxDegreeOfParallelism = Math.Max(1, this.Config.MaxDegreeOfParallelism)
                    },
                                 (ci, state) =>
                    {
                        ClusterBase cluster = checkClusters[ci];

                        float caff          = cluster.GetAffinity(item, this.Config.MinClusterAffinity);
                        clusterAffinity[ci] = caff;
                        if (breakOnMax && caff > 0.99999)
                        {
                            // Several workers may get here concurrently. Publish the winning index
                            // atomically (first writer wins) so that the index and the affinity
                            // derived from it after the loop always belong to the same cluster.
                            System.Threading.Interlocked.CompareExchange(ref bestCluster, ci, -1);
                            state.Stop();
                        }
                    });

                    if (bestCluster >= 0)
                    {
                        // Early exit: the slot was written before the index was published.
                        bestAffinity = clusterAffinity[bestCluster];
                    }
                    else
                    {
                        // No early exit: pick the candidate with the highest affinity.
                        bestAffinity = 0;
                        for (int ci = 0; ci < checkClusters.Count; ci++)
                        {
                            if (clusterAffinity[ci] > bestAffinity)
                            {
                                bestAffinity = clusterAffinity[ci];
                                bestCluster  = ci;
                            }
                        }
                    }
                }

                if (bestCluster >= 0 && bestAffinity < this.Config.MinClusterAffinity)
                {
                    bestCluster = -1;
                }
                if (bestCluster >= 0)
                {
                    // The affinity is evaluated again here instead of reusing bestAffinity,
                    // because bestAffinity can be the 1.0 placeholder assigned for hash matches.
                    ClusterBase cluster = checkClusters[bestCluster];
                    cluster.AddToCluster(item, cluster.GetAffinity(item, this.Config.MinClusterAffinity));
                }
                else
                {
                    var newCluster = this.CreateCluster(item);
                    this.Clusters.Add(newCluster);
                }
            }
        }
Exemple #4
0
 /// <summary>
 /// Tells whether an item counts as "small", i.e. whether its total token count
 /// is below 7 * TokenParts.
 /// </summary>
 /// <param name="item">The item whose token index is inspected.</param>
 /// <returns><c>true</c> when the item's total token count is less than 7 * TokenParts.</returns>
 public static bool IsSmallItem(ClusterItem item)
 {
     return item.TokenIndex.TotalCount < 7 * TokenParts;
 }
Exemple #5
0
 /// <summary>
 /// Extension hook with an empty default implementation. AddToCluster invokes it once the
 /// item has been stored and its Affinity has been set; derived classes may override it
 /// to react to the addition.
 /// </summary>
 /// <param name="item">The item that was just added.</param>
 /// <param name="affinity">The affinity value that was supplied for the item.</param>
 protected virtual void OnAddToCluster(ClusterItem item, float affinity)
 {
 }
Exemple #6
0
 /// <summary>
 /// Computes the affinity of an item to this cluster; every derived class must implement it.
 /// GetAffinity calls this after its trivial-case, text-length and token-count checks.
 /// </summary>
 /// <param name="item">The item to score.</param>
 /// <param name="minAffinity">Value forwarded unchanged from GetAffinity; how an implementation uses it is not visible here.</param>
 /// <returns>The affinity score, which GetAffinity compares against Config.MinClusterAffinity.</returns>
 protected abstract float OnGetAffinity(ClusterItem item, float minAffinity);
Exemple #7
0
        /// <summary>
        /// Scores how well an item fits this cluster. Trivial cases and the configured
        /// text-length / token-count checks are handled first; otherwise the score comes from
        /// <c>OnGetAffinity</c> and, for sufficiently large clusters, is re-evaluated against
        /// the cluster's affinity statistics.
        /// </summary>
        /// <param name="item">The item to score.</param>
        /// <param name="minAffinity">Minimum affinity, passed through to <c>OnGetAffinity</c>.</param>
        /// <returns>
        /// 1.0 for the trivial cases, 0 when the item is excluded by a length or token-count check,
        /// otherwise the computed (and possibly re-evaluated) affinity.
        /// </returns>
        public float GetAffinity(ClusterItem item, float minAffinity)
        {
            // Trivial cases: the item is already a member, or the cluster has no members yet.
            if (this.Items.ContainsKey(item.Id) || !this.Items.Any())
            {
                return 1.0f;
            }

            // An item without text matches a cluster whose mean text length is below 1.
            bool itemHasNoText = item.TextLength == 0 || item.Content == null;
            if (itemHasNoText && this.StatsTextLength.Mean < 1)
            {
                return 1.0f;
            }

            // Text-length check; only active when its factor is configured above zero.
            if (this.Config.StDevFactorTextLength > 0 &&
                this.CalculateConfidence(this.StatsTextLength, item.TextLength, this.Config.StDevFactorTextLength, 5) == null)
            {
                if (this.Config.LogDebug)
                {
                    Debug.WriteLine($"[TSCBSGIETUJN] Item {item.Id} excluded for text length");
                }

                return 0;
            }

            // Token-count check; only active when its factor is configured above zero.
            if (this.Config.StDevFactorTokenCount > 0 &&
                this.CalculateConfidence(this.StatsTokenCount, item.TokenIndex.TotalCount, this.Config.StDevFactorTokenCount, 2) == null)
            {
                if (this.Config.LogDebug)
                {
                    Debug.WriteLine($"Item {item.Id} excluded for token count");
                }

                return 0;
            }

            float rawAffinity = this.OnGetAffinity(item, minAffinity);

            // The score is re-evaluated against the affinity statistics only when it passes the
            // minimum, the affinity factor is enabled and the cluster is large enough to split.
            bool recheckAgainstStats = rawAffinity > this.Config.MinClusterAffinity &&
                                       this.Config.StDevFactorAffinity > 0 &&
                                       this.Items.Count >= this.Config.MinClusterSizeToSplit;
            if (!recheckAgainstStats)
            {
                return rawAffinity;
            }

            // A null confidence is mapped to an affinity of 0.
            float adjustedAffinity = (float)(this.CalculateConfidence(this.StatsAffinity, rawAffinity, this.Config.StDevFactorAffinity, 0.015, false) ?? 0.0);
            if (this.Config.LogDebug && adjustedAffinity < this.Config.MinClusterAffinity)
            {
                Debug.WriteLine($"Item {item.Id} excluded for affinity");
            }

            return adjustedAffinity;
        }