Пример #1
0
        /// <summary>
        /// Combines several vectors into one normalized vector.
        /// </summary>
        /// <remarks>
        /// Every component of every input vector is multiplied by that vector's
        /// <c>OriginalLength</c> and the products are summed per key. The summed vector is
        /// then optionally passed through <c>LimitComponentCount</c> and finally through
        /// <c>Normalize</c>.
        /// </remarks>
        /// <param name="vectors">Vectors to combine. Enumerated exactly once; must not be null.</param>
        /// <param name="maxComponentCount">
        /// When it has a value, it is handed to <c>LimitComponentCount</c> before normalization.
        /// When null, no limit is applied.
        /// </param>
        /// <returns>A newly created <see cref="WordVector"/> holding the normalized result.</returns>
        public static WordVector GetNormalizedAverage(IEnumerable <WordVector> vectors, int?maxComponentCount = null)
        {
            WordVector average = new WordVector();

            foreach (WordVector v in vectors)
            {
                foreach (var item in v._itemWeights)
                {
                    var contribution = item.Value * v.OriginalLength;

                    // TryGetValue replaces the ContainsKey + indexer read pair, saving one
                    // hash lookup for every key that has already been accumulated.
                    if (average._itemWeights.TryGetValue(item.Key, out var accumulated))
                    {
                        average._itemWeights[item.Key] = accumulated + contribution;
                    }
                    else
                    {
                        average._itemWeights.Add(item.Key, contribution);
                    }
                }
            }

            // Truncation happens before normalization, so the returned vector is
            // normalized over whatever components remain after the limit is applied.
            if (maxComponentCount.HasValue)
            {
                average.LimitComponentCount(maxComponentCount.Value);
            }

            average.Normalize();
            return average;
        }
Пример #2
0
        /// <summary>
        /// Builds a bit signature for <paramref name="vector"/> with one bit per entry in
        /// <c>_hyperPlanes</c>.
        /// </summary>
        /// <param name="vector">The vector to hash.</param>
        /// <returns>
        /// A <see cref="CustomBitArray"/> in which bit <c>i</c> is set when
        /// <c>_hyperPlanes[i] * vector</c> is greater than or equal to zero.
        /// </returns>
        public CustomBitArray CalculateHashScore(WordVector vector)
        {
            int planeCount = _hyperPlanes.Count;
            CustomBitArray signature = new CustomBitArray(planeCount);

            // Planes are visited from the last index down to the first.
            int index = planeCount;
            while (--index >= 0)
            {
                signature[index] = (_hyperPlanes[index] * vector) >= 0;
            }

            return signature;
        }
Пример #3
0
        /// <summary>
        /// Builds a bit signature for <paramref name="vector"/> with one bit per entry in
        /// <c>_hyperPlanes</c>, and reports whether any bit was set.
        /// </summary>
        /// <param name="vector">The vector to hash.</param>
        /// <param name="anyTrue">
        /// Set to true when at least one bit of the result is set; false otherwise
        /// (including when there are no hyperplanes).
        /// </param>
        /// <returns>
        /// A <see cref="CustomBitArray"/> in which bit <c>i</c> is set only when
        /// <c>_hyperPlanes[i] * vector</c> is strictly greater than zero; a result of
        /// exactly zero leaves the bit cleared.
        /// </returns>
        public CustomBitArray CalculateHashScore(WordVector vector, out bool anyTrue)
        {
            anyTrue = false;

            int planeCount = _hyperPlanes.Count;
            CustomBitArray signature = new CustomBitArray(planeCount);

            // Planes are visited from the last index down to the first.
            int index = planeCount;
            while (--index >= 0)
            {
                bool isPositive = (_hyperPlanes[index] * vector) > 0;
                signature[index] = isPositive;
                anyTrue |= isPositive;
            }

            return signature;
        }
Пример #4
0
        /// <summary>
        /// Creates a new <see cref="WordVector"/> containing the same key/weight entries
        /// and the same <c>_normalized</c> flag as this instance.
        /// </summary>
        /// <returns>An independent vector whose entries can be changed without affecting this one.</returns>
        public WordVector Copy()
        {
            // NOTE(review): only _itemWeights and _normalized are carried over; confirm
            // that no other per-instance state needs to be duplicated.
            WordVector clone = new WordVector { _normalized = _normalized };

            foreach (var entry in _itemWeights)
            {
                clone._itemWeights.Add(entry.Key, entry.Value);
            }

            return clone;
        }
Пример #5
0
        /// <summary>
        /// Creates a new <see cref="LSHashFunction"/> from freshly generated hyperplanes.
        /// </summary>
        /// <remarks>
        /// Generates <c>Settings.TweetClusterer_TCW_HyperPlaneCount</c> planes. Each plane gets
        /// the word IDs returned by <c>GetRandomWordIDs(Settings.TweetClusterer_TCW_WordsPerHyperPlane)</c>,
        /// each paired with a value from <c>Helpers.NextGaussian()</c>. A '.' is written to the
        /// console after each plane as a progress indicator.
        /// </remarks>
        /// <returns>The hash function built from the generated planes.</returns>
        LSHashFunction GetNewHashFunction()
        {
            List <WordVector> hyperPlanes = new List <WordVector>();

            for (int planeIndex = 0; planeIndex < Settings.TweetClusterer_TCW_HyperPlaneCount; ++planeIndex)
            {
                WordVector hyperPlane = new WordVector();

                // The word IDs are drawn first, then one Gaussian value per ID.
                foreach (long wordID in GetRandomWordIDs(Settings.TweetClusterer_TCW_WordsPerHyperPlane))
                {
                    hyperPlane.AddItem(wordID, Helpers.NextGaussian());
                }

                hyperPlanes.Add(hyperPlane);
                Console.Write('.');
            }

            return new LSHashFunction(hyperPlanes);
        }
Пример #6
0
 /// <summary>
 /// Creates a cluster whose <c>WordVector</c> member starts out as a new, empty-constructed vector.
 /// </summary>
 public SimpleTweetCluster()
 {
     this.WordVector = new WordVector();
 }
Пример #7
0
 /// <summary>
 /// Creates a story whose <c>WordVector</c> member starts out as a new, empty-constructed vector.
 /// </summary>
 public SimpleStory()
 {
     this.WordVector = new WordVector();
 }
Пример #8
0
        /// <summary>
        /// Assigns every cluster in <paramref name="clusters"/> to a story, creating a new story
        /// or merging several existing stories where needed.
        /// </summary>
        /// <remarks>
        /// For each cluster (in ascending <c>TweetClusterID</c> order) the value of
        /// <c>story.Vector * cluster.Vector</c> is computed against every story. Although the value is
        /// called "Distance", larger values are treated as a closer match: the list is sorted
        /// descending and compared against the thresholds with <c>&gt;</c>.
        /// Zero matching stories creates a new story, one match assigns the cluster to it, and
        /// several matches merges them into the story with the smallest ID.
        /// </remarks>
        /// <param name="clusters">Clusters to assign; each cluster's <c>StoryID</c> is overwritten.</param>
        /// <param name="stories">Known stories keyed by story ID; new stories are added to it and existing entries are modified.</param>
        /// <param name="nextStoryID">ID to use for the next new story; incremented each time a story is created.</param>
        static void AssignTweetClustersToNearestStory(Dictionary <long, SimpleTweetCluster> clusters, Dictionary <long, SimpleStory> stories, ref long nextStoryID)
        {
            //Assign batch clusters to nearest story
            foreach (SimpleTweetCluster cluster in clusters.Values.OrderBy(n => n.TweetClusterID))
            {
                // IDs of the stories this cluster is considered to belong to.
                List <long> nearStories = new List <long>();
                if (stories.Count > 0)
                {
                    // Deferred query, enumerated once by the foreach below; best matches come first.
                    var distances = stories.Values
                                    .Select(n => new { StoryID = n.StoryID, Distance = n.Vector * cluster.Vector })
                                    .OrderByDescending(n => n.Distance);
                    // double.MaxValue is a sentinel meaning "no previous item seen yet".
                    double      prevDist   = double.MaxValue;
                    // Stories above the lower threshold that are only accepted if a drop is found later.
                    List <long> candidates = new List <long>();
                    foreach (var item in distances)
                    {
                        // First item: seed prevDist with the item's own value, so the drop
                        // comparisons below compare the first item against itself.
                        if (prevDist == double.MaxValue)
                        {
                            prevDist = item.Distance;
                        }

                        if (item.Distance > Settings.TweetClusterer_SW_MergeUpperThreshold) //Similar enough
                        {
                            nearStories.Add(item.StoryID);
                        }
                        else if (item.Distance > Settings.TweetClusterer_SW_MergeLowerThreshold &&
                                 item.Distance > prevDist * Settings.TweetClusterer_SW_MergeDropScale) //Still looking for a drop
                        {
                            candidates.Add(item.StoryID);
                        }
                        // A "drop" is a value at or below the previous value scaled by MergeDropScale.
                        else if (item.Distance <= prevDist * Settings.TweetClusterer_SW_MergeDropScale) //Found a drop
                        {
                            // Everything collected before the drop is accepted.
                            nearStories.AddRange(candidates);
                            candidates.Clear(); //Don't break, as it is theoretically possible to find multiple drops

                            // The item after the drop starts a new candidate run if it is still
                            // above the lower threshold; otherwise the search ends.
                            if (item.Distance > Settings.TweetClusterer_SW_MergeLowerThreshold)
                            {
                                candidates.Add(item.StoryID);
                            }
                            else
                            {
                                break;
                            }
                        }
                        else //if (item.Distance < _storyMergeLowerThreshold) //Didn't find a drop
                        {
                            break;
                        }

                        prevDist = item.Distance;
                    }
                    // Candidates still pending here never saw a drop and are discarded:
                    // they are not added to nearStories.
                }

                //foreach (SimpleStory story in stories.Values)
                //{
                //    double sim = story.Vector * cluster.Vector;
                //    double ratio = sim > 0.1 ? story.Vector.SharedWordRatio(cluster.Vector) : 0;
                //    if (sim > GetSimilarityThreshold(story.Vector.ItemCount) || ratio >= 0.6)
                //        nearStories.Add(story.StoryID);
                //}

                long storyID = -1;
                if (nearStories.Count == 0) //Create new story
                {
                    storyID = nextStoryID++;
                    // NOTE(review): the story is given cluster.Vector itself, not a copy. If WordVector
                    // is a reference type the story and cluster share one object — confirm this is intended.
                    stories.Add(storyID, new SimpleStory()
                    {
                        StoryID = storyID, Vector = cluster.Vector, Changed = true, Created = true
                    });
                }
                else if (nearStories.Count == 1) //Add to story
                {
                    storyID = nearStories[0];
                }
                else //Merge multiple stories
                {
                    // The story with the smallest ID survives the merge.
                    storyID = nearStories.Min();
                    // The surviving vector is rebuilt from the matched stories' vectors only;
                    // the current cluster's vector is not part of this average.
                    stories[storyID].Vector = WordVector.GetNormalizedAverage(nearStories.Select(n => stories[n].Vector), Settings.TweetClusterer_SW_MaxWordsInStoryVector);

                    foreach (long deleteStoryID in nearStories.Where(n => n != storyID))
                    {
                        // Merged stories are only flagged via MergedWith; they are not removed from
                        // the stories dictionary in this method.
                        stories[deleteStoryID].MergedWith = storyID;
                        // Re-point clusters of this batch that were attached to the merged story.
                        foreach (SimpleTweetCluster clusterToMove in clusters.Values.Where(n => n.StoryID == deleteStoryID))
                        {
                            clusterToMove.StoryID = storyID;
                        }
                    }
                }

                cluster.StoryID          = storyID;
                stories[storyID].Changed = true;
            }
        }