/// <summary>
/// Sums the supplied vectors, weighting every item by the owning vector's
/// <c>OriginalLength</c>, and returns the normalized result.
/// </summary>
/// <param name="vectors">Vectors to combine. Enumerated exactly once.</param>
/// <param name="maxComponentCount">
/// When set, the summed vector is passed through <c>LimitComponentCount</c>
/// with this value before it is normalized.
/// </param>
/// <returns>A new <see cref="WordVector"/> on which <c>Normalize()</c> has been called.</returns>
public static WordVector GetNormalizedAverage(IEnumerable<WordVector> vectors, int? maxComponentCount = null)
{
    WordVector sum = new WordVector();

    foreach (WordVector source in vectors)
    {
        foreach (var entry in source._itemWeights)
        {
            // Scale each weight by the length of the vector it came from.
            var contribution = entry.Value * source.OriginalLength;

            if (!sum._itemWeights.ContainsKey(entry.Key))
            {
                sum._itemWeights.Add(entry.Key, contribution);
            }
            else
            {
                sum._itemWeights[entry.Key] += contribution;
            }
        }
    }

    // Trimming happens on the raw sum, i.e. before normalization.
    if (maxComponentCount != null)
    {
        sum.LimitComponentCount(maxComponentCount.Value);
    }

    sum.Normalize();
    return sum;
}
/// <summary>
/// Computes one bit per hyperplane for <paramref name="vector"/>: bit <c>i</c> is set
/// when the product of hyperplane <c>i</c> and the vector is greater than or equal to zero.
/// </summary>
/// <param name="vector">The vector to hash.</param>
/// <returns>A <see cref="CustomBitArray"/> with as many bits as there are hyperplanes.</returns>
public CustomBitArray CalculateHashScore(WordVector vector)
{
    int planeCount = _hyperPlanes.Count;
    CustomBitArray signature = new CustomBitArray(planeCount);

    // Walk the planes from last to first.
    for (int index = planeCount - 1; index >= 0; index--)
    {
        var projection = _hyperPlanes[index] * vector;

        // NOTE(review): a product of exactly 0 sets the bit here, whereas the overload
        // with the out parameter uses a strict '>' comparison - confirm this is intended.
        signature[index] = projection >= 0;
    }

    return signature;
}
/// <summary>
/// Computes one bit per hyperplane for <paramref name="vector"/>: bit <c>i</c> is set
/// when the product of hyperplane <c>i</c> and the vector is strictly greater than zero.
/// </summary>
/// <param name="vector">The vector to hash.</param>
/// <param name="anyTrue">Set to <c>true</c> when at least one bit in the result is set; otherwise <c>false</c>.</param>
/// <returns>A <see cref="CustomBitArray"/> with as many bits as there are hyperplanes.</returns>
public CustomBitArray CalculateHashScore(WordVector vector, out bool anyTrue)
{
    anyTrue = false;

    int planeCount = _hyperPlanes.Count;
    CustomBitArray signature = new CustomBitArray(planeCount);

    // Walk the planes from last to first.
    for (int index = planeCount - 1; index >= 0; index--)
    {
        bool isPositive = (_hyperPlanes[index] * vector) > 0;
        signature[index] = isPositive;

        if (isPositive)
        {
            anyTrue = true;
        }
    }

    return signature;
}
/// <summary>
/// Creates a new <see cref="WordVector"/> holding the same item weights and the same
/// <c>_normalized</c> flag as this instance.
/// </summary>
/// <returns>The newly created vector.</returns>
public WordVector Copy()
{
    // NOTE(review): only _itemWeights and _normalized are transferred; confirm no
    // other per-instance state needs to be carried over.
    WordVector clone = new WordVector();

    foreach (var entry in _itemWeights)
    {
        clone._itemWeights.Add(entry.Key, entry.Value);
    }

    clone._normalized = _normalized;
    return clone;
}
/// <summary>
/// Builds a new <see cref="LSHashFunction"/> from freshly generated hyperplanes.
/// </summary>
/// <remarks>
/// Generates <c>Settings.TweetClusterer_TCW_HyperPlaneCount</c> planes. Each plane receives
/// one <c>Helpers.NextGaussian()</c> weight for every ID returned by
/// <c>GetRandomWordIDs(Settings.TweetClusterer_TCW_WordsPerHyperPlane)</c>.
/// A '.' is written to the console after each plane.
/// </remarks>
/// <returns>The hash function wrapping the generated planes.</returns>
LSHashFunction GetNewHashFunction()
{
    List<WordVector> hyperPlanes = new List<WordVector>();

    int built = 0;
    while (built < Settings.TweetClusterer_TCW_HyperPlaneCount)
    {
        WordVector hyperPlane = new WordVector();

        List<long> sampledWordIDs = GetRandomWordIDs(Settings.TweetClusterer_TCW_WordsPerHyperPlane);
        foreach (long wordID in sampledWordIDs)
        {
            hyperPlane.AddItem(wordID, Helpers.NextGaussian());
        }

        hyperPlanes.Add(hyperPlane);

        // Progress marker: one dot per generated plane.
        Console.Write('.');
        built++;
    }

    return new LSHashFunction(hyperPlanes);
}
/// <summary>
/// Creates a cluster whose <c>WordVector</c> member is initialized to a new, default-constructed vector.
/// </summary>
public SimpleTweetCluster()
{
    this.WordVector = new WordVector();
}
/// <summary>
/// Creates a story whose <c>WordVector</c> member is initialized to a new, default-constructed vector.
/// </summary>
public SimpleStory()
{
    this.WordVector = new WordVector();
}
/// <summary>
/// Assigns every cluster to a story, processing clusters in ascending <c>TweetClusterID</c> order.
/// </summary>
/// <remarks>
/// For each cluster the set of near stories is determined first, then:
/// no near story - a new story is created from the cluster's vector using <paramref name="nextStoryID"/>;
/// one near story - the cluster joins it;
/// several near stories - they are merged into the one with the lowest ID, whose vector becomes
/// the normalized average of all of them. The other stories get <c>MergedWith</c> set and their
/// clusters are re-pointed to the surviving story.
/// In all cases the chosen story is flagged as <c>Changed</c>.
/// </remarks>
/// <param name="clusters">Clusters to assign; their <c>StoryID</c> is updated in place.</param>
/// <param name="stories">Known stories; new stories are added to this dictionary.</param>
/// <param name="nextStoryID">ID to use for the next new story; incremented each time one is created.</param>
static void AssignTweetClustersToNearestStory(Dictionary<long, SimpleTweetCluster> clusters, Dictionary<long, SimpleStory> stories, ref long nextStoryID)
{
    foreach (SimpleTweetCluster cluster in clusters.Values.OrderBy(c => c.TweetClusterID))
    {
        List<long> nearStories = FindStoriesNearCluster(cluster, stories);

        long targetStoryID;
        switch (nearStories.Count)
        {
            case 0:
                // Nothing close enough: start a new story from this cluster.
                targetStoryID = nextStoryID;
                nextStoryID++;
                stories.Add(targetStoryID, new SimpleStory()
                {
                    StoryID = targetStoryID,
                    Vector = cluster.Vector,
                    Changed = true,
                    Created = true
                });
                break;

            case 1:
                // Exactly one match: join it.
                targetStoryID = nearStories[0];
                break;

            default:
                // Several matches: fold them all into the story with the lowest ID.
                targetStoryID = nearStories.Min();
                SimpleStory survivor = stories[targetStoryID];
                survivor.Vector = WordVector.GetNormalizedAverage(
                    nearStories.Select(id => stories[id].Vector),
                    Settings.TweetClusterer_SW_MaxWordsInStoryVector);

                foreach (long absorbedStoryID in nearStories)
                {
                    if (absorbedStoryID == targetStoryID)
                    {
                        continue;
                    }

                    stories[absorbedStoryID].MergedWith = targetStoryID;

                    // Re-point every cluster that belonged to the absorbed story.
                    foreach (SimpleTweetCluster other in clusters.Values)
                    {
                        if (other.StoryID == absorbedStoryID)
                        {
                            other.StoryID = targetStoryID;
                        }
                    }
                }
                break;
        }

        cluster.StoryID = targetStoryID;
        stories[targetStoryID].Changed = true;
    }
}

/// <summary>
/// Returns the IDs of the stories that <paramref name="cluster"/> is considered near to.
/// </summary>
/// <remarks>
/// Stories are visited from the highest to the lowest vector product with the cluster.
/// A story above the upper merge threshold is accepted immediately. A story above the lower
/// threshold is held as a candidate and only accepted once a later story's product falls to or
/// below the previous product times the drop scale. Scanning stops at the first story that is
/// at or below the lower threshold (after promoting candidates if that story also marks a drop).
/// </remarks>
static List<long> FindStoriesNearCluster(SimpleTweetCluster cluster, Dictionary<long, SimpleStory> stories)
{
    List<long> accepted = new List<long>();
    if (stories.Count == 0)
    {
        return accepted;
    }

    var upperThreshold = Settings.TweetClusterer_SW_MergeUpperThreshold;
    var lowerThreshold = Settings.TweetClusterer_SW_MergeLowerThreshold;
    var dropScale = Settings.TweetClusterer_SW_MergeDropScale;

    var ranked = stories.Values
        .Select(story => new { Id = story.StoryID, Similarity = story.Vector * cluster.Vector })
        .OrderByDescending(entry => entry.Similarity);

    List<long> pending = new List<long>();
    double previous = double.MaxValue; // sentinel: no story has been looked at yet

    foreach (var entry in ranked)
    {
        var similarity = entry.Similarity;
        if (previous == double.MaxValue)
        {
            previous = similarity;
        }

        var dropLimit = previous * dropScale;
        bool aboveLower = similarity > lowerThreshold;

        if (similarity > upperThreshold)
        {
            // Similar enough on its own.
            accepted.Add(entry.Id);
        }
        else if (aboveLower && similarity > dropLimit)
        {
            // Plausible, but still waiting for a drop to confirm it.
            pending.Add(entry.Id);
        }
        else if (similarity <= dropLimit)
        {
            // Found a drop: everything held back before it is confirmed.
            accepted.AddRange(pending);
            pending.Clear();

            if (!aboveLower)
            {
                return accepted;
            }

            // Keep scanning; a further drop may follow.
            pending.Add(entry.Id);
        }
        else
        {
            // Fell to or below the lower threshold without a drop.
            return accepted;
        }

        previous = similarity;
    }

    return accepted;
}