/// <summary>
/// Hashes the tweet's vector, stores the tweet in the cell for that hash value, and reports
/// what that cell returned from <c>GetTweets()</c> immediately before the insert.
/// </summary>
/// <param name="tweet">Tweet to insert; its <c>Vector</c> is the value that gets hashed.</param>
/// <param name="anyTrue">
/// Flag produced by <c>_hashFunction.CalculateHashScore</c>. When it is false the returned list is always empty.
/// </param>
/// <returns>
/// The cell's earlier tweets when the hash was already present and <paramref name="anyTrue"/> is true;
/// otherwise a new empty list. The tweet is stored in every case.
/// </returns>
public List<LSHashTweet> Add(LSHashTweet tweet, out bool anyTrue)
{
    lock (accessLock)
    {
        CustomBitArray bucketKey = _hashFunction.CalculateHashScore(tweet.Vector, out anyTrue);

        LSHashTableCell bucket;
        if (!_values.TryGetValue(bucketKey, out bucket))
        {
            // First tweet with this hash value: open a fresh cell, so there is nothing to report.
            bucket = new LSHashTableCell(_cellCapacity);
            bucket.Add(tweet);
            _values.Add(bucketKey, bucket);
            _lastUpdatedCell = bucket;
            return new List<LSHashTweet>();
        }

        // Read the occupants before inserting the new tweet.
        // NOTE(review): whether GetTweets() hands back a copy or the live list is not visible here - confirm.
        List<LSHashTweet> occupants = bucket.GetTweets();
        bucket.Add(tweet);
        _lastUpdatedCell = bucket;

        return anyTrue ? occupants : new List<LSHashTweet>();
    }
}
/// <summary>
/// Shortlists the candidates for tweet <paramref name="t"/>, appends a relation to
/// <paramref name="relations"/> for each sufficiently similar one, and returns the updated maximum similarity.
/// </summary>
/// <param name="relations">List that receives the newly created links.</param>
/// <param name="t">Tweet whose neighbors are being linked.</param>
/// <param name="neighborCandidates">Candidate tweets mapped to the number of times each was a hit.</param>
/// <param name="maxSimilarity">Highest similarity seen so far.</param>
/// <returns>
/// <paramref name="maxSimilarity"/>, raised to the best similarity among the shortlisted candidates if that is larger.
/// </returns>
private static double GetNeighbors(List<TweetRelationSimple> relations, LSHashTweet t, Dictionary<LSHashTweet, int> neighborCandidates, Double maxSimilarity)
{
    if (neighborCandidates.Count == 0)
    {
        return maxSimilarity;
    }

    // Stage 1: keep the 3L candidates that shared a bin with this tweet most often
    // (L being the configured hash table count).
    IEnumerable<LSHashTweet> mostFrequent = neighborCandidates
        .OrderByDescending(entry => entry.Value)
        .Select(entry => entry.Key)
        .Take(3 * Settings.TweetClusterer_TCW_HashTableCount);

    // Stage 2: re-rank those by actual similarity and keep at most the per-tweet link limit.
    IEnumerable<LSHashTweet> shortlist = mostFrequent
        .OrderByDescending(other => other.Vector * t.Vector)
        .Take(Settings.TweetClusterer_TCW_MaxLinksPerTweet);

    foreach (LSHashTweet other in shortlist)
    {
        double score = other.Vector * t.Vector;

        // The maximum is updated for every shortlisted candidate, including ones that produce no link.
        if (maxSimilarity < score)
        {
            maxSimilarity = score;
        }

        // Links need a score above the threshold and must not point from the tweet to itself.
        bool linkable = score > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink && t.ID != other.ID;
        if (!linkable)
        {
            continue;
        }

        TweetRelationSimple link = new TweetRelationSimple();
        link.TweetID = t.ID;
        link.TweetID2 = other.ID;
        link.Similarity = score;
        relations.Add(link);
    }

    return maxSimilarity;
}
/// <summary>
/// Computes the hash of the tweet's vector, files the tweet under that hash, and returns the tweets
/// the matching cell reported through <c>GetTweets()</c> just before the new tweet was added.
/// </summary>
/// <param name="tweet">Tweet to store; hashing is done on its <c>Vector</c>.</param>
/// <param name="anyTrue">
/// Set by <c>_hashFunction.CalculateHashScore</c>. A false value suppresses the neighbor list.
/// </param>
/// <returns>
/// The tweets previously held by the cell if the hash already existed and <paramref name="anyTrue"/> is true;
/// a new empty list in all other cases. The tweet is always stored.
/// </returns>
public List<LSHashTweet> Add(LSHashTweet tweet, out bool anyTrue)
{
    lock (accessLock)
    {
        CustomBitArray signature = _hashFunction.CalculateHashScore(tweet.Vector, out anyTrue);

        LSHashTableCell target;
        bool cellExists = _values.TryGetValue(signature, out target);
        if (!cellExists)
        {
            // No cell for this hash yet: create one holding only the new tweet.
            target = new LSHashTableCell(_cellCapacity);
            target.Add(tweet);
            _values.Add(signature, target);
            _lastUpdatedCell = target;
            return new List<LSHashTweet>();
        }

        // Capture the cell's tweets first, then insert.
        // NOTE(review): cannot tell from here whether GetTweets() returns a snapshot or the backing list - confirm.
        List<LSHashTweet> earlierTweets = target.GetTweets();
        target.Add(tweet);
        _lastUpdatedCell = target;

        if (!anyTrue)
        {
            return new List<LSHashTweet>();
        }
        return earlierTweets;
    }
}
/// <summary>
/// Finds the stored tweets most similar to <paramref name="tweet"/> among those that share at least
/// one word ID with it.
/// </summary>
/// <param name="tweet">Query tweet; its <c>Vector</c> supplies the word IDs and the similarity operand.</param>
/// <param name="n">Maximum number of tweets to return (passed to <c>Take</c>).</param>
/// <returns>
/// Up to <paramref name="n"/> candidates ordered by descending <c>Vector * Vector</c> similarity,
/// or an empty list when nothing is stored or no stored tweet shares a word.
/// </returns>
public List<LSHashTweet> GetNearestNeighbors(LSHashTweet tweet, int n)
{
    if (_content.Count == 0)
    {
        return new List<LSHashTweet>();
    }

    // A set, so a tweet sharing several words with the query is only ranked once.
    HashSet<LSHashTweet> candidates = new HashSet<LSHashTweet>();
    foreach (long wordID in tweet.Vector.GetItemIDs())
    {
        // Single lookup per word instead of ContainsKey followed by the indexer.
        List<LSHashTweet> tweetsWithWord;
        if (_wordIndex.TryGetValue(wordID, out tweetsWithWord))
        {
            candidates.UnionWith(tweetsWithWord);
        }
    }

    if (candidates.Count == 0)
    {
        return new List<LSHashTweet>();
    }

    return candidates
        .OrderByDescending(t => t.Vector * tweet.Vector)
        .Take(n)
        .ToList();
}
/// <summary>
/// Enqueues a tweet and, if that pushes the queue past <c>Capacity</c>, dequeues one entry from the front.
/// </summary>
/// <param name="item">Tweet to append.</param>
public void Add(LSHashTweet item)
{
    _content.Enqueue(item);

    // At most one entry is dropped per call.
    bool overflowed = _content.Count > Capacity;
    if (overflowed)
    {
        _content.Dequeue();
    }
}
/// <summary>
/// Enqueues a tweet, registers it in the word index under each of its word IDs, and evicts the
/// oldest entry when the queue exceeds <c>Capacity</c>.
/// </summary>
/// <param name="item">Tweet to store; <c>item.Vector.GetItemIDs()</c> supplies the index keys.</param>
public void Add(LSHashTweet item)
{
    _content.Enqueue(item);

    foreach (long wordID in item.Vector.GetItemIDs())
    {
        // One lookup per word; the list is created on first use of a word ID.
        List<LSHashTweet> tweetsWithWord;
        if (!_wordIndex.TryGetValue(wordID, out tweetsWithWord))
        {
            tweetsWithWord = new List<LSHashTweet>();
            _wordIndex.Add(wordID, tweetsWithWord);
        }

        tweetsWithWord.Add(item);
    }

    // Eviction runs after indexing, so the queue and the index are trimmed together.
    if (_content.Count > Capacity)
    {
        RemoveOldest();
    }
}
/// <summary>
/// Dequeues the tweet at the front of the queue and removes it from the word index, dropping any
/// index entry whose list becomes empty.
/// </summary>
private void RemoveOldest()
{
    LSHashTweet oldest = _content.Dequeue();

    foreach (long wordID in oldest.Vector.GetItemIDs())
    {
        // Fetch the list once. The throwing indexer is kept deliberately, so a word ID that is
        // missing from the index still surfaces as KeyNotFoundException, as it did before.
        List<LSHashTweet> tweetsWithWord = _wordIndex[wordID];
        tweetsWithWord.Remove(oldest);

        if (tweetsWithWord.Count == 0)
        {
            _wordIndex.Remove(wordID);
        }
    }
}
/// <summary>
/// Adds a tweet to the queue and to the per-word index, then trims the oldest entry if the queue
/// has grown beyond <c>Capacity</c>.
/// </summary>
/// <param name="item">Tweet to store; it is indexed under every ID from <c>item.Vector.GetItemIDs()</c>.</param>
public void Add(LSHashTweet item)
{
    _content.Enqueue(item);

    List<long> wordIDs = item.Vector.GetItemIDs();
    foreach (long wordID in wordIDs)
    {
        // TryGetValue replaces the ContainsKey / Add / indexer sequence (three lookups) with one.
        List<LSHashTweet> bucket;
        if (!_wordIndex.TryGetValue(wordID, out bucket))
        {
            bucket = new List<LSHashTweet>();
            _wordIndex.Add(wordID, bucket);
        }

        bucket.Add(item);
    }

    if (_content.Count > Capacity)
    {
        RemoveOldest();
    }
}
/// <summary>
/// Returns the stored tweets that are most similar to <paramref name="tweet"/>, considering only
/// tweets indexed under at least one of its word IDs.
/// </summary>
/// <param name="tweet">Query tweet; word IDs come from <c>tweet.Vector.GetItemIDs()</c>.</param>
/// <param name="n">Upper bound on the number of results (passed to <c>Take</c>).</param>
/// <returns>
/// At most <paramref name="n"/> tweets sorted by descending <c>Vector * Vector</c> similarity; an empty
/// list if the store is empty or no indexed tweet shares a word with the query.
/// </returns>
public List<LSHashTweet> GetNearestNeighbors(LSHashTweet tweet, int n)
{
    if (_content.Count == 0)
    {
        return new List<LSHashTweet>();
    }

    // Collected in a set so each tweet appears once regardless of how many words it shares.
    HashSet<LSHashTweet> candidates = new HashSet<LSHashTweet>();
    List<long> wordIDs = tweet.Vector.GetItemIDs();
    foreach (long wordID in wordIDs)
    {
        // One dictionary lookup per word rather than ContainsKey plus indexer.
        List<LSHashTweet> indexed;
        if (_wordIndex.TryGetValue(wordID, out indexed))
        {
            candidates.UnionWith(indexed);
        }
    }

    if (candidates.Count == 0)
    {
        return new List<LSHashTweet>();
    }

    return candidates
        .OrderByDescending(t => t.Vector * tweet.Vector)
        .Take(n)
        .ToList();
}
/// <summary>
/// Inserts <paramref name="t"/> into every table in <c>_tables</c> and counts, per returned tweet,
/// how many tables reported it.
/// </summary>
/// <remarks>This is not a read-only query: each call goes through <c>table.Add</c>.</remarks>
/// <param name="t">Tweet to insert and to find candidates for.</param>
/// <returns>Map from each returned tweet to the number of times it was returned.</returns>
private Dictionary<LSHashTweet, int> GetNeighborCandidates(LSHashTweet t)
{
    Dictionary<LSHashTweet, int> neighborCandidates = new Dictionary<LSHashTweet, int>();

    foreach (LSHashTable table in _tables)
    {
        bool ignored; // required by the Add signature; the flag is not used here
        List<LSHashTweet> hits = table.Add(t, out ignored);

        foreach (LSHashTweet hit in hits)
        {
            // TryGetValue leaves the count at 0 for an unseen tweet, so one path handles both cases.
            int timesSeen;
            neighborCandidates.TryGetValue(hit, out timesSeen);
            neighborCandidates[hit] = timesSeen + 1;
        }
    }

    return neighborCandidates;
}
/// <summary>
/// Appends a tweet to the queue; when the queue then holds more than <c>Capacity</c> entries,
/// a single entry is dequeued from the front.
/// </summary>
/// <param name="item">Tweet to enqueue.</param>
public void Add(LSHashTweet item)
{
    _content.Enqueue(item);

    // The size check runs after the insert, so the new tweet itself counts toward the limit.
    bool overCapacity = _content.Count > Capacity;
    if (overCapacity)
    {
        _content.Dequeue();
    }
}
/// <summary>
/// Asks <c>_history</c> for the nearest neighbors of <paramref name="t"/> and links every one whose
/// similarity exceeds the configured minimum.
/// </summary>
/// <param name="relations">Links found so far; new links are appended, and its count reduces the lookup size.</param>
/// <param name="t">Tweet being linked.</param>
/// <param name="maxSimilarity">Highest similarity seen so far.</param>
/// <returns>
/// <paramref name="maxSimilarity"/>, raised to the best similarity among the newly linked tweets if that is larger.
/// </returns>
private Double GetNeighborsFromRecentHistory(List<TweetRelationSimple> relations, LSHashTweet t, Double maxSimilarity)
{
    // Only request as many neighbors as the per-tweet link limit still allows.
    int remainingLinks = Settings.TweetClusterer_TCW_MaxLinksPerTweet - relations.Count;
    List<LSHashTweet> recentMatches = _history.GetNearestNeighbors(t, remainingLinks);

    foreach (LSHashTweet other in recentMatches)
    {
        double score = other.Vector * t.Vector;

        bool aboveThreshold = score > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink;
        if (!aboveThreshold)
        {
            continue;
        }

        // NOTE(review): no check that other.ID differs from t.ID here - confirm t is never in _history at this point.
        TweetRelationSimple link = new TweetRelationSimple();
        link.TweetID = t.ID;
        link.TweetID2 = other.ID;
        link.Similarity = score;
        relations.Add(link);

        // The maximum only moves for tweets that were actually linked.
        if (maxSimilarity < score)
        {
            maxSimilarity = score;
        }
    }

    return maxSimilarity;
}
/// <summary>
/// Looks up the nearest neighbors of <paramref name="t"/> in <c>_history</c> and adds a relation for
/// each one that is more similar than the configured link threshold.
/// </summary>
/// <param name="relations">Existing links; receives the new ones, and its current count shrinks the lookup size.</param>
/// <param name="t">Tweet being linked.</param>
/// <param name="maxSimilarity">Highest similarity found before this call.</param>
/// <returns>The highest similarity after taking the newly linked tweets into account.</returns>
private Double GetNeighborsFromRecentHistory(List<TweetRelationSimple> relations, LSHashTweet t, Double maxSimilarity)
{
    // Budget = configured links per tweet minus the links already collected.
    int linkBudget = Settings.TweetClusterer_TCW_MaxLinksPerTweet - relations.Count;
    List<LSHashTweet> historyHits = _history.GetNearestNeighbors(t, linkBudget);

    foreach (LSHashTweet hit in historyHits)
    {
        double score = hit.Vector * t.Vector;

        bool strongEnough = score > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink;
        if (!strongEnough)
        {
            continue;
        }

        // NOTE(review): self-links (hit.ID equal to t.ID) are not filtered here - verify against the caller.
        TweetRelationSimple relation = new TweetRelationSimple();
        relation.TweetID = t.ID;
        relation.TweetID2 = hit.ID;
        relation.Similarity = score;
        relations.Add(relation);

        // Tweets below the threshold never influence the maximum.
        if (maxSimilarity < score)
        {
            maxSimilarity = score;
        }
    }

    return maxSimilarity;
}
/// <summary>
/// Adds <paramref name="t"/> to each table in <c>_tables</c> and tallies how often every tweet
/// returned by those adds was reported.
/// </summary>
/// <remarks>The lookup has a side effect: <c>table.Add</c> is invoked once per table.</remarks>
/// <param name="t">Tweet to insert and to collect candidates for.</param>
/// <returns>Dictionary mapping each returned tweet to its hit count.</returns>
private Dictionary<LSHashTweet, int> GetNeighborCandidates(LSHashTweet t)
{
    Dictionary<LSHashTweet, int> neighborCandidates = new Dictionary<LSHashTweet, int>();

    foreach (LSHashTable table in _tables)
    {
        bool unusedFlag; // out parameter demanded by Add; its value is not needed here
        foreach (LSHashTweet hit in table.Add(t, out unusedFlag))
        {
            // Single lookup: a missing key yields 0, so new and known hits share one code path.
            int hitCount;
            neighborCandidates.TryGetValue(hit, out hitCount);
            neighborCandidates[hit] = hitCount + 1;
        }
    }

    return neighborCandidates;
}
/// <summary>
/// Narrows the candidate set for tweet <paramref name="t"/> in two passes (hit count, then similarity),
/// records a relation for each candidate that clears the link threshold, and returns the updated maximum similarity.
/// </summary>
/// <param name="relations">Receives the new links.</param>
/// <param name="t">Tweet whose neighbors are being linked.</param>
/// <param name="neighborCandidates">Candidate tweets with the number of times each was a hit.</param>
/// <param name="maxSimilarity">Highest similarity found before this call.</param>
/// <returns>The highest similarity after inspecting the shortlisted candidates.</returns>
private static double GetNeighbors(List<TweetRelationSimple> relations, LSHashTweet t, Dictionary<LSHashTweet, int> neighborCandidates, Double maxSimilarity)
{
    if (neighborCandidates.Count == 0)
    {
        return maxSimilarity;
    }

    // Pass 1: order by how often the candidate landed in the same bin as this tweet and keep
    // the 3L best (L = configured hash table count).
    IEnumerable<LSHashTweet> byHitCount = neighborCandidates
        .OrderByDescending(pair => pair.Value)
        .Select(pair => pair.Key)
        .Take(3 * Settings.TweetClusterer_TCW_HashTableCount);

    // Pass 2: order the survivors by actual similarity and keep up to the per-tweet link limit.
    IEnumerable<LSHashTweet> bySimilarity = byHitCount
        .OrderByDescending(neighbor => neighbor.Vector * t.Vector)
        .Take(Settings.TweetClusterer_TCW_MaxLinksPerTweet);

    foreach (LSHashTweet neighbor in bySimilarity)
    {
        double score = neighbor.Vector * t.Vector;

        // Every shortlisted candidate can raise the maximum, whether or not it ends up linked.
        if (maxSimilarity < score)
        {
            maxSimilarity = score;
        }

        // A link requires a score above the threshold and two distinct tweet IDs.
        bool shouldLink = score > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink && t.ID != neighbor.ID;
        if (shouldLink)
        {
            TweetRelationSimple relation = new TweetRelationSimple();
            relation.TweetID = t.ID;
            relation.TweetID2 = neighbor.ID;
            relation.Similarity = score;
            relations.Add(relation);
        }
    }

    return maxSimilarity;
}