Пример #1
0
 /// <summary>
 /// Hashes the tweet's vector, stores the tweet in the cell registered for that hash (creating the
 /// cell when none exists yet) and reports what the cell returned from GetTweets() before the insertion.
 /// </summary>
 /// <param name="tweet">Tweet to insert; its Vector is passed to the hash function.</param>
 /// <param name="anyTrue">
 /// Set by the hash function's CalculateHashScore. When it is false an empty list is returned
 /// even if the cell already held tweets.
 /// </param>
 /// <returns>
 /// The list obtained from the matching cell's GetTweets() (read before the tweet is added) when the
 /// cell already existed and <paramref name="anyTrue"/> is true; otherwise a new empty list.
 /// </returns>
 public List <LSHashTweet> Add(LSHashTweet tweet, out bool anyTrue)
 {
     lock (accessLock)
     {
         CustomBitArray bucketKey = _hashFunction.CalculateHashScore(tweet.Vector, out anyTrue);

         LSHashTableCell bucket;
         if (!_values.TryGetValue(bucketKey, out bucket))
         {
             // First tweet for this hash value: create the cell, fill it, then register it.
             bucket = new LSHashTableCell(_cellCapacity);
             bucket.Add(tweet);
             _values.Add(bucketKey, bucket);
             _lastUpdatedCell = bucket;
             return new List<LSHashTweet>();
         }

         // Read the cell's tweets before inserting the new one.
         List<LSHashTweet> earlierTweets = bucket.GetTweets();
         bucket.Add(tweet);
         _lastUpdatedCell = bucket;

         return anyTrue ? earlierTweets : new List<LSHashTweet>();
     }
 }
Пример #2
0
 /// <summary>
 /// Ranks the neighbor candidates of a tweet, records a relation for each sufficiently similar one
 /// and returns the highest similarity encountered.
 /// </summary>
 /// <param name="relations">Receives a TweetRelationSimple for every candidate accepted as a link.</param>
 /// <param name="t">Tweet the candidates are compared against.</param>
 /// <param name="neighborCandidates">Candidate tweets mapped to the number of times each was in the same bin as <paramref name="t"/>.</param>
 /// <param name="maxSimilarity">Highest similarity known before this call.</param>
 /// <returns>The larger of <paramref name="maxSimilarity"/> and the best similarity among the examined candidates.</returns>
 private static double GetNeighbors(List <TweetRelationSimple> relations, LSHashTweet t, Dictionary <LSHashTweet, int> neighborCandidates, Double maxSimilarity)
 {
     if (neighborCandidates.Count == 0)
     {
         return maxSimilarity;
     }

     // The vector product is computed once per shortlisted candidate and carried along with it,
     // so the loop below does not have to recompute what the sort already evaluated.
     var scoredCandidates = neighborCandidates
         .OrderByDescending(n => n.Value)                                      // most shared bins first
         .Select(n => n.Key)
         .Take(3 * Settings.TweetClusterer_TCW_HashTableCount)                 // keep the 3L best candidates
         .Select(c => new { Tweet = c, Similarity = c.Vector * t.Vector })
         .OrderByDescending(c => c.Similarity)                                 // then rank by actual similarity
         .Take(Settings.TweetClusterer_TCW_MaxLinksPerTweet);

     foreach (var scored in scoredCandidates)
     {
         double similarity = scored.Similarity;
         if (similarity > maxSimilarity)
         {
             maxSimilarity = similarity;
         }

         // A candidate with the same ID as t still counts towards maxSimilarity (above);
         // only the self-relation is suppressed.
         if (similarity > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink && t.ID != scored.Tweet.ID)
         {
             relations.Add(new TweetRelationSimple()
             {
                 TweetID = t.ID, TweetID2 = scored.Tweet.ID, Similarity = similarity
             });
         }
     }

     return maxSimilarity;
 }
Пример #3
0
 /// <summary>
 /// Inserts a tweet into the cell that corresponds to the hash of its vector, creating and
 /// registering a new cell when the hash has not been seen before.
 /// </summary>
 /// <param name="tweet">Tweet to insert; its Vector is what gets hashed.</param>
 /// <param name="anyTrue">
 /// Output of the hash function's CalculateHashScore. If false, the method returns an empty list
 /// regardless of what the cell contained.
 /// </param>
 /// <returns>
 /// For an existing cell with <paramref name="anyTrue"/> true: the list the cell returned from
 /// GetTweets() before this tweet was added. In every other case: a new empty list.
 /// </returns>
 public List<LSHashTweet> Add(LSHashTweet tweet, out bool anyTrue)
 {
     lock (accessLock)
     {
         CustomBitArray key = _hashFunction.CalculateHashScore(tweet.Vector, out anyTrue);

         LSHashTableCell target;
         List<LSHashTweet> earlierTweets = null;
         bool cellExisted = _values.TryGetValue(key, out target);

         if (cellExisted)
         {
             // Must be read before the insertion below.
             earlierTweets = target.GetTweets();
         }
         else
         {
             target = new LSHashTableCell(_cellCapacity);
         }

         target.Add(tweet);

         if (!cellExisted)
         {
             _values.Add(key, target);
         }

         _lastUpdatedCell = target;

         if (cellExisted && anyTrue)
         {
             return earlierTweets;
         }

         return new List<LSHashTweet>();
     }
 }
Пример #4
0
        /// <summary>
        /// Returns up to <paramref name="n"/> tweets from the word index that share at least one word ID
        /// with the given tweet, ordered by descending vector product with it.
        /// </summary>
        /// <param name="tweet">Query tweet; its Vector supplies the word IDs and the similarity operand.</param>
        /// <param name="n">Maximum number of tweets to return; zero or a negative value yields an empty list.</param>
        /// <returns>A new list with the best-matching tweets; empty when nothing is stored or no indexed tweet shares a word.</returns>
        public List <LSHashTweet> GetNearestNeighbors(LSHashTweet tweet, int n)
        {
            // Take(n) yields nothing for n <= 0, so the candidate scan can be skipped in that case too.
            if (n <= 0 || _content.Count == 0)
            {
                return new List<LSHashTweet>();
            }

            HashSet<LSHashTweet> candidates = new HashSet<LSHashTweet>();

            foreach (long wordID in tweet.Vector.GetItemIDs())
            {
                // Single lookup instead of ContainsKey followed by the indexer.
                List<LSHashTweet> tweetsWithWord;
                if (!_wordIndex.TryGetValue(wordID, out tweetsWithWord))
                {
                    continue;
                }

                foreach (LSHashTweet candidate in tweetsWithWord)
                {
                    candidates.Add(candidate);
                }
            }

            if (candidates.Count == 0)
            {
                return new List<LSHashTweet>();
            }

            return candidates
                .OrderByDescending(t => t.Vector * tweet.Vector)
                .Take(n)
                .ToList();
        }
Пример #5
0
 /// <summary>
 /// Enqueues an item and, if that pushes the count above Capacity, dequeues one item.
 /// </summary>
 /// <param name="item">Tweet to enqueue.</param>
 public void Add(LSHashTweet item)
 {
     _content.Enqueue(item);

     bool withinCapacity = !(_content.Count > Capacity);
     if (withinCapacity)
     {
         return;
     }

     // Over the limit: drop a single item from the front of the queue.
     _content.Dequeue();
 }
Пример #6
0
 /// <summary>
 /// Enqueues a tweet, registers it in the word index under every word ID of its vector and
 /// evicts the oldest entry once the count exceeds Capacity.
 /// </summary>
 /// <param name="item">Tweet to store; its Vector supplies the word IDs to index it under.</param>
 public void Add(LSHashTweet item)
 {
     _content.Enqueue(item);

     foreach (long wordID in item.Vector.GetItemIDs())
     {
         // One lookup per word instead of ContainsKey + Add + indexer.
         List<LSHashTweet> tweetsWithWord;
         if (!_wordIndex.TryGetValue(wordID, out tweetsWithWord))
         {
             tweetsWithWord = new List<LSHashTweet>();
             _wordIndex.Add(wordID, tweetsWithWord);
         }

         tweetsWithWord.Add(item);
     }

     if (_content.Count > Capacity)
     {
         RemoveOldest();
     }
 }
Пример #7
0
        /// <summary>
        /// Dequeues one tweet from the content queue and removes it from the word index, dropping
        /// any word entry whose list becomes empty.
        /// </summary>
        void RemoveOldest()
        {
            LSHashTweet evicted = _content.Dequeue();

            foreach (long wordID in evicted.Vector.GetItemIDs())
            {
                // Look the list up once. A word with no index entry is skipped rather than throwing
                // after the tweet has already been dequeued.
                List<LSHashTweet> tweetsWithWord;
                if (!_wordIndex.TryGetValue(wordID, out tweetsWithWord))
                {
                    continue;
                }

                tweetsWithWord.Remove(evicted);

                if (tweetsWithWord.Count == 0)
                {
                    _wordIndex.Remove(wordID);
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Stores a tweet in the content queue and in the word index (once per word ID of its vector),
        /// then calls RemoveOldest if the queue has grown beyond Capacity.
        /// </summary>
        /// <param name="item">Tweet to store; its Vector supplies the word IDs used as index keys.</param>
        public void Add(LSHashTweet item)
        {
            _content.Enqueue(item);

            List<long> wordIDs = item.Vector.GetItemIDs();
            foreach (long wordID in wordIDs)
            {
                // TryGetValue replaces the ContainsKey / Add / indexer sequence with a single lookup.
                List<LSHashTweet> bucket;
                if (!_wordIndex.TryGetValue(wordID, out bucket))
                {
                    bucket = new List<LSHashTweet>();
                    _wordIndex.Add(wordID, bucket);
                }

                bucket.Add(item);
            }

            if (_content.Count > Capacity)
            {
                RemoveOldest();
            }
        }
Пример #9
0
        /// <summary>
        /// Collects every indexed tweet that shares a word ID with the given tweet and returns the
        /// <paramref name="n"/> with the highest vector product against it.
        /// </summary>
        /// <param name="tweet">Query tweet; its Vector provides the word IDs to look up and the comparison operand.</param>
        /// <param name="n">Upper bound on the number of results; values of zero or below produce an empty list.</param>
        /// <returns>A new list ordered by descending vector product; empty if there is no content or no shared word.</returns>
        public List<LSHashTweet> GetNearestNeighbors(LSHashTweet tweet, int n)
        {
            // With n <= 0 the final Take(n) would discard everything, so return before scanning the index.
            if (n <= 0 || _content.Count == 0)
            {
                return new List<LSHashTweet>();
            }

            HashSet<LSHashTweet> candidates = new HashSet<LSHashTweet>();
            List<long> wordIDs = tweet.Vector.GetItemIDs();
            foreach (long wordID in wordIDs)
            {
                // Single dictionary lookup per word.
                List<LSHashTweet> matches;
                if (_wordIndex.TryGetValue(wordID, out matches))
                {
                    foreach (LSHashTweet candidate in matches)
                    {
                        candidates.Add(candidate);
                    }
                }
            }

            if (candidates.Count == 0)
            {
                return new List<LSHashTweet>();
            }

            return candidates.OrderByDescending(t => t.Vector * tweet.Vector).Take(n).ToList();
        }
Пример #10
0
        /// <summary>
        /// Adds the tweet to every hash table in _tables and counts, for each tweet returned as a hit,
        /// how many tables returned it.
        /// </summary>
        /// <remarks>Despite the "Get" name this method mutates the tables: each call inserts <paramref name="t"/>.</remarks>
        /// <param name="t">Tweet to insert and to find candidates for.</param>
        /// <returns>A new dictionary mapping each hit to the number of times it was returned.</returns>
        private Dictionary <LSHashTweet, int> GetNeighborCandidates(LSHashTweet t)
        {
            Dictionary<LSHashTweet, int> neighborCandidates = new Dictionary<LSHashTweet, int>();

            foreach (LSHashTable table in _tables)
            {
                bool ignored; // the anyTrue flag is not needed here
                List<LSHashTweet> hits = table.Add(t, out ignored);

                foreach (LSHashTweet hit in hits)
                {
                    // TryGetValue leaves the count at 0 for an unseen hit, so one code path
                    // covers both the first and the repeated occurrence.
                    int timesSeen;
                    neighborCandidates.TryGetValue(hit, out timesSeen);
                    neighborCandidates[hit] = timesSeen + 1;
                }
            }

            return neighborCandidates;
        }
Пример #11
0
 /// <summary>
 /// Adds an item to the end of the content queue; when the count then exceeds Capacity,
 /// one item is dequeued.
 /// </summary>
 /// <param name="item">Tweet to enqueue.</param>
 public void Add(LSHashTweet item)
 {
     _content.Enqueue(item);

     bool overflowed = _content.Count > Capacity;
     if (overflowed)
     {
         _content.Dequeue();
     }
 }
Пример #12
0
        /// <summary>
        /// Asks _history for the nearest neighbors of a tweet and appends a relation for each one whose
        /// similarity exceeds the configured minimum.
        /// </summary>
        /// <param name="relations">
        /// Relations collected so far; its current count reduces the number of neighbors requested,
        /// and accepted links are appended to it.
        /// </param>
        /// <param name="t">Tweet to find neighbors for.</param>
        /// <param name="maxSimilarity">Highest similarity known before this call.</param>
        /// <returns>
        /// <paramref name="maxSimilarity"/>, raised to the best similarity among the links added here.
        /// Candidates at or below the link threshold do not affect it.
        /// </returns>
        private Double GetNeighborsFromRecentHistory(List <TweetRelationSimple> relations, LSHashTweet t, Double maxSimilarity)
        {
            int linksStillAllowed = Settings.TweetClusterer_TCW_MaxLinksPerTweet - relations.Count;
            double best = maxSimilarity;

            // NOTE(review): unlike GetNeighbors there is no t.ID != candidate.ID guard here — confirm
            // that t cannot already be part of _history when this runs.
            foreach (LSHashTweet other in _history.GetNearestNeighbors(t, linksStillAllowed))
            {
                double score = other.Vector * t.Vector;
                if (!(score > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink))
                {
                    continue;
                }

                TweetRelationSimple link = new TweetRelationSimple();
                link.TweetID    = t.ID;
                link.TweetID2   = other.ID;
                link.Similarity = score;
                relations.Add(link);

                best = score > best ? score : best;
            }

            return best;
        }
Пример #13
0
        /// <summary>
        /// Looks up the nearest neighbors of a tweet in _history and records a relation for every
        /// neighbor whose similarity is above the configured link threshold.
        /// </summary>
        /// <param name="relations">
        /// Existing relations; the number requested from the history is the configured maximum minus
        /// this list's count. New links are appended to it.
        /// </param>
        /// <param name="t">Tweet whose neighbors are looked up.</param>
        /// <param name="maxSimilarity">Highest similarity known before this call.</param>
        /// <returns>The updated maximum; only neighbors that produced a link can raise it.</returns>
        private Double GetNeighborsFromRecentHistory(List<TweetRelationSimple> relations, LSHashTweet t, Double maxSimilarity)
        {
            int requested = Settings.TweetClusterer_TCW_MaxLinksPerTweet - relations.Count;
            List<LSHashTweet> fromHistory = _history.GetNearestNeighbors(t, requested);

            for (int i = 0; i < fromHistory.Count; i++)
            {
                LSHashTweet neighbor = fromHistory[i];
                double similarity = neighbor.Vector * t.Vector;

                bool linkable = similarity > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink;
                if (linkable)
                {
                    relations.Add(new TweetRelationSimple()
                    {
                        TweetID = t.ID,
                        TweetID2 = neighbor.ID,
                        Similarity = similarity
                    });

                    if (similarity > maxSimilarity)
                    {
                        maxSimilarity = similarity;
                    }
                }
            }

            return maxSimilarity;
        }
Пример #14
0
 /// <summary>
 /// Inserts the tweet into each hash table in _tables and tallies how often every returned hit
 /// appears across the tables.
 /// </summary>
 /// <remarks>This is not a pure query: every call adds <paramref name="t"/> to all tables.</remarks>
 /// <param name="t">Tweet to insert and to collect candidates for.</param>
 /// <returns>A new dictionary from hit to the number of times it was returned.</returns>
 private Dictionary<LSHashTweet, int> GetNeighborCandidates(LSHashTweet t)
 {
     Dictionary<LSHashTweet, int> neighborCandidates = new Dictionary<LSHashTweet, int>();

     foreach (LSHashTable table in _tables)
     {
         bool unused; // out flag from Add is not used by this method
         foreach (LSHashTweet hit in table.Add(t, out unused))
         {
             // A missing key leaves the count at 0, so the same statement handles new and known hits
             // without the extra ContainsKey lookup.
             int count;
             neighborCandidates.TryGetValue(hit, out count);
             neighborCandidates[hit] = count + 1;
         }
     }

     return neighborCandidates;
 }
Пример #15
0
 /// <summary>
 /// Shortlists the neighbor candidates of a tweet, adds a relation for each one that is similar
 /// enough, and reports the highest similarity seen.
 /// </summary>
 /// <param name="relations">List to which a TweetRelationSimple is appended for every accepted link.</param>
 /// <param name="t">Tweet that the candidates are compared with.</param>
 /// <param name="neighborCandidates">Candidates mapped to the number of times each was in the same bin as <paramref name="t"/>.</param>
 /// <param name="maxSimilarity">Highest similarity known before this call.</param>
 /// <returns><paramref name="maxSimilarity"/>, raised to the best similarity among the examined candidates if that is higher.</returns>
 private static double GetNeighbors(List<TweetRelationSimple> relations, LSHashTweet t, Dictionary<LSHashTweet, int> neighborCandidates, Double maxSimilarity)
 {
     if (neighborCandidates.Count == 0)
     {
         return maxSimilarity;
     }

     // Pair each shortlisted candidate with its vector product so the value used for sorting
     // is reused in the loop instead of being computed a second time.
     var shortlist = neighborCandidates
         .OrderByDescending(n => n.Value) // sort by number of shared bins
         .Select(n => n.Key)
         .Take(3 * Settings.TweetClusterer_TCW_HashTableCount) // take the 3L best candidates
         .Select(c => new { Candidate = c, Similarity = c.Vector * t.Vector })
         .OrderByDescending(p => p.Similarity) // sort by actual similarity
         .Take(Settings.TweetClusterer_TCW_MaxLinksPerTweet); // take best

     foreach (var pair in shortlist)
     {
         double similarity = pair.Similarity;

         // Every examined candidate, including one with the same ID as t, can raise the maximum.
         if (similarity > maxSimilarity)
         {
             maxSimilarity = similarity;
         }

         // Relations are only recorded above the threshold and never from a tweet to itself.
         if (similarity > Settings.TweetClusterer_TCW_MinTweetSimilarityForLink && t.ID != pair.Candidate.ID)
         {
             relations.Add(new TweetRelationSimple() { TweetID = t.ID, TweetID2 = pair.Candidate.ID, Similarity = similarity });
         }
     }

     return maxSimilarity;
 }