/// <summary>
        /// Registers a tokenized document: stores the document under its hash and
        /// records that hash against every distinct token of the document.
        /// </summary>
        /// <param name="tokenized">The document together with its tokens.</param>
        public void PushTokenizedDocument(ITokenizedDocument tokenized)
        {
            var hash = _hasher.GetHash(tokenized.Document);
            _documentsByHash[hash] = tokenized.Document;

            // Each token is processed once, no matter how often it occurs in the document.
            var uniqueTokens = tokenized.Tokens.Distinct();
            foreach (var uniqueToken in uniqueTokens)
            {
                // NOTE(review): the update delegate mutates the already-stored set in place;
                // if documents can be pushed concurrently, confirm that this set is
                // adequately synchronized.
                _tokensToHashes.AddOrUpdate(
                    uniqueToken,
                    key => new HashSet<int> { hash },
                    (key, knownHashes) =>
                    {
                        knownHashes.Add(hash);
                        return knownHashes;
                    });
            }
        }
        /// <summary>
        /// Ranks stored documents by how many of the query's distinct tokens they are
        /// indexed under.
        /// </summary>
        /// <param name="queried">The tokenized query document.</param>
        /// <returns>
        /// One <see cref="DocumentSimilarityPair"/> per document that was hit at least once,
        /// sorted by descending similarity (the hit count).
        /// </returns>
        public List<DocumentSimilarityPair> SimilarityRanks(ITokenizedDocument queried)
        {
            var hitCounts = new ConcurrentDictionary<string, uint>();

            foreach (var queryToken in queried.Tokens.Distinct())
            {
                HashSet<int> hashesForToken;
                if (_tokensToHashes.TryGetValue(queryToken, out hashesForToken))
                {
                    // Every document hash recorded for this token earns one hit.
                    foreach (var hash in hashesForToken)
                    {
                        hitCounts.AddOrUpdate(
                            _documentsByHash[hash],
                            key => 1,
                            (key, hits) => hits + 1);
                    }
                }
            }

            return hitCounts
                   .Select(entry => new DocumentSimilarityPair(entry.Key, entry.Value))
                   .OrderByDescending(ranked => ranked.Similarity)
                   .ToList();
        }
        /// <summary>
        /// Ranks stored documents by the Jaccard similarity between their token set and
        /// the query's token set, expressed as a whole-number percentage.
        /// </summary>
        /// <param name="queried">The tokenized query document.</param>
        /// <returns>
        /// One <see cref="DocumentSimilarityPair"/> per stored document whose truncated
        /// percentage is greater than zero, sorted by descending similarity.
        /// </returns>
        public List<DocumentSimilarityPair> SimilarityRanks(ITokenizedDocument queried)
        {
            var queriedSet = new HashSet<string>(queried.Tokens);
            var ranks      = new Dictionary<string, uint>();

            foreach (var pair in _documentsToTokensSet)
            {
                // Intersect on a copy so the stored token set is never mutated.
                var sharedTokens = new HashSet<string>(pair.Value);
                var storedCount  = sharedTokens.Count;
                sharedTokens.IntersectWith(queriedSet);

                // Size of the union is |stored| + |queried| - |shared|, so a second
                // temporary set is not needed.
                var unionCount = storedCount + queriedSet.Count - sharedTokens.Count;
                if (unionCount == 0)
                {
                    // Both sets are empty: 0 / 0 would be NaN, and converting NaN to uint
                    // is unspecified (and throws in a checked context). Nothing overlaps,
                    // so the document is simply not ranked.
                    continue;
                }

                // The cast truncates, so overlaps below 1 % become 0 and are dropped.
                // Not perfect precision, but good enough for now.
                var rank = (uint)(100.0 * sharedTokens.Count / unionCount);
                if (rank > 0)
                {
                    ranks[pair.Key] = rank;
                }
            }

            var result = ranks
                         .Select(pair => new DocumentSimilarityPair(pair.Key, pair.Value))
                         .OrderByDescending(pair => pair.Similarity)
                         .ToList();

            return result;
        }
 /// <summary>
 /// Stores the document's tokens as a set keyed by the document, replacing any
 /// set previously stored for the same document.
 /// </summary>
 /// <param name="tokenized">The document together with its tokens.</param>
 public void PushTokenizedDocument(ITokenizedDocument tokenized)
 {
     var document = tokenized.Document;

     // Building a set collapses repeated tokens into a single entry.
     var tokenSet = new HashSet<string>(tokenized.Tokens);

     _documentsToTokensSet[document] = tokenSet;
 }