/// <summary>
/// Indexes a tokenized document: records it under its content hash and maps each
/// distinct token to the set of document hashes that contain it.
/// </summary>
/// <param name="tokenized">The document together with its token stream.</param>
public void PushTokenizedDocument(ITokenizedDocument tokenized) {
    var documentHash = _hasher.GetHash(tokenized.Document);
    // Store the document first so readers that see the hash can always resolve it.
    _documentsByHash[documentHash] = tokenized.Document;

    foreach (var token in tokenized.Tokens.Distinct()) {
        // BUGFIX: the previous AddOrUpdate mutated the shared HashSet inside
        // updateValueFactory. That factory can run concurrently (and more than
        // once) for the same key, and HashSet<int> is not thread-safe, so
        // concurrent pushes could corrupt the set. GetOrAdd + lock on the set
        // instance makes the mutation safe; readers must lock the same instance.
        var hashes = _tokensToHashes.GetOrAdd(token, t => new HashSet <int>());
        lock (hashes) {
            hashes.Add(documentHash);
        }
    }
}
/// <summary>
/// Ranks indexed documents by the number of distinct query tokens they share
/// with <paramref name="queried"/> (a simple hit count, not a normalized score).
/// </summary>
/// <param name="queried">The query document with its token stream.</param>
/// <returns>Pairs of (document, hit count), ordered by descending similarity.</returns>
public List <DocumentSimilarityPair> SimilarityRanks(ITokenizedDocument queried) {
    var documentsAndHits = new ConcurrentDictionary <string, uint>();
    foreach (var token in queried.Tokens.Distinct()) {
        HashSet <int> matchingHashes;
        if (!_tokensToHashes.TryGetValue(token, out matchingHashes)) {
            continue;
        }

        // BUGFIX: a concurrent PushTokenizedDocument may mutate this set while
        // we enumerate it ("collection modified" / corrupted reads). Snapshot
        // it under a lock on the set instance before iterating.
        int[] snapshot;
        lock (matchingHashes) {
            snapshot = matchingHashes.ToArray();
        }

        foreach (var matchingHash in snapshot) {
            // Safe: the writer stores the document before publishing its hash.
            var document = _documentsByHash[matchingHash];
            documentsAndHits.AddOrUpdate(document, doc => 1, (doc, count) => count + 1);
        }
    }
    var result = documentsAndHits
        .Select(pair => new DocumentSimilarityPair(pair.Key, pair.Value))
        .OrderByDescending(tuple => tuple.Similarity).ToList();
    return(result);
}
/// <summary>
/// Ranks stored documents against <paramref name="queried"/> using the Jaccard
/// index of their token sets, expressed as an integer percentage (0–100).
/// Documents with no overlap are omitted from the result.
/// </summary>
/// <param name="queried">The query document with its token stream.</param>
/// <returns>Pairs of (document, percentage rank), ordered by descending similarity.</returns>
public List <DocumentSimilarityPair> SimilarityRanks(ITokenizedDocument queried) {
    var queriedSet = new HashSet <string>(queried.Tokens);
    var ranks = new Dictionary <string, uint>();
    foreach (var pair in _documentsToTokensSet) {
        var currentSet = pair.Value;

        // PERF: count |A ∩ B| directly instead of allocating two HashSet copies
        // per stored document per query (the old intersection + union sets).
        var intersectionCount = 0;
        foreach (var token in currentSet) {
            if (queriedSet.Contains(token)) {
                intersectionCount++;
            }
        }

        // |A ∪ B| = |A| + |B| - |A ∩ B|
        var unionCount = currentSet.Count + queriedSet.Count - intersectionCount;
        if (unionCount == 0) {
            // Both token sets empty: avoid 0/0 -> NaN and its undefined uint cast.
            continue;
        }

        var rank = (uint)(100.0 * intersectionCount / unionCount); // not perfect precision but good enough for now
        if (rank > 0) {
            ranks[pair.Key] = rank;
        }
    }
    var result = ranks
        .Select(pair => new DocumentSimilarityPair(pair.Key, pair.Value))
        .OrderByDescending(pair => pair.Similarity).ToList();
    return(result);
}
/// <summary>
/// Registers a document for later similarity queries by snapshotting its tokens
/// into a set, keyed by the document itself. Re-pushing a document replaces its
/// previous token set.
/// </summary>
/// <param name="tokenized">The document together with its token stream.</param>
public void PushTokenizedDocument(ITokenizedDocument tokenized) {
    var tokenSet = new HashSet <string>(tokenized.Tokens);
    _documentsToTokensSet[tokenized.Document] = tokenSet;
}