/// <summary>
/// Runs a TF-IDF ranked search for the current <c>Query</c> against <c>Index</c>.
/// </summary>
/// <param name="windowSize">Optional proximity window; when set, candidate documents
/// are narrowed via <c>ProximityFilter</c>.</param>
/// <param name="filterByClass">Optional class tag; when set, results are restricted to
/// documents the classifier assigns that tag.</param>
/// <returns>Search results ordered by descending TF-IDF score.</returns>
public List<SearchResult> ExecuteTfIdfSearch(int? windowSize = null, byte? filterByClass = null)
{
    // Distinct() guards against a repeated query token, which would otherwise make
    // ToDictionary throw ArgumentException on the duplicate key.
    var idfs = Query.Distinct().ToDictionary(token => token, token => Index.Idf(token));
    var queryVector = new TfIdfVector(idfs, QueryTokenCounts);

    // Collect every document id that contains at least one query token.
    var docIds = new HashSet<int>();
    foreach (var token in Query)
    {
        foreach (var docId in Index.SearchByToken(token))
        {
            docIds.Add(docId);
        }
    }

    if (windowSize.HasValue)
    {
        docIds = ProximityFilter(docIds, windowSize.Value);
    }

    // Score each candidate document: dot product of its TF-IDF vector with the query vector.
    var scoresByDocId = new Dictionary<int, double>();
    foreach (var docId in docIds)
    {
        var tfByToken = new Dictionary<string, int>();
        foreach (var token in Query)
        {
            // Indexer (not Add) so a repeated query token overwrites instead of throwing.
            tfByToken[token] = Index.GetOccurrence(token, docId).Positions.Count;
        }

        var docVector = new TfIdfVector(idfs, tfByToken);
        scoresByDocId[docId] = docVector.Multiply(queryVector);
    }

    if (filterByClass == null)
    {
        return scoresByDocId
            .OrderByDescending(x => x.Value)
            .Select(x => new SearchResult(x.Key, Index.GetHighlight(x.Key), x.Value))
            .ToList();
    }

    // Classify every candidate document so we can keep only the requested class.
    var documents = docIds
        .Select(x => new DocumentWrapper(Index.PureDocumentsById[x]))
        .ToDictionary(x => x.Document.Id);
    var vectorGenerator = new VectorGenerator(documents, true);
    vectorGenerator.Process();

    var tokenMapper = new TokenMapper();
    var classifier = new RandomForestClassifierClient();
    var result = classifier.Classify(
        documents.Values.Select(x => x.CreateClassificationVector(tokenMapper)).ToList());

    // Single pass instead of ElementAt(i), which re-walks the values view each
    // iteration (accidental O(n^2)). Order of Values matches the order used above
    // when building the classification input, so result[i] lines up with document i.
    var index = 0;
    foreach (var document in documents.Values)
    {
        document.ClassifiedTag = result[index++];
    }

    var filteredResult = scoresByDocId
        .Where(x => documents[x.Key].ClassifiedTag == filterByClass)
        .ToList();

    // BUG FIX: the old message reported the number of KEPT items as "Removed";
    // the removed count is the difference between scored and kept.
    Console.WriteLine($"Removed {scoresByDocId.Count - filteredResult.Count} items with other classes");

    return filteredResult
        .OrderByDescending(x => x.Value)
        .Select(x => new SearchResult(x.Key, Index.GetHighlight(x.Key), x.Value))
        .ToList();
}
/// <summary>
/// Rebuilds <c>IntVector</c> from <c>FinalVector</c>, replacing each token key with
/// the integer id the mapper assigns (creating a new id where none exists).
/// </summary>
/// <param name="mapper">Mapper used to translate token keys into integer ids.</param>
internal void ConvertTokenToId(TokenMapper mapper)
{
    // Same contract as a foreach + Add: throws ArgumentException if two
    // tokens map to the same id.
    IntVector = FinalVector.ToDictionary(
        entry => mapper.GetOrCreateId(entry.Key),
        entry => entry.Value);
}