Пример #1
0
        /// <summary>
        /// Ranks the documents that contain at least one query token by the product of their
        /// TF-IDF vector with the query's TF-IDF vector, optionally narrowing the candidates
        /// with <c>ProximityFilter</c> and/or keeping only documents classified as a given class.
        /// </summary>
        /// <param name="windowSize">
        /// When set, the candidate document ids are passed through <c>ProximityFilter</c> with this value
        /// before scoring; when <c>null</c> no proximity filtering is applied.
        /// </param>
        /// <param name="filterByClass">
        /// When set, candidates are classified with <c>RandomForestClassifierClient</c> and only those whose
        /// <c>ClassifiedTag</c> equals this value are returned; when <c>null</c> classification is skipped.
        /// </param>
        /// <returns>Search results ordered by descending score.</returns>
        public List<SearchResult> ExecuteTfIdfSearch(int? windowSize = null, byte? filterByClass = null)
        {
            var idfs = Query.Select(x => (token: x, idf: Index.Idf(x))).ToDictionary(x => x.token, x => x.idf);
            var queryVector = new TfIdfVector(idfs, QueryTokenCounts);

            // Candidate set: every document returned by the index for any query token.
            var docIds = new HashSet<int>();

            foreach (var token in Query)
            {
                foreach (var docId in Index.SearchByToken(token))
                {
                    docIds.Add(docId);
                }
            }

            if (windowSize.HasValue)
            {
                docIds = ProximityFilter(docIds, windowSize.Value);
            }

            // Score each candidate against the query vector.
            var scoresByDocId = new Dictionary<int, double>();

            foreach (var docId in docIds)
            {
                var tfByToken = new Dictionary<string, int>();

                foreach (var token in Query)
                {
                    tfByToken.Add(token, Index.GetOccurrence(token, docId).Positions.Count);
                }

                var docVector = new TfIdfVector(idfs, tfByToken);
                scoresByDocId.Add(docId, docVector.Multiply(queryVector));
            }

            if (filterByClass == null)
            {
                return ToRankedResults(scoresByDocId);
            }

            var documents = docIds.Select(x => new DocumentWrapper(Index.PureDocumentsById[x])).ToDictionary(x => x.Document.Id);
            var vectorGenerator = new VectorGenerator(documents, true);

            vectorGenerator.Process();

            // Materialise the wrappers once: the same list feeds the classifier and receives its
            // output, so result[i] is paired with the i-th input without re-walking the dictionary
            // on every iteration.
            var wrappers = documents.Values.ToList();
            var tokenMapper = new TokenMapper();
            var classifier = new RandomForestClassifierClient();
            var result = classifier.Classify(wrappers.Select(x => x.CreateClassificationVector(tokenMapper)).ToList());

            for (int i = 0; i < result.Count; i++)
            {
                wrappers[i].ClassifiedTag = result[i];
            }

            var filteredResult = scoresByDocId.Where(x => documents[x.Key].ClassifiedTag == filterByClass).ToList();

            // Report how many candidates were dropped, not how many were kept.
            Console.WriteLine($"Removed {scoresByDocId.Count - filteredResult.Count} items with other classes");

            return ToRankedResults(filteredResult);

            // Orders the scored documents by descending score and builds the result objects.
            List<SearchResult> ToRankedResults(IEnumerable<KeyValuePair<int, double>> scores)
            {
                return scores
                    .OrderByDescending(x => x.Value)
                    .Select(x => new SearchResult(x.Key, Index.GetHighlight(x.Key), x.Value))
                    .ToList();
            }
        }
Пример #2
0
        /// <summary>
        /// Computes the dot product of this vector with <paramref name="queryVector"/>:
        /// the sum, over every token in the query vector, of the product of both weights.
        /// </summary>
        /// <param name="queryVector">The vector whose tokens drive the summation.</param>
        /// <returns>
        /// The accumulated product. Tokens of the query vector that are absent from this vector
        /// contribute nothing (a missing component is treated as zero).
        /// </returns>
        internal double Multiply(TfIdfVector queryVector)
        {
            double sum = 0;

            foreach (var queryEntry in queryVector.FinalVector)
            {
                // TryGetValue avoids both a double lookup and a KeyNotFoundException when this
                // vector has no component for the token.
                if (this.FinalVector.TryGetValue(queryEntry.Key, out double ownWeight))
                {
                    sum += queryEntry.Value * ownWeight;
                }
            }

            return sum;
        }
Пример #3
0
        /// <summary>
        /// Builds a new vector over the union of both vectors' tokens. For each token, this vector's
        /// weight (if present) is multiplied by <paramref name="sourceWeight"/> and the other vector's
        /// weight (if present) is added unscaled.
        /// </summary>
        /// <param name="other">The vector whose weights are added as-is.</param>
        /// <param name="sourceWeight">Multiplier applied to this vector's weights only.</param>
        /// <returns>A new <c>TfIdfVector</c> constructed from the combined weights.</returns>
        internal TfIdfVector Add(TfIdfVector other, int sourceWeight)
        {
            var ownWeights = this.FinalVector;
            var otherWeights = other.FinalVector;
            var combined = new Dictionary<string, double>();

            foreach (var token in ownWeights.Keys.Union(otherWeights.Keys))
            {
                // Accumulate in the same order as before (own contribution first, then the other's)
                // so the floating-point result is unchanged.
                double total = 0.0;
                total += ownWeights.TryGetValue(token, out double ownValue) ? sourceWeight * ownValue : 0.0;
                total += otherWeights.TryGetValue(token, out double addedValue) ? addedValue : 0.0;

                combined.Add(token, total);
            }

            return new TfIdfVector(combined);
        }