/// <summary>
/// Builds a <see cref="MoreLikeThisQuery"/> from the query-string values of a request.
/// </summary>
/// <param name="path">Request path. This method does not read it.</param>
/// <param name="query">
/// Query-string values. The keys read are: index, fields, boost, boostFactor, maxNumTokens,
/// maxQueryTerms, maxWordLen, minDocFreq, maxDocFreq, maxDocFreqPct, minTermFreq, minWordLen,
/// stopWords, query and docid.
/// </param>
/// <returns>
/// The populated query. The "docid" value is split on ';': every segment containing '=' is added to
/// <c>MapGroupFields</c> as a key/value pair, any other segment is assigned to <c>DocumentId</c>
/// (the last such segment wins). When "docid" is absent neither is populated.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
public static MoreLikeThisQuery GetParametersFromPath(string path, NameValueCollection query)
{
    if (query == null)
    {
        throw new ArgumentNullException("query");
    }

    var results = new MoreLikeThisQuery
    {
        IndexName = query.Get("index"),
        Fields = query.GetValues("fields"),
        Boost = query.Get("boost").ToNullableBool(),
        BoostFactor = query.Get("boostFactor").ToNullableFloat(),
        MaximumNumberOfTokensParsed = query.Get("maxNumTokens").ToNullableInt(),
        MaximumQueryTerms = query.Get("maxQueryTerms").ToNullableInt(),
        MaximumWordLength = query.Get("maxWordLen").ToNullableInt(),
        MinimumDocumentFrequency = query.Get("minDocFreq").ToNullableInt(),
        MaximumDocumentFrequency = query.Get("maxDocFreq").ToNullableInt(),
        MaximumDocumentFrequencyPercentage = query.Get("maxDocFreqPct").ToNullableInt(),
        MinimumTermFrequency = query.Get("minTermFreq").ToNullableInt(),
        MinimumWordLength = query.Get("minWordLen").ToNullableInt(),
        StopWordsDocumentId = query.Get("stopWords"),
        AdditionalQuery = query.Get("query")
    };

    // NameValueCollection.Get returns null for a missing key. Treat that as "no segments"
    // instead of dereferencing null, so the caller gets an unpopulated query it can validate.
    var docId = query.Get("docid") ?? string.Empty;
    var keyValues = docId.Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);

    foreach (var keyValue in keyValues)
    {
        var split = keyValue.IndexOf('=');
        if (split >= 0)
        {
            // "name=value" segment: a map/group field constraint.
            results.MapGroupFields.Add(keyValue.Substring(0, split), keyValue.Substring(split + 1));
        }
        else
        {
            // Bare segment: the document id itself.
            results.DocumentId = keyValue;
        }
    }

    return results;
}
/// <summary>
/// Encodes a query that targets a single document id into a request URI, checks the exact URI
/// produced, then decodes it again and verifies the decoded query serializes identically.
/// </summary>
public void CanEncodeDecodeRequestByDocumentId()
{
    var original = new MoreLikeThisQuery
    {
        IndexName = "dataIndex",
        DocumentId = "foo/1",
        Fields = new[] { "Body" },
        MinimumWordLength = 3,
        MinimumDocumentFrequency = 1,
        Boost = true,
    };

    var requestUri = original.GetRequestUri();

    Assert.Equal("/morelikethis/?index=dataIndex&docid=foo%2F1&fields=Body&boost=true&minDocFreq=1&minWordLen=3&", requestUri);

    // Everything before '?' is the path; the remainder (including the '?') is the query string.
    var queryStart = requestUri.IndexOf('?');
    var path = requestUri.Substring(0, queryStart);
    var queryString = HttpUtility.ParseQueryString(requestUri.Substring(queryStart));

    var roundTripped = MoreLikeThisController.GetParametersFromPath(path, queryString);

    Assert.Equal("dataIndex", roundTripped.IndexName);
    Assert.Equal(JsonConvert.SerializeObject(original), JsonConvert.SerializeObject(roundTripped));
}
/// <summary>
/// Encodes a query that targets map/group fields (no document id) into a request URI, checks the
/// exact URI produced, then decodes it again and verifies the decoded query serializes identically.
/// </summary>
public void CanEncodeDecodeRequestOnIndexGrouping()
{
    var original = new MoreLikeThisQuery
    {
        IndexName = "dataIndex",
        Fields = new[] { "Body" },
        MinimumWordLength = 3,
        MinimumDocumentFrequency = 1,
        Boost = true,
    };
    original.MapGroupFields.Add("foo", "bar");
    original.MapGroupFields.Add("be", "bop");

    var requestUri = original.GetRequestUri();

    Assert.Equal("/morelikethis/?index=dataIndex&docid=foo%3Dbar%3Bbe%3Dbop&fields=Body&boost=true&minDocFreq=1&minWordLen=3&", requestUri);

    // Everything before '?' is the path; the remainder (including the '?') is the query string.
    var queryStart = requestUri.IndexOf('?');
    var path = requestUri.Substring(0, queryStart);
    var queryString = HttpUtility.ParseQueryString(requestUri.Substring(queryStart));

    var roundTripped = MoreLikeThisResponder.GetParametersFromPath(path, queryString);

    Assert.Equal("dataIndex", roundTripped.IndexName);
    Assert.Equal(JsonConvert.SerializeObject(original), JsonConvert.SerializeObject(roundTripped));
}
/// <summary>
/// Finds the indexed entry identified by <paramref name="query"/> (by document id and/or map group
/// fields), asks <c>RavenMoreLikeThis</c> for a similarity query based on it, runs that query and
/// returns the resulting documents together with any requested includes.
/// </summary>
/// <param name="query">Index name, lookup keys and tuning options for the search.</param>
/// <param name="transactionInformation">Passed through to the include-loading command.</param>
/// <param name="pageSize">Maximum number of hits collected for the similarity query.</param>
/// <param name="include">Include paths to resolve for each result; null is treated as none.</param>
/// <returns>The results, their includes, and an etag hashed from all contributing etags.</returns>
/// <exception cref="ArgumentNullException"><paramref name="query"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// The index does not exist; neither a document id nor map group fields were supplied; no indexed
/// entry matches the lookup; or the stop words document cannot be loaded.
/// </exception>
public MoreLikeThisQueryResult ExecuteMoreLikeThisQuery(MoreLikeThisQuery query, TransactionInformation transactionInformation, int pageSize = 25, string[] include = null)
{
    if (query == null)
    {
        throw new ArgumentNullException("query");
    }

    var index = database.IndexStorage.GetIndexInstance(query.IndexName);
    if (index == null)
    {
        throw new InvalidOperationException("The index " + query.IndexName + " cannot be found");
    }

    var hasDocumentId = string.IsNullOrEmpty(query.DocumentId) == false;
    if (hasDocumentId == false && query.MapGroupFields.Count == 0)
    {
        throw new InvalidOperationException("The document id or map group fields are mandatory");
    }

    IndexSearcher searcher;
    using (database.IndexStorage.GetCurrentIndexSearcher(index.indexId, out searcher))
    {
        // Build the lookup for the source entry: every supplied key is a MUST clause.
        var lookupQuery = new BooleanQuery();
        if (hasDocumentId)
        {
            // The document id is lower-cased before it is matched against the id field.
            var idTerm = new Term(Constants.DocumentIdFieldName, query.DocumentId.ToLowerInvariant());
            lookupQuery.Add(new TermQuery(idTerm), Occur.MUST);
        }

        foreach (string fieldName in query.MapGroupFields.Keys)
        {
            var fieldTerm = new Term(fieldName, query.MapGroupFields[fieldName]);
            lookupQuery.Add(new TermQuery(fieldTerm), Occur.MUST);
        }

        // Get the current Lucene docid for the given RavenDB doc ID.
        var sourceMatch = searcher.Search(lookupQuery, 1);
        if (sourceMatch.ScoreDocs.Length == 0)
        {
            throw new InvalidOperationException("Document " + query.DocumentId + " could not be found");
        }

        var reader = searcher.IndexReader;
        var moreLikeThis = new RavenMoreLikeThis(reader);
        AssignParameters(moreLikeThis, query);

        if (string.IsNullOrWhiteSpace(query.StopWordsDocumentId) == false)
        {
            var stopWordsDocument = database.Documents.Get(query.StopWordsDocumentId, null);
            if (stopWordsDocument == null)
            {
                throw new InvalidOperationException("Stop words document " + query.StopWordsDocumentId + " could not be found");
            }

            var setup = stopWordsDocument.DataAsJson.JsonDeserialization<StopWordsSetup>();
            if (setup.StopWords != null)
            {
                var stopWordSet = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
                foreach (var word in setup.StopWords)
                {
                    stopWordSet.Add(word);
                }

                moreLikeThis.SetStopWords(stopWordSet);
            }
        }

        // Fall back to the field names reported for the index reader when none were requested.
        var fieldNames = query.Fields ?? GetFieldNames(reader);
        moreLikeThis.SetFieldNames(fieldNames);

        var cleanupActions = new List<Action>();
        RavenPerFieldAnalyzerWrapper analyzer = null;
        try
        {
            analyzer = index.CreateAnalyzer(new LowerCaseKeywordAnalyzer(), cleanupActions, true);
            moreLikeThis.Analyzer = analyzer;

            var similarityQuery = moreLikeThis.Like(sourceMatch.ScoreDocs[0].Doc);
            var collector = TopScoreDocCollector.Create(pageSize, true);
            searcher.Search(similarityQuery, collector);
            var hits = collector.TopDocs().ScoreDocs;

            var jsonDocuments = GetJsonDocuments(query, searcher, index, query.IndexName, hits, sourceMatch.ScoreDocs[0].Doc);

            var result = new MultiLoadResult();

            // Etag bytes of every result, the index, and (below) every include feed the final hash.
            var etagBytes = new List<byte>(jsonDocuments.SelectMany(doc => doc.Etag.ToByteArray()));
            etagBytes.AddRange(database.Indexes.GetIndexEtag(query.IndexName, null).ToByteArray());

            var loadedIds = new HashSet<string>(jsonDocuments.Select(doc => doc.Key));
            var addIncludesCommand = new AddIncludesCommand(
                database,
                transactionInformation,
                (includedEtag, includedDocument) =>
                {
                    etagBytes.AddRange(includedEtag.ToByteArray());
                    result.Includes.Add(includedDocument);
                },
                include ?? new string[0],
                loadedIds);

            foreach (var document in jsonDocuments)
            {
                result.Results.Add(document.ToJson());
                addIncludesCommand.Execute(document.DataAsJson);
            }

            Etag computedEtag;
            using (var md5 = MD5.Create())
            {
                computedEtag = Etag.Parse(md5.ComputeHash(etagBytes.ToArray()));
            }

            return new MoreLikeThisQueryResult
            {
                Etag = computedEtag,
                Result = result,
            };
        }
        finally
        {
            if (analyzer != null)
            {
                analyzer.Close();
            }

            foreach (var cleanup in cleanupActions)
            {
                cleanup();
            }
        }
    }
}
/// <summary>
/// Copies every option that has a value on <paramref name="parameters"/> onto
/// <paramref name="mlt"/>; options left null are not touched, so <paramref name="mlt"/> keeps
/// whatever it already had for them.
/// </summary>
/// <param name="mlt">The Lucene more-like-this instance to configure.</param>
/// <param name="parameters">The query carrying the optional overrides.</param>
private static void AssignParameters(Lucene.Net.Search.Similar.MoreLikeThis mlt, MoreLikeThisQuery parameters)
{
    if (parameters.Boost.HasValue)
    {
        mlt.Boost = parameters.Boost.Value;
    }

    if (parameters.BoostFactor.HasValue)
    {
        mlt.BoostFactor = parameters.BoostFactor.Value;
    }

    if (parameters.MaximumNumberOfTokensParsed.HasValue)
    {
        mlt.MaxNumTokensParsed = parameters.MaximumNumberOfTokensParsed.Value;
    }

    if (parameters.MaximumQueryTerms.HasValue)
    {
        mlt.MaxQueryTerms = parameters.MaximumQueryTerms.Value;
    }

    if (parameters.MinimumWordLength.HasValue)
    {
        mlt.MinWordLen = parameters.MinimumWordLength.Value;
    }

    if (parameters.MaximumWordLength.HasValue)
    {
        mlt.MaxWordLen = parameters.MaximumWordLength.Value;
    }

    if (parameters.MinimumTermFrequency.HasValue)
    {
        mlt.MinTermFreq = parameters.MinimumTermFrequency.Value;
    }

    if (parameters.MinimumDocumentFrequency.HasValue)
    {
        mlt.MinDocFreq = parameters.MinimumDocumentFrequency.Value;
    }

    if (parameters.MaximumDocumentFrequency.HasValue)
    {
        mlt.MaxDocFreq = parameters.MaximumDocumentFrequency.Value;
    }

    // Kept last, after MaxDocFreq. NOTE(review): SetMaxDocFreqPct presumably derives a document
    // frequency cap of its own, so the relative order may matter - confirm before reordering.
    if (parameters.MaximumDocumentFrequencyPercentage.HasValue)
    {
        mlt.SetMaxDocFreqPct(parameters.MaximumDocumentFrequencyPercentage.Value);
    }
}
/// <summary>
/// Turns the similarity hits into documents, always skipping the hit that is the source entry.
/// When the query has a document id, each hit's stored document id is read, de-duplicated and the
/// document is loaded from the database. Otherwise each hit is projected straight from the index
/// entry, using the field names present on the source entry.
/// </summary>
/// <param name="parameters">The query; only <c>DocumentId</c> is read here.</param>
/// <param name="searcher">Searcher used to read the index entries behind the hits.</param>
/// <param name="index">The index being queried; decides which key field the projection uses.</param>
/// <param name="indexName">Name used to obtain the index etag for projected documents.</param>
/// <param name="hits">Hits returned by the similarity query.</param>
/// <param name="baseDocId">Lucene doc id of the source entry, excluded from the output.</param>
/// <returns>The loaded or projected documents.</returns>
private JsonDocument[] GetJsonDocuments(MoreLikeThisQuery parameters, IndexSearcher searcher, Index index, string indexName, IEnumerable<ScoreDoc> hits, int baseDocId)
{
    if (string.IsNullOrEmpty(parameters.DocumentId))
    {
        // No document id: build the results from the index entries themselves.
        var fields = searcher.Doc(baseDocId)
            .GetFields()
            .Cast<AbstractField>()
            .Select(field => field.Name)
            .Distinct()
            .ToArray();
        var etag = database.Indexes.GetIndexEtag(indexName, null);

        var projected = new List<JsonDocument>();
        foreach (var hit in hits.Where(candidate => candidate.Doc != baseDocId))
        {
            var keyField = index.IsMapReduce ? Constants.ReduceKeyFieldName : Constants.DocumentIdFieldName;
            projected.Add(new JsonDocument
            {
                DataAsJson = Index.CreateDocumentFromFields(searcher.Doc(hit.Doc), new FieldsToFetch(fields, false, keyField)),
                Etag = etag
            });
        }

        return projected.ToArray();
    }

    // Document id supplied: resolve each hit to its document id and load the real documents.
    var distinctIds = hits
        .Where(candidate => candidate.Doc != baseDocId)
        .Select(candidate => searcher.Doc(candidate.Doc).Get(Constants.DocumentIdFieldName))
        .Where(id => id != null)
        .Distinct();

    var loaded = new List<JsonDocument>();
    foreach (var id in distinctIds)
    {
        var document = database.Documents.Get(id, null);
        if (document != null)
        {
            loaded.Add(document);
        }
    }

    return loaded.ToArray();
}
/// <summary>
/// Converts <paramref name="documents"/> to JSON objects, running them through the results
/// transformer named on <paramref name="query"/> when one is specified.
/// </summary>
/// <param name="query">The query; only <c>ResultsTransformer</c> is read here.</param>
/// <param name="documents">Documents to convert.</param>
/// <param name="token">Cancellation token handed to the robust enumerator.</param>
/// <returns>A lazily evaluated sequence of (optionally transformed) JSON objects.</returns>
/// <exception cref="InvalidOperationException">
/// A transformer name was given but no transformer with a results definition was found for it.
/// </exception>
private IEnumerable<RavenJObject> ProcessResults(MoreLikeThisQuery query, IEnumerable<JsonDocument> documents, CancellationToken token)
{
    var transformerName = query.ResultsTransformer;
    if (string.IsNullOrEmpty(transformerName))
    {
        // No transformer requested: plain JSON projection.
        return documents.Select(document => document.ToJson());
    }

    var transformer = database.IndexDefinitionStorage.GetTransformer(transformerName);
    if (transformer == null || transformer.TransformResultsDefinition == null)
    {
        throw new InvalidOperationException("The transformer " + transformerName + " was not found");
    }

    IndexingFunc transformFunc = transformer.TransformResultsDefinition;

    // Per-document transformer failures are recorded here; nothing in this method reads the list back.
    var transformerErrors = new List<string>();
    var robustEnumerator = new RobustEnumerator(token, 100)
    {
        OnError = (exception, o) => transformerErrors.Add(string.Format("Doc '{0}', Error: {1}", Index.TryGetDocKey(o), exception.Message))
    };

    var dynamicDocuments = documents.Select(document => new DynamicJsonObject(document.ToJson()));
    return robustEnumerator
        .RobustEnumeration(dynamicDocuments.GetEnumerator(), transformFunc)
        .Select(JsonExtensions.ToJObject);
}
/// <summary>
/// Extension entry point: creates a <c>MoreLikeThisQueryRunner</c> for <paramref name="self"/> and
/// forwards all arguments to its <c>ExecuteMoreLikeThisQuery</c> method.
/// </summary>
/// <param name="self">The database to run the query against.</param>
/// <param name="query">The more-like-this query.</param>
/// <param name="transactionInformation">Forwarded unchanged to the runner.</param>
/// <param name="pageSize">Forwarded unchanged to the runner.</param>
/// <param name="include">Forwarded unchanged to the runner.</param>
/// <returns>Whatever the runner returns.</returns>
public static MoreLikeThisQueryResult ExecuteMoreLikeThisQuery(this DocumentDatabase self, MoreLikeThisQuery query, TransactionInformation transactionInformation, int pageSize = 25, string[] include = null)
{
    var runner = new MoreLikeThisQueryRunner(self);
    var queryResult = runner.ExecuteMoreLikeThisQuery(query, transactionInformation, pageSize, include);
    return queryResult;
}
/// <summary>
/// Copies every option that has a value on <paramref name="parameters"/> onto
/// <paramref name="mlt"/>; options left null are not touched. Only the options listed in the body
/// are forwarded.
/// </summary>
/// <param name="mlt">The more-like-this instance to configure.</param>
/// <param name="parameters">The query carrying the optional overrides.</param>
private static void AssignParameters(Similarity.Net.MoreLikeThis mlt, MoreLikeThisQuery parameters)
{
    if (parameters.Boost != null)
    {
        mlt.SetBoost(parameters.Boost.Value);
    }

    // This check used to appear twice in a row with the same value; once is sufficient.
    if (parameters.MaximumNumberOfTokensParsed != null)
    {
        mlt.SetMaxNumTokensParsed(parameters.MaximumNumberOfTokensParsed.Value);
    }

    if (parameters.MaximumQueryTerms != null)
    {
        mlt.SetMaxQueryTerms(parameters.MaximumQueryTerms.Value);
    }

    if (parameters.MaximumWordLength != null)
    {
        mlt.SetMaxWordLen(parameters.MaximumWordLength.Value);
    }

    if (parameters.MinimumDocumentFrequency != null)
    {
        mlt.SetMinDocFreq(parameters.MinimumDocumentFrequency.Value);
    }

    if (parameters.MinimumTermFrequency != null)
    {
        mlt.SetMinTermFreq(parameters.MinimumTermFrequency.Value);
    }

    if (parameters.MinimumWordLength != null)
    {
        mlt.SetMinWordLen(parameters.MinimumWordLength.Value);
    }
}