private static JArray GetTerms(IndexSearcher searcher, int doc, string field)
{
    TermPositionVector termPositionVector = (TermPositionVector)searcher.IndexReader.GetTermFreqVector(doc, field);
    if (termPositionVector == null)
    {
        return null;
    }

    JArray array = new JArray();
    string[] terms = termPositionVector.GetTerms();
    for (int i = 0; i < terms.Length; i++)
    {
        // Collect every (start,end) character offset recorded for this term.
        string offset = "";
        foreach (TermVectorOffsetInfo offsetInfo in termPositionVector.GetOffsets(i))
        {
            offset += string.Format("({0},{1})", offsetInfo.StartOffset, offsetInfo.EndOffset);
        }
        array.Add(terms[i] + " " + offset);
    }
    return array;
}
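A minimal driver for a helper like GetTerms above; a sketch assuming Lucene.Net 2.9-era APIs and Json.NET's JArray, with the directory, analyzer, and "body" field name chosen for illustration rather than taken from the snippet's project:

using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Newtonsoft.Json.Linq;

RAMDirectory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
Document doc = new Document();
// Offsets are only stored when the field's term vector setting includes them.
doc.Add(new Field("body", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.AddDocument(doc);
writer.Close();

IndexSearcher searcher = new IndexSearcher(dir);
// Terms come back in sorted order: ["content (5,12)", "here (13,17)", "some (0,4)"]
JArray terms = GetTerms(searcher, 0, "body");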
public virtual void TestOffsetReader()
{
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    TermPositionVector vector = (TermPositionVector)reader.Get(0, testFields[0]);
    Assert.IsTrue(vector != null);
    System.String[] terms = vector.GetTerms();
    Assert.IsTrue(terms != null);
    Assert.IsTrue(terms.Length == testTerms.Length);
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        Assert.IsTrue(term.Equals(testTerms[i]));

        int[] positions = vector.GetTermPositions(i);
        Assert.IsTrue(positions != null);
        Assert.IsTrue(positions.Length == this.positions[i].Length);
        for (int j = 0; j < positions.Length; j++)
        {
            Assert.IsTrue(positions[j] == this.positions[i][j]);
        }

        TermVectorOffsetInfo[] offset = vector.GetOffsets(i);
        Assert.IsTrue(offset != null);
        Assert.IsTrue(offset.Length == this.offsets[i].Length);
        for (int j = 0; j < offset.Length; j++)
        {
            Assert.IsTrue(offset[j].Equals(offsets[i][j]));
        }
    }
}
public virtual void TestTermVectorsFieldOrder()
{
    Directory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    doc.Add(new Field("c", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("a", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("b", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("x", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    IndexReader reader = IndexReader.Open(dir);
    TermFreqVector[] v = reader.GetTermFreqVectors(0);
    Assert.AreEqual(4, v.Length);
    // Fields come back in lexicographic order, regardless of the order they were added in.
    System.String[] expectedFields = new System.String[] { "a", "b", "c", "x" };
    // Terms within each vector are sorted too: "content" (pos 1), "here" (pos 2), "some" (pos 0).
    int[] expectedPositions = new int[] { 1, 2, 0 };
    for (int i = 0; i < v.Length; i++)
    {
        TermPositionVector posVec = (TermPositionVector)v[i];
        Assert.AreEqual(expectedFields[i], posVec.GetField());
        System.String[] terms = posVec.GetTerms();
        Assert.AreEqual(3, terms.Length);
        Assert.AreEqual("content", terms[0]);
        Assert.AreEqual("here", terms[1]);
        Assert.AreEqual("some", terms[2]);
        for (int j = 0; j < 3; j++)
        {
            int[] positions = posVec.GetTermPositions(j);
            Assert.AreEqual(1, positions.Length);
            Assert.AreEqual(expectedPositions[j], positions[0]);
        }
    }
}
/// <summary>
/// Gets the document at the specified position in the current search result set.
/// </summary>
/// <param name="index">Zero-based index of the document within the search results.</param>
/// <returns>The matched document together with the character offset of the first hit keyword.</returns>
public Document this[int index]
{
    get
    {
        int offset = 0;
        // Term position vector for the "body" field of this hit.
        TermPositionVector termPositionVector = (TermPositionVector)this.reader.GetTermFreqVector(this._hits.Id(index), "body");
        // Only proceed if a position vector was stored for the field.
        if (termPositionVector != null)
        {
            int pos = -1;
            for (int i = 0; i < terms.Length; i++)
            {
                // Position of the first matching keyword within the vector's term list.
                pos = System.Array.IndexOf<string>(termPositionVector.GetTerms(), terms[i]);
                if (pos > -1)
                {
                    break;
                }
            }
            // If a keyword was found, take its first character offset within the body text.
            if (pos > -1)
            {
                TermVectorOffsetInfo[] tvois = termPositionVector.GetOffsets(pos);
                offset = tvois[0].GetStartOffset();
            }
        }
        return new Hit(this._hits.Doc(index), offset);
    }
}
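Because GetTerms() returns the vector's terms in sorted order, the linear Array.IndexOf scan above can be replaced by a binary search. A sketch under the assumption that ordinal comparison matches Lucene's term ordering; FindFirstKeywordOffset is a hypothetical helper name, not part of the original class:

private static int FindFirstKeywordOffset(TermPositionVector vector, string[] keywords)
{
    string[] sortedTerms = vector.GetTerms(); // already sorted by Lucene
    foreach (string keyword in keywords)
    {
        int pos = System.Array.BinarySearch(sortedTerms, keyword, System.StringComparer.Ordinal);
        if (pos >= 0)
        {
            // Start offset of the keyword's first occurrence in the original text.
            return vector.GetOffsets(pos)[0].GetStartOffset();
        }
    }
    return 0; // no keyword found; fall back to the start of the document
}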
public FieldTermStack(IndexReader reader, int docId, String fieldName, FieldQuery fieldQuery)
{
    this.fieldName = fieldName;

    TermFreqVector tfv = reader.GetTermFreqVector(docId, fieldName);
    if (tfv == null)
    {
        return; // just return to make null snippets
    }

    TermPositionVector tpv = null;
    try
    {
        tpv = (TermPositionVector)tfv;
    }
    catch (InvalidCastException)
    {
        return; // just return to make null snippets
    }

    List<String> termSet = fieldQuery.getTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
    {
        return;
    }

    foreach (String term in tpv.GetTerms())
    {
        if (!termSet.Contains(term))
        {
            continue;
        }
        int index = tpv.IndexOf(term);
        TermVectorOffsetInfo[] tvois = tpv.GetOffsets(index);
        if (tvois == null)
        {
            return; // just return to make null snippets
        }
        int[] poss = tpv.GetTermPositions(index);
        if (poss == null)
        {
            return; // just return to make null snippets
        }
        for (int i = 0; i < tvois.Length; i++)
        {
            termList.AddLast(new TermInfo(term, tvois[i].GetStartOffset(), tvois[i].GetEndOffset(), poss[i]));
        }
    }

    // sort by position
    Sort(termList);
}
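Stripped of the FastVectorHighlighter types, the constructor's core pattern is: for each term kept by the query, pair occurrence i's character offsets with token position i, then sort by position. A standalone sketch of that pattern under Lucene.Net 2.9-era APIs; VectorFlattener and TermOccurrence are hypothetical names, not part of the highlighter:

using System.Collections.Generic;
using Lucene.Net.Index;

public static class VectorFlattener
{
    public struct TermOccurrence
    {
        public string Term;
        public int Start, End, Position;
    }

    public static List<TermOccurrence> Flatten(TermPositionVector tpv)
    {
        var occurrences = new List<TermOccurrence>();
        string[] terms = tpv.GetTerms();
        for (int i = 0; i < terms.Length; i++)
        {
            TermVectorOffsetInfo[] offsets = tpv.GetOffsets(i);
            int[] positions = tpv.GetTermPositions(i);
            if (offsets == null || positions == null)
            {
                continue; // field indexed without offsets or positions
            }
            for (int j = 0; j < offsets.Length; j++)
            {
                occurrences.Add(new TermOccurrence
                {
                    Term = terms[i],
                    Start = offsets[j].GetStartOffset(),
                    End = offsets[j].GetEndOffset(),
                    Position = positions[j],
                });
            }
        }
        // Restore original token order: occurrences arrive grouped by term, not by position.
        occurrences.Sort((a, b) => a.Position.CompareTo(b.Position));
        return occurrences;
    }
}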
/// <summary> Add a complete document specified by all its term vectors. If the document has no
/// term vectors, add a value for tvx.
/// </summary>
/// <param name="vectors"></param>
/// <throws> IOException </throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    OpenDocument();
    if (vectors != null)
    {
        for (int i = 0; i < vectors.Length; i++)
        {
            bool storePositionWithTermVector = false;
            bool storeOffsetWithTermVector = false;
            try
            {
                // If the vector also carries positions/offsets, preserve them on write.
                TermPositionVector tpVector = (TermPositionVector)vectors[i];
                if (tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null)
                {
                    storePositionWithTermVector = true;
                }
                if (tpVector.Size() > 0 && tpVector.GetOffsets(0) != null)
                {
                    storeOffsetWithTermVector = true;
                }

                FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
                for (int j = 0; j < tpVector.Size(); j++)
                {
                    AddTermInternal(tpVector.GetTerms()[j], tpVector.GetTermFrequencies()[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                }
                CloseField();
            }
            catch (System.InvalidCastException)
            {
                // Frequency-only vector: write terms and frequencies without positions/offsets.
                TermFreqVector tfVector = vectors[i];
                FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
                for (int j = 0; j < tfVector.Size(); j++)
                {
                    AddTermInternal(tfVector.GetTerms()[j], tfVector.GetTermFrequencies()[j], null, null);
                }
                CloseField();
            }
        }
    }
    CloseDocument();
}
public virtual void TestMixedVectrosVectors()
{
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // The same field is added five times with different term-vector settings;
    // the index keeps the most expressive setting (positions and offsets).
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    searcher = new IndexSearcher(directory);
    Query query = new TermQuery(new Term("field", "one"));
    ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
    Assert.AreEqual(1, hits.Length);

    TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[0].doc);
    Assert.IsTrue(vector != null);
    Assert.IsTrue(vector.Length == 1);
    TermPositionVector tfv = (TermPositionVector)vector[0];
    Assert.IsTrue(tfv.GetField().Equals("field"));
    System.String[] terms = tfv.GetTerms();
    Assert.AreEqual(1, terms.Length);
    Assert.AreEqual("one", terms[0]);
    Assert.AreEqual(5, tfv.GetTermFrequencies()[0]);

    int[] positions = tfv.GetTermPositions(0);
    Assert.AreEqual(5, positions.Length);
    for (int i = 0; i < 5; i++)
    {
        Assert.AreEqual(i, positions[i]);
    }
    TermVectorOffsetInfo[] offsets = tfv.GetOffsets(0);
    Assert.AreEqual(5, offsets.Length);
    for (int i = 0; i < 5; i++)
    {
        // Each "one " occupies 4 characters, so occurrence i spans [4*i, 4*i+3).
        Assert.AreEqual(4 * i, offsets[i].GetStartOffset());
        Assert.AreEqual(4 * i + 3, offsets[i].GetEndOffset());
    }
}
public virtual void TestTermPositionVectors()
{
    Query query = new TermQuery(new Term("field", "zero"));
    try
    {
        ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
        Assert.AreEqual(1, hits.Length);
        for (int i = 0; i < hits.Length; i++)
        {
            TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[i].doc);
            Assert.IsTrue(vector != null);
            Assert.IsTrue(vector.Length == 1);

            // The test data stores positions for even-numbered doc ids and offsets for
            // ids divisible by three; either setting yields a TermPositionVector.
            bool shouldBePosVector = hits[i].doc % 2 == 0;
            Assert.IsTrue(!shouldBePosVector || vector[0] is TermPositionVector);
            bool shouldBeOffVector = hits[i].doc % 3 == 0;
            Assert.IsTrue(!shouldBeOffVector || vector[0] is TermPositionVector);

            if (shouldBePosVector || shouldBeOffVector)
            {
                TermPositionVector posVec = (TermPositionVector)vector[0];
                System.String[] terms = posVec.GetTerms();
                Assert.IsTrue(terms != null && terms.Length > 0);
                for (int j = 0; j < terms.Length; j++)
                {
                    int[] positions = posVec.GetTermPositions(j);
                    TermVectorOffsetInfo[] offsets = posVec.GetOffsets(j);
                    if (shouldBePosVector)
                    {
                        Assert.IsTrue(positions != null);
                        Assert.IsTrue(positions.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(positions == null);
                    }
                    if (shouldBeOffVector)
                    {
                        Assert.IsTrue(offsets != null);
                        Assert.IsTrue(offsets.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(offsets == null);
                    }
                }
            }
            else
            {
                // Frequency-only vectors must not be castable to TermPositionVector.
                try
                {
                    TermPositionVector posVec = (TermPositionVector)vector[0];
                    Assert.Fail("cast to TermPositionVector should have thrown");
                }
                catch (System.InvalidCastException)
                {
                    TermFreqVector freqVec = vector[0];
                    System.String[] terms = freqVec.GetTerms();
                    Assert.IsTrue(terms != null && terms.Length > 0);
                }
            }
        }
    }
    catch (System.IO.IOException)
    {
        Assert.Fail("unexpected IOException");
    }
}
/// <summary>
/// Low level api.
/// Returns a token stream or null if no offset info available in index.
/// This can be used to feed the highlighter with a pre-parsed token stream.
///
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// - with TermVector offset only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
/// (NB: timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store
/// pre-parsed content and using an analyzer to re-parse the original content:
/// - reanalyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on -
/// 1) The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - can cost dear!)
/// 3) Use of compression on field storage - could be faster due to compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv">the term position vector to rebuild the token stream from</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
    // Reconstruct the original sequence of Tokens from the stored vector.
    String[] terms = tpv.GetTerms();
    int[] freq = tpv.GetTermFrequencies();
    int totalTokens = freq.Sum();

    var tokensInOriginalOrder = new Token[totalTokens];
    List<Token> unsortedTokens = null;
    for (int t = 0; t < freq.Length; t++)
    {
        TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
        if (offsets == null)
        {
            return null; // no offsets stored - cannot rebuild a token stream
        }

        int[] pos = null;
        if (tokenPositionsGuaranteedContiguous)
        {
            // Try to get the token position info to speed up assembly of tokens into sorted sequence.
            pos = tpv.GetTermPositions(t);
        }

        if (pos == null)
        {
            // Tokens NOT stored with positions, or not guaranteed contiguous - add to a list and sort later.
            if (unsortedTokens == null)
            {
                unsortedTokens = new List<Token>();
            }
            foreach (TermVectorOffsetInfo t1 in offsets)
            {
                var token = new Token(t1.StartOffset, t1.EndOffset);
                token.SetTermBuffer(terms[t]);
                unsortedTokens.Add(token);
            }
        }
        else
        {
            // We have positions stored and a guarantee that the token position information is contiguous.
            // This may be fast BUT won't work if tokenizers create more than one token in the same
            // position, or create jumps in position numbers - this code would fail in those cases.
            // Tokens stored with positions - use the position to index straight into the sorted array.
            for (int tp = 0; tp < pos.Length; tp++)
            {
                var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                tokensInOriginalOrder[pos[tp]] = token;
            }
        }
    }

    // If the field has been stored without position data we must sort by start offset.
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        Array.Sort(tokensInOriginalOrder, (t1, t2) =>
        {
            if (t1.StartOffset > t2.StartOffset)
                return 1;
            if (t1.StartOffset < t2.StartOffset)
                return -1;
            return 0;
        });
    }
    return new StoredTokenStream(tokensInOriginalOrder);
}
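A usage sketch for the method above, assuming it lives on the contrib highlighter's TokenSources class (as in Lucene.Net) and that reader, docId, and query are already in scope; the "body" field must have been indexed with Field.TermVector.WITH_POSITIONS_OFFSETS:

// Rebuild the token stream from the stored vector and hand it to the
// contrib Highlighter, avoiding re-analysis of the stored text.
var tpv = (TermPositionVector)reader.GetTermFreqVector(docId, "body");
TokenStream stream = TokenSources.GetTokenStream(tpv, false); // false: don't rely on contiguous positions
if (stream != null)
{
    var highlighter = new Highlighter(new QueryScorer(query));
    string fragment = highlighter.GetBestFragment(stream, reader.Document(docId).Get("body"));
}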
public FieldTermStack(IndexReader reader, int docId, String fieldName, FieldQuery fieldQuery, IState state)
{
    this.fieldName = fieldName;

    var tfv = reader.GetTermFreqVector(docId, fieldName, state);
    if (tfv == null)
    {
        return; // just return to make null snippets
    }

    TermPositionVector tpv = null;
    try
    {
        tpv = (TermPositionVector)tfv;
    }
    catch (InvalidCastException)
    {
        return; // just return to make null snippets
    }

    List<String> termSet = fieldQuery.getTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
    {
        return;
    }

    // Fall back to wildcard matching only when the query actually contains wildcard terms.
    var needWildcard = termSet.Any(x => x.IndexOfAny(new char[] { '*', '?' }) != -1);

    foreach (String term in tpv.GetTerms())
    {
        if (needWildcard)
        {
            if (!termSet.Any(ts => WildcardMatcher.Matches(ts, term)))
            {
                continue;
            }
        }
        else if (!termSet.Contains(term))
        {
            continue;
        }

        int index = tpv.IndexOf(term);
        TermVectorOffsetInfo[] tvois = tpv.GetOffsets(index);
        if (tvois == null)
        {
            return; // just return to make null snippets
        }
        int[] poss = tpv.GetTermPositions(index);
        if (poss == null)
        {
            return; // just return to make null snippets
        }
        for (int i = 0; i < tvois.Length; i++)
        {
            termList.AddLast(new TermInfo(term, tvois[i].StartOffset, tvois[i].EndOffset, poss[i]));
        }
    }

    // sort by position
    Sort(termList);
}
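WildcardMatcher in this variant is project-internal. For readers without that code, a rough stand-in can translate Lucene-style wildcards into a regular expression; this is purely illustrative, not the actual implementation:

using System.Text.RegularExpressions;

// Illustrative stand-in for the project-internal WildcardMatcher used above:
// '*' matches any run of characters, '?' matches exactly one.
public static class SimpleWildcardMatcher
{
    public static bool Matches(string pattern, string term)
    {
        string regex = "^" + Regex.Escape(pattern).Replace("\\*", ".*").Replace("\\?", ".") + "$";
        return Regex.IsMatch(term, regex);
    }
}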