private static JArray GetTerms(IndexSearcher searcher, int doc, string field)
{
    TermPositionVector termPositionVector = (TermPositionVector)searcher.IndexReader.GetTermFreqVector(doc, field);
    if (termPositionVector == null)
    {
        return null;
    }

    JArray array = new JArray();
    string[] terms = termPositionVector.GetTerms();
    for (int i = 0; i < terms.Length; i++)
    {
        // Collect the character offsets of every occurrence of this term.
        string offset = "";
        foreach (TermVectorOffsetInfo offsetInfo in termPositionVector.GetOffsets(i))
        {
            offset += string.Format("({0},{1})", offsetInfo.StartOffset, offsetInfo.EndOffset);
        }
        array.Add(terms[i] + " " + offset);
    }
    return array;
}
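For context, a minimal usage sketch of the helper above; the directory, document id, and field name here are illustrative assumptions, not part of the original snippet:

// Hypothetical caller: dump every term of the "body" field with its character offsets.
IndexSearcher searcher = new IndexSearcher(directory, true);
JArray terms = GetTerms(searcher, docId, "body");
if (terms != null)
{
    foreach (var entry in terms)
    {
        Console.WriteLine(entry); // e.g. "lucene (10,16)(42,48)"
    }
}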
/// <summary>
/// Gets the document at the specified position in the current search result set.
/// </summary>
/// <param name="index">The zero-based index of the document within the search results.</param>
/// <returns></returns>
public Document this[int index]
{
    get
    {
        int offset = 0;
        // Position vector of the query terms in the index.
        TermPositionVector termPositionVector = (TermPositionVector)this.reader.GetTermFreqVector(this._hits.Id(index), "body");
        // If a position vector exists...
        if (termPositionVector != null)
        {
            int pos = -1;
            for (int i = 0; i < terms.Length; i++)
            {
                // Position of the first matching keyword in the index.
                pos = System.Array.IndexOf<string>(termPositionVector.GetTerms(), terms[i]);
                if (pos > -1)
                {
                    break;
                }
            }
            // If the keyword was found in the index, take its start offset in the body text.
            if (pos > -1)
            {
                TermVectorOffsetInfo[] tvois = termPositionVector.GetOffsets(pos);
                offset = tvois[0].GetStartOffset();
            }
        }
        return new Hit(this._hits.Doc(index), offset);
    }
}
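Both snippets above assume the field carries offset data. GetTermFreqVector only yields a usable TermPositionVector when the field was indexed accordingly, as the tests below also show; a minimal sketch of the required index-time declaration, with the field name "body" assumed to match the snippets above:

// Index-time declaration required for the offset lookups above to succeed.
doc.Add(new Field("body", bodyText, Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS));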
public virtual void TestTermVectorsFieldOrder()
{
    Directory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    doc.Add(new Field("c", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("a", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("b", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.Add(new Field("x", "some content here", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    IndexReader reader = IndexReader.Open(dir);
    // Vectors come back in lexicographic field order, not the order the fields were added in.
    TermFreqVector[] v = reader.GetTermFreqVectors(0);
    Assert.AreEqual(4, v.Length);
    System.String[] expectedFields = new System.String[] { "a", "b", "c", "x" };
    int[] expectedPositions = new int[] { 1, 2, 0 };
    for (int i = 0; i < v.Length; i++)
    {
        TermPositionVector posVec = (TermPositionVector)v[i];
        Assert.AreEqual(expectedFields[i], posVec.GetField());
        System.String[] terms = posVec.GetTerms();
        Assert.AreEqual(3, terms.Length);
        Assert.AreEqual("content", terms[0]);
        Assert.AreEqual("here", terms[1]);
        Assert.AreEqual("some", terms[2]);
        for (int j = 0; j < 3; j++)
        {
            int[] positions = posVec.GetTermPositions(j);
            Assert.AreEqual(1, positions.Length);
            Assert.AreEqual(expectedPositions[j], positions[0]);
        }
    }
}
public virtual void TestOffsetReader()
{
    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
    Assert.IsTrue(reader != null);
    TermPositionVector vector = (TermPositionVector)reader.Get(0, testFields[0]);
    Assert.IsTrue(vector != null);
    System.String[] terms = vector.GetTerms();
    Assert.IsTrue(terms != null);
    Assert.IsTrue(terms.Length == testTerms.Length);
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        Assert.IsTrue(term.Equals(testTerms[i]));
        int[] positions = vector.GetTermPositions(i);
        Assert.IsTrue(positions != null);
        Assert.IsTrue(positions.Length == this.positions[i].Length);
        for (int j = 0; j < positions.Length; j++)
        {
            int position = positions[j];
            Assert.IsTrue(position == this.positions[i][j]);
        }
        TermVectorOffsetInfo[] offset = vector.GetOffsets(i);
        Assert.IsTrue(offset != null);
        Assert.IsTrue(offset.Length == this.offsets[i].Length);
        for (int j = 0; j < offset.Length; j++)
        {
            TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
            Assert.IsTrue(termVectorOffsetInfo.Equals(offsets[i][j]));
        }
    }
}
private static void CompareTermData(Directory dir, string str)
{
    IndexSearcher searcher = new IndexSearcher(dir, true);
    var tf = searcher.IndexReader.GetTermFreqVectors(0);
    TermPositionVector tpMorph = (TermPositionVector)tf[0];
    TermPositionVector tpSimple = (TermPositionVector)tf[1];
    for (int i = 0; i < 4; i++)
    {
        // Positions must agree between the morphological and simple analyzers...
        int[] posMorph = tpMorph.GetTermPositions(i);
        int[] posSimple = tpSimple.GetTermPositions(i);
        for (int j = 0; j < posSimple.Length; j++)
        {
            Assert.Equal(posSimple[j], posMorph[j]);
        }
        // ...and so must the character offsets.
        TermVectorOffsetInfo[] offMorph = tpMorph.GetOffsets(i);
        TermVectorOffsetInfo[] offSimple = tpSimple.GetOffsets(i);
        for (int j = 0; j < offSimple.Length; j++)
        {
            Console.WriteLine(str.Substring(offSimple[j].StartOffset, offSimple[j].EndOffset - offSimple[j].StartOffset));
            Assert.Equal(offSimple[j].StartOffset, offMorph[j].StartOffset);
            Assert.Equal(offSimple[j].EndOffset, offMorph[j].EndOffset);
        }
    }
}
public FieldTermStack(IndexReader reader, int docId, String fieldName, FieldQuery fieldQuery)
{
    this.fieldName = fieldName;
    TermFreqVector tfv = reader.GetTermFreqVector(docId, fieldName);
    if (tfv == null)
    {
        return; // just return to make null snippets
    }
    TermPositionVector tpv = null;
    try
    {
        tpv = (TermPositionVector)tfv;
    }
    catch (InvalidCastException)
    {
        return; // just return to make null snippets
    }

    List<String> termSet = fieldQuery.getTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
    {
        return;
    }

    foreach (String term in tpv.GetTerms())
    {
        if (!termSet.Contains(term))
        {
            continue;
        }
        int index = tpv.IndexOf(term);
        TermVectorOffsetInfo[] tvois = tpv.GetOffsets(index);
        if (tvois == null)
        {
            return; // just return to make null snippets
        }
        int[] poss = tpv.GetTermPositions(index);
        if (poss == null)
        {
            return; // just return to make null snippets
        }
        for (int i = 0; i < tvois.Length; i++)
        {
            termList.AddLast(new TermInfo(term, tvois[i].GetStartOffset(), tvois[i].GetEndOffset(), poss[i]));
        }
    }

    // sort by position
    Sort(termList);
}
// Search that also reports the position of each hit within the text.
public void DoSearch(String db, String querystr, global::Lucene.Net.Store.Directory indexDirectory)
{
    // 1. Specify the analyzer for tokenizing text.
    //    The same analyzer should be used as was used for indexing.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30, ListStopWords);

    // 2. Query.
    Query q = new QueryParser(Version.LUCENE_30, "LineText", analyzer).Parse(querystr);

    // 3. Search.
    int hitsPerPage = 10;
    IndexSearcher searcher = new IndexSearcher(indexDirectory, true);
    IndexReader reader = IndexReader.Open(indexDirectory, true);
    searcher.SetDefaultFieldSortScoring(true, false);
    TopScoreDocCollector collector = TopScoreDocCollector.Create(hitsPerPage, true);
    searcher.Search(q, collector);
    ScoreDoc[] hits = collector.TopDocs().ScoreDocs;

    // 4. Display term positions and term indexes.
    MessageBox.Show("Found " + hits.Length + " hits.");
    for (int i = 0; i < hits.Length; ++i)
    {
        int docId = hits[i].Doc;
        ITermFreqVector tfvector = reader.GetTermFreqVector(docId, "LineText");
        TermPositionVector tpvector = (TermPositionVector)tfvector;
        // This part works only if there is one term in the query string;
        // otherwise you have to iterate this section over the query terms.
        int termidx = tfvector.IndexOf(querystr);
        int[] termposx = tpvector.GetTermPositions(termidx);
        TermVectorOffsetInfo[] tvoffsetinfo = tpvector.GetOffsets(termidx);

        for (int j = 0; j < termposx.Length; j++)
        {
            MessageBox.Show("termpos : " + termposx[j]);
        }
        for (int j = 0; j < tvoffsetinfo.Length; j++)
        {
            int offsetStart = tvoffsetinfo[j].StartOffset;
            int offsetEnd = tvoffsetinfo[j].EndOffset;
            MessageBox.Show("offsets : " + offsetStart + " " + offsetEnd);
        }

        // Print some info about where the hit was found.
        Document d = searcher.Doc(docId);
        MessageBox.Show((i + 1) + ". " + d.Get("path"));
    }

    // The searcher can only be closed when there is
    // no need to access the documents any more.
    searcher.Dispose();
}
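As the comment in the loop notes, the single IndexOf call only covers one-term queries. A hedged sketch of the multi-term variant of that section; splitting the raw query string on whitespace is an assumption here, and a production version would extract terms from the parsed Query instead:

// Iterate the position/offset lookup over each query term.
foreach (string t in querystr.Split(' '))
{
    int idx = tfvector.IndexOf(t.ToLower()); // StandardAnalyzer lowercases indexed terms
    if (idx < 0)
    {
        continue; // term not present in this document's vector
    }
    int[] positions = tpvector.GetTermPositions(idx);
    TermVectorOffsetInfo[] offsets = tpvector.GetOffsets(idx);
    // ...report positions and offsets as in the loops above...
}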
/// <summary>
/// Add a complete document specified by all its term vectors.
/// If the document has no term vectors, add a value for tvx.
/// </summary>
/// <param name="vectors"></param>
/// <throws>IOException</throws>
public void AddAllDocVectors(TermFreqVector[] vectors)
{
    OpenDocument();
    if (vectors != null)
    {
        for (int i = 0; i < vectors.Length; i++)
        {
            bool storePositionWithTermVector = false;
            bool storeOffsetWithTermVector = false;
            try
            {
                // If the vector carries positions and/or offsets, preserve them in the copy.
                TermPositionVector tpVector = (TermPositionVector)vectors[i];
                if (tpVector.Size() > 0 && tpVector.GetTermPositions(0) != null)
                {
                    storePositionWithTermVector = true;
                }
                if (tpVector.Size() > 0 && tpVector.GetOffsets(0) != null)
                {
                    storeOffsetWithTermVector = true;
                }
                FieldInfo fieldInfo = fieldInfos.FieldInfo(tpVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
                for (int j = 0; j < tpVector.Size(); j++)
                {
                    AddTermInternal(tpVector.GetTerms()[j], tpVector.GetTermFrequencies()[j], tpVector.GetTermPositions(j), tpVector.GetOffsets(j));
                }
                CloseField();
            }
            catch (System.InvalidCastException)
            {
                // Plain frequency vector: there are no positions or offsets to copy.
                TermFreqVector tfVector = vectors[i];
                FieldInfo fieldInfo = fieldInfos.FieldInfo(tfVector.GetField());
                OpenField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
                for (int j = 0; j < tfVector.Size(); j++)
                {
                    AddTermInternal(tfVector.GetTerms()[j], tfVector.GetTermFrequencies()[j], null, null);
                }
                CloseField();
            }
        }
    }
    CloseDocument();
}
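A plausible call site, sketched under the assumption that reader is an IndexReader over a source index and termVectorsWriter is an open writer instance exposing the method above (both names are illustrative):

// Copy one document's complete term vectors (terms, frequencies,
// positions, offsets) from a source index into the writer.
TermFreqVector[] vectors = reader.GetTermFreqVectors(docId);
termVectorsWriter.AddAllDocVectors(vectors);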
public virtual void TestMixedVectrosVectors()
{
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    // The same field is added with every term-vector flavor; the merged field
    // ends up with the strongest setting (positions and offsets).
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_OFFSETS));
    doc.Add(new Field("field", "one", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.AddDocument(doc);
    writer.Close();

    searcher = new IndexSearcher(directory);
    Query query = new TermQuery(new Term("field", "one"));
    ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
    Assert.AreEqual(1, hits.Length);

    TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[0].doc);
    Assert.IsTrue(vector != null);
    Assert.IsTrue(vector.Length == 1);
    TermPositionVector tfv = (TermPositionVector)vector[0];
    Assert.IsTrue(tfv.GetField().Equals("field"));
    System.String[] terms = tfv.GetTerms();
    Assert.AreEqual(1, terms.Length);
    Assert.AreEqual(terms[0], "one");
    Assert.AreEqual(5, tfv.GetTermFrequencies()[0]);

    int[] positions = tfv.GetTermPositions(0);
    Assert.AreEqual(5, positions.Length);
    for (int i = 0; i < 5; i++)
    {
        Assert.AreEqual(i, positions[i]);
    }
    TermVectorOffsetInfo[] offsets = tfv.GetOffsets(0);
    Assert.AreEqual(5, offsets.Length);
    for (int i = 0; i < 5; i++)
    {
        Assert.AreEqual(4 * i, offsets[i].GetStartOffset());
        Assert.AreEqual(4 * i + 3, offsets[i].GetEndOffset());
    }
}
public FieldTermStack(IndexReader reader, int docId, String fieldName, FieldQuery fieldQuery, IState state)
{
    this.fieldName = fieldName;
    var tfv = reader.GetTermFreqVector(docId, fieldName, state);
    if (tfv == null)
    {
        return; // just return to make null snippets
    }
    TermPositionVector tpv = null;
    try
    {
        tpv = (TermPositionVector)tfv;
    }
    catch (InvalidCastException)
    {
        return; // just return to make null snippets
    }

    List<String> termSet = fieldQuery.getTermSet(fieldName);
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
    {
        return;
    }

    var needwildcard = termSet.Any(x => x.IndexOfAny(new char[] { '*', '?' }) != -1);
    foreach (String term in tpv.GetTerms())
    {
        if (needwildcard)
        {
            if (termSet.Any(ts => WildcardMatcher.Matches(ts, term)) == false)
            {
                continue;
            }
        }
        else if (!termSet.Contains(term))
        {
            continue;
        }
        int index = tpv.IndexOf(term);
        TermVectorOffsetInfo[] tvois = tpv.GetOffsets(index);
        if (tvois == null)
        {
            return; // just return to make null snippets
        }
        int[] poss = tpv.GetTermPositions(index);
        if (poss == null)
        {
            return; // just return to make null snippets
        }
        for (int i = 0; i < tvois.Length; i++)
        {
            termList.AddLast(new TermInfo(term, tvois[i].StartOffset, tvois[i].EndOffset, poss[i]));
        }
    }

    // sort by position
    Sort(termList);
}
public static TokenStream GetTokenStream(TermPositionVector tpv)
{
    // Assumes the worst and makes no assumptions about token position sequences.
    return GetTokenStream(tpv, false);
}
/// <summary>
/// Low level api.
/// Returns a token stream or null if no offset info available in index.
/// This can be used to feed the highlighter with a pre-parsed token stream.
///
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// - with TermVector offset only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
/// (nb timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store
/// pre-parsed content and using an analyzer to re-parse the original content:
/// - reanalyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on -
/// 1) The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - can cost dear!)
/// 3) Use of compression on field storage - could be faster due to compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv"/>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.</param>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
    // Code to reconstruct the original sequence of Tokens.
    String[] terms = tpv.GetTerms();
    int[] freq = tpv.GetTermFrequencies();
    int totalTokens = freq.Sum();
    var tokensInOriginalOrder = new Token[totalTokens];
    List<Token> unsortedTokens = null;
    for (int t = 0; t < freq.Length; t++)
    {
        TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
        if (offsets == null)
        {
            return null;
        }

        int[] pos = null;
        if (tokenPositionsGuaranteedContiguous)
        {
            // Try to get the token position info to speed up assembly of tokens into a sorted sequence.
            pos = tpv.GetTermPositions(t);
        }
        if (pos == null)
        {
            // Tokens NOT stored with positions, or not guaranteed contiguous - add to a list and sort later.
            if (unsortedTokens == null)
            {
                unsortedTokens = new List<Token>();
            }
            foreach (TermVectorOffsetInfo t1 in offsets)
            {
                var token = new Token(t1.StartOffset, t1.EndOffset);
                token.SetTermBuffer(terms[t]);
                unsortedTokens.Add(token);
            }
        }
        else
        {
            // We have positions stored and a guarantee that the token position information is contiguous.
            // This may be fast BUT won't work with Tokenizers that create more than one token in the
            // same position, or that create jumps in position numbers.
            // Tokens stored with positions - use the position to index straight into the sorted array.
            for (int tp = 0; tp < pos.Length; tp++)
            {
                var token = new Token(terms[t], offsets[tp].StartOffset, offsets[tp].EndOffset);
                tokensInOriginalOrder[pos[tp]] = token;
            }
        }
    }

    // If the field has been stored without position data we must perform a sort, ordering tokens by start offset.
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = unsortedTokens.ToArray();
        Array.Sort(tokensInOriginalOrder, (t1, t2) =>
        {
            if (t1.StartOffset > t2.StartOffset)
            {
                return 1;
            }
            if (t1.StartOffset < t2.StartOffset)
            {
                return -1;
            }
            return 0;
        });
    }
    return new StoredTokenStream(tokensInOriginalOrder);
}
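A sketch of feeding the reconstructed stream to the contrib Highlighter, as the doc comment suggests. The Highlighter/QueryScorer calls follow the Lucene.Net 3.0 contrib API, and reader, docId, query, and originalText are assumed to be in scope:

// Rebuild the token stream from the stored term vector instead of re-analyzing,
// then hand it to the highlighter together with the original stored text.
var tpv = (TermPositionVector)reader.GetTermFreqVector(docId, "body");
if (tpv != null)
{
    TokenStream stream = GetTokenStream(tpv, false);
    if (stream != null) // null when the field was stored without offsets
    {
        var highlighter = new Highlighter(new QueryScorer(query));
        string fragment = highlighter.GetBestFragment(stream, originalText);
    }
}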
public virtual void TestTermPositionVectors()
{
    Query query = new TermQuery(new Term("field", "zero"));
    try
    {
        ScoreDoc[] hits = searcher.Search(query, null, 1000).scoreDocs;
        Assert.AreEqual(1, hits.Length);
        for (int i = 0; i < hits.Length; i++)
        {
            TermFreqVector[] vector = searcher.reader_ForNUnit.GetTermFreqVectors(hits[i].doc);
            Assert.IsTrue(vector != null);
            Assert.IsTrue(vector.Length == 1);

            // Even-numbered docs were indexed with positions; every third doc with offsets.
            bool shouldBePosVector = hits[i].doc % 2 == 0;
            Assert.IsTrue(!shouldBePosVector || vector[0] is TermPositionVector);
            bool shouldBeOffVector = hits[i].doc % 3 == 0;
            Assert.IsTrue(!shouldBeOffVector || vector[0] is TermPositionVector);

            if (shouldBePosVector || shouldBeOffVector)
            {
                TermPositionVector posVec = (TermPositionVector)vector[0];
                System.String[] terms = posVec.GetTerms();
                Assert.IsTrue(terms != null && terms.Length > 0);
                for (int j = 0; j < terms.Length; j++)
                {
                    int[] positions = posVec.GetTermPositions(j);
                    TermVectorOffsetInfo[] offsets = posVec.GetOffsets(j);
                    if (shouldBePosVector)
                    {
                        Assert.IsTrue(positions != null);
                        Assert.IsTrue(positions.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(positions == null);
                    }
                    if (shouldBeOffVector)
                    {
                        Assert.IsTrue(offsets != null);
                        Assert.IsTrue(offsets.Length > 0);
                    }
                    else
                    {
                        Assert.IsTrue(offsets == null);
                    }
                }
            }
            else
            {
                try
                {
                    // Plain frequency vector: the cast must fail.
                    TermPositionVector posVec = (TermPositionVector)vector[0];
                    Assert.Fail("Casting to TermPositionVector should have thrown");
                }
                catch (System.InvalidCastException)
                {
                    TermFreqVector freqVec = vector[0];
                    System.String[] terms = freqVec.GetTerms();
                    Assert.IsTrue(terms != null && terms.Length > 0);
                }
            }
        }
    }
    catch (System.IO.IOException)
    {
        Assert.Fail("Unexpected IOException");
    }
}
public static void VerifyEquals(TermFreqVector[] d1, TermFreqVector[] d2)
{
    if (d1 == null)
    {
        Assert.IsTrue(d2 == null);
        return;
    }
    Assert.IsTrue(d2 != null);
    Assert.AreEqual(d1.Length, d2.Length);
    for (int i = 0; i < d1.Length; i++)
    {
        TermFreqVector v1 = d1[i];
        TermFreqVector v2 = d2[i];
        if (v1 == null || v2 == null)
        {
            System.Console.Out.WriteLine("v1=" + v1 + " v2=" + v2 + " i=" + i + " of " + d1.Length);
        }
        Assert.AreEqual(v1.Size(), v2.Size());
        int numTerms = v1.Size();
        System.String[] terms1 = v1.GetTerms();
        System.String[] terms2 = v2.GetTerms();
        int[] freq1 = v1.GetTermFrequencies();
        int[] freq2 = v2.GetTermFrequencies();
        for (int j = 0; j < numTerms; j++)
        {
            if (!terms1[j].Equals(terms2[j]))
            {
                Assert.AreEqual(terms1[j], terms2[j]);
            }
            Assert.AreEqual(freq1[j], freq2[j]);
        }
        if (v1 is TermPositionVector)
        {
            Assert.IsTrue(v2 is TermPositionVector);
            TermPositionVector tpv1 = (TermPositionVector)v1;
            TermPositionVector tpv2 = (TermPositionVector)v2;
            for (int j = 0; j < numTerms; j++)
            {
                int[] pos1 = tpv1.GetTermPositions(j);
                int[] pos2 = tpv2.GetTermPositions(j);
                Assert.AreEqual(pos1.Length, pos2.Length);
                TermVectorOffsetInfo[] offsets1 = tpv1.GetOffsets(j);
                TermVectorOffsetInfo[] offsets2 = tpv2.GetOffsets(j);
                if (offsets1 == null)
                {
                    Assert.IsTrue(offsets2 == null);
                }
                else
                {
                    Assert.IsTrue(offsets2 != null);
                }
                for (int k = 0; k < pos1.Length; k++)
                {
                    Assert.AreEqual(pos1[k], pos2[k]);
                    if (offsets1 != null)
                    {
                        Assert.AreEqual(offsets1[k].GetStartOffset(), offsets2[k].GetStartOffset());
                        Assert.AreEqual(offsets1[k].GetEndOffset(), offsets2[k].GetEndOffset());
                    }
                }
            }
        }
    }
}
private static List<DataForIndex> getResult(List<DataForIndex> dfi, ScoreDoc[] hits, string term, IndexSearcher searcher, Query query, MultiFieldQueryParser parser)
{
    // No hits for the full term: fall back to a partial-word search.
    if (hits.Length == 0)
    {
        term = searchByPartialWords(term);
        query = parseQuery(term, parser);
        hits = searcher.Search(query, 100).ScoreDocs;
    }
    foreach (var scoreDoc in hits)
    {
        var doc = searcher.Doc(scoreDoc.Doc);
        var score = scoreDoc.Score;
        DataForIndex listdata = new DataForIndex();
        listdata.ID = int.Parse(doc.Get("ID"));
        listdata.FileName = doc.Get("FileName");
        listdata.SearchWord = term;
        listdata.FileExtension = doc.Get("FileExtension");
        listdata.AudioGenre = doc.Get("AudioGenre");
        listdata.AudioAlbum = doc.Get("AudioAlbum");
        listdata.AudioBitrate = doc.Get("AudioBitrate");
        listdata.AudioDuration = doc.Get("AudioDuration");
        listdata.Label = doc.Get("Label");
        if (isFarsiArabic)
        {
            listdata.IsFarsiArabic = true;
            listdata.HarfArabic = HarfArabic;
            listdata.HarfFarsi = HarfFarsi;
        }
        listdata.Score = score;

        // Locate the first word of the phrase in the "Body" term vector and
        // build a short excerpt around each of its occurrences.
        TermPositionVector obj_vector = (TermPositionVector)searcher.IndexReader.GetTermFreqVector(scoreDoc.Doc, "Body");
        int int_phraseIndex = obj_vector.IndexOf(term.Split(' ').FirstOrDefault());
        TermVectorOffsetInfo[] obj_offsetInfo = obj_vector.GetOffsets(int_phraseIndex);
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < obj_offsetInfo.Length; i++)
        {
            string body = doc.Get("Body");
            int start = obj_offsetInfo[i].StartOffset;
            int end = body.Length;
            int count = 100;
            if (start + count <= end)
            {
                end = start + count;
            }
            if (start > count)
            {
                start = start - count;
            }
            else
            {
                start = 0;
            }
            text.Append(body.Substring(start, end - start) + " # ");
        }
        listdata.ResultText = text.ToString();

        // Merge results that point at the same file.
        if (dfi.FirstOrDefault(x => x.FileName == listdata.FileName) == null)
        {
            dfi.Add(listdata);
        }
        else
        {
            var del = dfi.FirstOrDefault(x => x.FileName == listdata.FileName);
            dfi.Remove(del);
            del.SearchWord = del.SearchWord + " + " + listdata.SearchWord;
            del.ResultText = del.ResultText + " " + listdata.ResultText;
            dfi.Add(del);
        }
    }
    return dfi;
}