/// <summary>Returns the next token in the stream, or null at EOS.
/// <para>Removes <c>'s</c> from the end of words.</para>
/// <para>Removes dots from acronyms.</para>
/// </summary>
public override Token Next(Token result)
{
    Token t = input.Next(result);
    if (t == null)
        return null;

    char[] buffer = t.TermBuffer();
    int bufferLength = t.TermLength();
    System.String type = t.Type();

    if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' &&
        (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
    {
        // Strip last 2 characters off
        t.SetTermLength(bufferLength - 2);
    }
    else if (type == ACRONYM_TYPE)
    {
        // remove dots, compacting the buffer in place
        int upto = 0;
        for (int i = 0; i < bufferLength; i++)
        {
            char c = buffer[i];
            if (c != '.')
                buffer[upto++] = c;
        }
        t.SetTermLength(upto);
    }

    return t;
}
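// A hedged usage sketch for the filter above: run the standard analysis chain over two
// terms and print what survives. StandardAnalyzer (tokenizer + this filter + lowercasing
// + stop words) is a known Lucene.Net class, but its use here is purely illustrative.
TokenStream ts = new StandardAnalyzer().TokenStream("f", new System.IO.StringReader("Bob's I.B.M."));
Token reusableToken = new Token();
for (Token nextToken = ts.Next(reusableToken); nextToken != null; nextToken = ts.Next(reusableToken))
{
    // Expected (modulo lowercasing): "bob" - the 's stripped - and "ibm" - the dots removed
    System.Console.Out.WriteLine(nextToken.Term());
}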
public virtual void TestToStringAndMultiAttributeImplementations()
{
    AttributeSource src = new AttributeSource();
    TermAttribute termAtt = (TermAttribute) src.AddAttribute(typeof(TermAttribute));
    TypeAttribute typeAtt = (TypeAttribute) src.AddAttribute(typeof(TypeAttribute));
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.SetType("TestType");
    Assert.AreEqual("(" + termAtt.ToString() + "," + typeAtt.ToString() + ")", src.ToString(), "Attributes should appear in original order");

    System.Collections.Generic.IEnumerator<AttributeImpl> it = src.GetAttributeImplsIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext(), "Iterator should have 2 attributes left");
    Assert.AreSame(termAtt, it.Current, "First AttributeImpl from iterator should be termAtt");
    Assert.IsTrue(it.MoveNext(), "Iterator should have 1 attribute left");
    Assert.AreSame(typeAtt, it.Current, "Second AttributeImpl from iterator should be typeAtt");
    Assert.IsFalse(it.MoveNext(), "Iterator should have 0 attributes left");

    src = new AttributeSource();
    src.AddAttributeImpl(new Token());
    // This should not add a new attribute, as Token implements TermAttribute too
    termAtt = (TermAttribute) src.AddAttribute(typeof(TermAttribute));
    Assert.IsTrue(termAtt is Token, "TermAttribute should be implemented by Token");

    // Get the Token attribute and check that it is the only one
    it = src.GetAttributeImplsIterator().GetEnumerator();
    Assert.IsTrue(it.MoveNext());
    Token tok = (Token) it.Current;
    Assert.IsFalse(it.MoveNext(), "There should be only one attribute implementation instance");

    termAtt.SetTermBuffer("TestTerm");
    Assert.AreEqual("(" + tok.ToString() + ")", src.ToString(), "Token should only be printed once");
}
public override Token Next()
{
    if (inPhrase)
    {
        inPhrase = false;
        return new Token("phrase2", savedStart, savedEnd);
    }
    else
    {
        for (Token token = input.Next(); token != null; token = input.Next())
        {
            if (token.TermText().Equals("phrase"))
            {
                inPhrase = true;
                savedStart = token.StartOffset();
                savedEnd = token.EndOffset();
                return new Token("phrase1", savedStart, savedEnd);
            }
            else if (!token.TermText().Equals("stop"))
            {
                return token;
            }
        }
    }
    return null;
}
/// <summary>Returns the next token in the stream, or null at EOS.
/// <para>Removes <c>'s</c> from the end of words.</para>
/// <para>Removes dots from acronyms.</para>
/// </summary>
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);

    Token nextToken = input.Next(reusableToken);
    if (nextToken == null)
        return null;

    char[] buffer = nextToken.TermBuffer();
    int bufferLength = nextToken.TermLength();
    System.String type = nextToken.Type();

    if (type == APOSTROPHE_TYPE && bufferLength >= 2 && buffer[bufferLength - 2] == '\'' &&
        (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
    {
        // Strip last 2 characters off
        nextToken.SetTermLength(bufferLength - 2);
    }
    else if (type == ACRONYM_TYPE)
    {
        // remove dots, compacting the buffer in place
        int upto = 0;
        for (int i = 0; i < bufferLength; i++)
        {
            char c = buffer[i];
            if (c != '.')
                buffer[upto++] = c;
        }
        nextToken.SetTermLength(upto);
    }

    return nextToken;
}
internal static void Test(System.IO.TextReader reader, bool verbose, long bytes)
{
    Analyzer analyzer = new SimpleAnalyzer();
    TokenStream stream = analyzer.TokenStream(null, reader);

    System.DateTime start = System.DateTime.Now;

    int count = 0;
    for (Token t = stream.Next(); t != null; t = stream.Next())
    {
        if (verbose)
        {
            System.Console.Out.WriteLine("Text=" + t.TermText() + " start=" + t.StartOffset() + " end=" + t.EndOffset());
        }
        count++;
    }

    System.DateTime end = System.DateTime.Now;

    // Ticks are 100ns units; convert to milliseconds so the figures printed below are correct
    long time = (end.Ticks - start.Ticks) / System.TimeSpan.TicksPerMillisecond;
    System.Console.Out.WriteLine(time + " milliseconds to extract " + count + " tokens");
    System.Console.Out.WriteLine((time * 1000.0) / count + " microseconds/token");
    System.Console.Out.WriteLine((bytes * 1000.0 * 60.0 * 60.0) / (time * 1000000.0) + " megabytes/hour");
}
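// A minimal sketch of driving the harness above; the file name is an illustrative
// assumption, and FileInfo.Length supplies the byte count the throughput math expects.
System.IO.FileInfo file = new System.IO.FileInfo("words.txt");
using (System.IO.TextReader reader = new System.IO.StreamReader(file.FullName))
{
    Test(reader, false, file.Length);
}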
internal virtual void AddToken(Token token, float score)
{
    if (numTokens < MAX_NUM_TOKENS_PER_GROUP)
    {
        if (numTokens == 0)
        {
            startOffset = matchStartOffset = token.StartOffset();
            endOffset = matchEndOffset = token.EndOffset();
            tot += score;
        }
        else
        {
            startOffset = Math.Min(startOffset, token.StartOffset());
            endOffset = Math.Max(endOffset, token.EndOffset());
            if (score > 0)
            {
                if (tot == 0)
                {
                    matchStartOffset = token.StartOffset();
                    matchEndOffset = token.EndOffset();
                }
                else
                {
                    matchStartOffset = Math.Min(matchStartOffset, token.StartOffset());
                    matchEndOffset = Math.Max(matchEndOffset, token.EndOffset());
                }
                tot += score;
            }
        }
        tokens[numTokens] = token;
        scores[numTokens] = score;
        numTokens++;
    }
}
/* (non-Javadoc)
 * @see Lucene.Net.Highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
 */
public virtual bool IsNewFragment(Token token)
{
    bool isNewFrag = token.EndOffset() >= (fragmentSize * currentNumFrags);
    if (isNewFrag)
    {
        currentNumFrags++;
    }
    return isNewFrag;
}
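// Worked example for the fragmenter above: with fragmentSize = 100 and currentNumFrags = 1,
// a token ending at offset 99 stays in the current fragment, while one ending at 100 starts
// fragment two. A hedged wiring sketch, assuming the usual Lucene.Net highlighter API
// (Highlighter, QueryScorer and SimpleFragmenter); the query is illustrative:
Query query = new TermQuery(new Term("contents", "lucene"));
Highlighter highlighter = new Highlighter(new QueryScorer(query));
highlighter.SetTextFragmenter(new SimpleFragmenter(100));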
public override Token Next()
{
    if (i == TOKENS.Length)
        return null;

    Token t = new Token(TOKENS[i], i, i);
    t.SetPositionIncrement(INCREMENTS[i]);
    i++;
    return t;
}
/*
 * (non-Javadoc)
 *
 * @see Lucene.Net.Analysis.TokenStream#next()
 */
public override Token Next(/* in */ Token reusableToken)
{
    System.Diagnostics.Debug.Assert(reusableToken != null);

    int posIncr = 1;
    while (true)
    {
        int tokenType = scanner.GetNextToken();
        if (tokenType == StandardTokenizerImpl.YYEOF)
        {
            return null;
        }

        if (scanner.Yylength() <= maxTokenLength)
        {
            reusableToken.Clear();
            reusableToken.SetPositionIncrement(posIncr);
            scanner.GetText(reusableToken);
            int start = scanner.Yychar();
            reusableToken.SetStartOffset(start);
            reusableToken.SetEndOffset(start + reusableToken.TermLength());

            // This 'if' should be removed in the next release. For now, it converts
            // invalid acronyms to HOST. When removed, only the 'else' part should
            // remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
            {
                if (replaceInvalidAcronym)
                {
                    reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    reusableToken.SetTermLength(reusableToken.TermLength() - 1); // remove extra '.'
                }
                else
                {
                    reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                }
            }
            else
            {
                reusableToken.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }
            return reusableToken;
        }
        // When we skip a too-long term, we still increment the
        // position increment
        else
        {
            posIncr++;
        }
    }
}
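// Usage sketch for the reusable-token path above (hedged: the input text is an
// illustrative assumption; StandardTokenizer takes a TextReader in this API generation):
Tokenizer tokenizer = new StandardTokenizer(new System.IO.StringReader("visit lucene.apache.org"));
Token reusableToken = new Token();
for (Token nextToken = tokenizer.Next(reusableToken); nextToken != null; nextToken = tokenizer.Next(reusableToken))
{
    // "lucene.apache.org" should come back typed as <HOST> by the scanner
    System.Console.Out.WriteLine(nextToken.Term() + " / " + nextToken.Type());
}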
internal override void addTerm(Token t, RawPostingList p0)
{
    System.Diagnostics.Debug.Assert(docState.TestPoint("FreqProxTermsWriterPerField.addTerm start"));

    FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0;
    System.Diagnostics.Debug.Assert(omitTf || p.docFreq > 0);

    if (omitTf)
    {
        if (docState.docID != p.lastDocID)
        {
            System.Diagnostics.Debug.Assert(docState.docID > p.lastDocID);
            termsHashPerField.writeVInt(0, p.lastDocCode);
            p.lastDocCode = docState.docID - p.lastDocID;
            p.lastDocID = docState.docID;
        }
    }
    else
    {
        if (docState.docID != p.lastDocID)
        {
            System.Diagnostics.Debug.Assert(docState.docID > p.lastDocID);
            // Term not yet seen in the current doc but previously
            // seen in other doc(s) since the last flush

            // Now that we know doc freq for previous doc,
            // write it & lastDocCode
            if (1 == p.docFreq)
            {
                termsHashPerField.writeVInt(0, p.lastDocCode | 1);
            }
            else
            {
                termsHashPerField.writeVInt(0, p.lastDocCode);
                termsHashPerField.writeVInt(0, p.docFreq);
            }
            p.docFreq = 1;
            p.lastDocCode = (docState.docID - p.lastDocID) << 1;
            p.lastDocID = docState.docID;
            writeProx(t, p, fieldState.position);
        }
        else
        {
            p.docFreq++;
            writeProx(t, p, fieldState.position - p.lastPosition);
        }
    }
}
public virtual int Compare(System.Object o1, System.Object o2)
{
    Token t1 = (Token) o1;
    Token t2 = (Token) o2;
    if (t1.StartOffset() > t2.StartOffset())
        return 1;
    if (t1.StartOffset() < t2.StartOffset())
        return -1;
    return 0;
}
public virtual void TestIncrementingPositions()
{
    Analyzer analyzer = new WhitespaceAnalyzer();
    TokenStream ts = analyzer.TokenStream("Field", new System.IO.StringReader("one two three four five"));

    while (true)
    {
        Token token = ts.Next();
        if (token == null)
        {
            break;
        }
        Assert.AreEqual(1, token.GetPositionIncrement(), token.TermText());
    }
}
internal void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode)
{
    Payload payload = t.GetPayload();
    if (payload != null && payload.length > 0)
    {
        termsHashPerField.writeVInt(1, (proxCode << 1) | 1);
        termsHashPerField.writeVInt(1, payload.length);
        termsHashPerField.writeBytes(1, payload.data, payload.offset, payload.length);
        hasPayloads = true;
    }
    else
    {
        termsHashPerField.writeVInt(1, proxCode << 1);
    }
    p.lastPosition = fieldState.position;
}
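// The writer above folds "has payload" into the low bit of the shifted position delta
// (addTerm uses the same trick for doc deltas, where the low bit means freq == 1).
// A self-contained decode sketch of the position encoding; illustrative only - the real
// reader consumes vInts from byte slices, not from an int array:
static void DecodeProx(int[] vInts)
{
    int position = 0;
    int i = 0;
    while (i < vInts.Length)
    {
        int code = vInts[i++];
        position += code >> 1;              // upper bits carry the position delta
        if ((code & 1) != 0)                // low bit flags a payload
        {
            int payloadLength = vInts[i++]; // payload length; its bytes would follow
            System.Console.Out.WriteLine("pos=" + position + " payloadLen=" + payloadLength);
        }
        else
        {
            System.Console.Out.WriteLine("pos=" + position);
        }
    }
}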
internal override void newTerm(Token t, RawPostingList p0)
{
    // First time we're seeing this term since the last
    // flush
    System.Diagnostics.Debug.Assert(docState.TestPoint("FreqProxTermsWriterPerField.newTerm start"));

    FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0;
    p.lastDocID = docState.docID;
    if (omitTf)
    {
        p.lastDocCode = docState.docID;
    }
    else
    {
        p.lastDocCode = docState.docID << 1;
        p.docFreq = 1;
        writeProx(t, p, fieldState.position);
    }
}
public virtual void TestMixupDocs()
{
    Directory dir = NewDirectory();
    IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    iwc.SetMergePolicy(NewLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir, iwc);

    Document doc = new Document();
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.StoreTermVectors = true;
    customType.StoreTermVectorPositions = true;
    customType.StoreTermVectorPayloads = true;
    customType.StoreTermVectorOffsets = Random().NextBoolean();
    Field field = new Field("field", "", customType);

    TokenStream ts = new MockTokenizer(new StringReader("here we go"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field.TokenStream = ts;
    doc.Add(field);
    writer.AddDocument(doc);

    Token withPayload = new Token("withPayload", 0, 11);
    withPayload.Payload = new BytesRef("test");
    ts = new CannedTokenStream(withPayload);
    Assert.IsTrue(ts.HasAttribute<IPayloadAttribute>());
    field.TokenStream = ts;
    writer.AddDocument(doc);

    ts = new MockTokenizer(new StringReader("another"), MockTokenizer.WHITESPACE, true);
    Assert.IsFalse(ts.HasAttribute<IPayloadAttribute>());
    field.TokenStream = ts;
    writer.AddDocument(doc);

    DirectoryReader reader = writer.Reader;
    Terms terms = reader.GetTermVector(1, "field");
    Debug.Assert(terms != null);

    TermsEnum termsEnum = terms.Iterator(null);
    Assert.IsTrue(termsEnum.SeekExact(new BytesRef("withPayload")));
    DocsAndPositionsEnum de = termsEnum.DocsAndPositions(null, null);
    Assert.AreEqual(0, de.NextDoc());
    Assert.AreEqual(0, de.NextPosition());
    Assert.AreEqual(new BytesRef("test"), de.Payload);

    writer.Dispose();
    reader.Dispose();
    dir.Dispose();
}
/// <summary>Returns the next input Token, after being stemmed.</summary>
public override Token Next()
{
    Token token = input.Next();
    if (token == null)
        return null;

    stemmer.SetCurrent(token.TermText());
    try
    {
        stemMethod.Invoke(stemmer, (System.Object[]) EMPTY_ARGS);
    }
    catch (System.Exception e)
    {
        // Preserve the reflective failure as the inner exception
        throw new System.SystemException(e.ToString(), e);
    }

    Token newToken = new Token(stemmer.GetCurrent(), token.StartOffset(), token.EndOffset(), token.Type());
    newToken.SetPositionIncrement(token.GetPositionIncrement());
    return newToken;
}
/* (non-Javadoc)
 * @see Lucene.Net.Highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
 */
public virtual float GetTokenScore(Token token)
{
    System.String termText = token.TermText();

    WeightedTerm queryTerm = (WeightedTerm) termsToFind[termText];
    if (queryTerm == null)
    {
        // not a query term - return
        return 0;
    }

    // found a query term - is it unique in this doc?
    if (!uniqueTermsInFragment.Contains(termText))
    {
        totalScore += queryTerm.GetWeight();
        uniqueTermsInFragment.Add(termText, termText);
    }
    return queryTerm.GetWeight();
}
/// <summary>
/// Returns the next, stemmed, input Token.
/// </summary>
/// <returns>
/// The stemmed form of a token.
/// </returns>
/// <throws>IOException</throws>
public override Token Next()
{
    Token token = input.Next();
    if (token == null)
    {
        return null;
    }
    else
    {
        string str = stemmer.stem(token.TermText());

        // The Java original compared string identity here ("str != token.termText()"),
        // relying on the stemmer returning the same String instance when nothing
        // changed; the port compares by value instead.
        if (!str.Equals(token.TermText()))
        {
            return new Token(str, token.StartOffset(), token.EndOffset(), token.Type());
        }
        return token;
    }
}
public virtual void TestUnicode()
{
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);

    inWords = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testUnicode.txt").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode"));

    sampleUnicode = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resUnicode.htm").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("Unicode"));

    TokenStream in_Renamed = ra.TokenStream("all", inWords);
    RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleUnicode, RussianCharsets.UnicodeRussian);

    for (;;)
    {
        Token token = in_Renamed.Next();
        if (token == null)
        {
            break;
        }
        Token sampleToken = sample.Next();
        Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "Unicode");
    }

    inWords.Close();
    sampleUnicode.Close();
}
public virtual void TestKOI8()
{
    //System.out.println(new java.util.Date());
    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);

    // KOI8
    inWordsKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\testKOI8.txt").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));

    sampleKOI8 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\resKOI8.htm").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));

    TokenStream in_Renamed = ra.TokenStream("all", inWordsKOI8);
    RussianLetterTokenizer sample = new RussianLetterTokenizer(sampleKOI8, RussianCharsets.KOI8);

    for (;;)
    {
        Token token = in_Renamed.Next();
        if (token == null)
        {
            break;
        }
        Token sampleToken = sample.Next();
        Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "KOI8");
    }

    inWordsKOI8.Close();
    sampleKOI8.Close();
}
internal override void addTerm(Token t, RawPostingList p0) { System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start")); TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList)p0; p.freq++; if (doVectorOffsets) { int startOffset = fieldState.offset + t.StartOffset(); int endOffset = fieldState.offset + t.EndOffset(); termsHashPerField.writeVInt(1, startOffset - p.lastOffset); termsHashPerField.writeVInt(1, endOffset - startOffset); p.lastOffset = endOffset; } if (doVectorPositions) { termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition); p.lastPosition = fieldState.position; } }
public virtual void Test1251()
{
    // 1251
    inWords1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\test1251.txt").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));

    sample1251 = new System.IO.StreamReader(
        new System.IO.FileStream(
            new System.IO.FileInfo(dataDir.FullName + @"Analysis\RU\res1251.htm").FullName,
            System.IO.FileMode.Open, System.IO.FileAccess.Read),
        System.Text.Encoding.GetEncoding("iso-8859-1"));

    RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
    TokenStream in_Renamed = ra.TokenStream("", inWords1251);
    RussianLetterTokenizer sample = new RussianLetterTokenizer(sample1251, RussianCharsets.CP1251);

    for (;;)
    {
        Token token = in_Renamed.Next();
        if (token == null)
        {
            break;
        }
        Token sampleToken = sample.Next();
        Assert.AreEqual(token.TermText(), sampleToken == null ? null : sampleToken.TermText(), "1251");
    }

    inWords1251.Close();
    sample1251.Close();
}
public QueryTermVector(System.String queryString, Analyzer analyzer)
{
    if (analyzer != null)
    {
        TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
        if (stream != null)
        {
            System.Collections.ArrayList terms = new System.Collections.ArrayList();
            try
            {
                Token reusableToken = new Token();
                for (Token nextToken = stream.Next(reusableToken); nextToken != null; nextToken = stream.Next(reusableToken))
                {
                    terms.Add(nextToken.Term());
                }
                ProcessTerms((System.String[]) terms.ToArray(typeof(System.String)));
            }
            catch (System.IO.IOException)
            {
            }
        }
    }
}
public QueryTermVector(System.String queryString, Analyzer analyzer)
{
    if (analyzer != null)
    {
        TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(queryString));
        if (stream != null)
        {
            Token next = null;
            System.Collections.ArrayList terms = new System.Collections.ArrayList();
            try
            {
                while ((next = stream.Next()) != null)
                {
                    terms.Add(next.TermText());
                }
                ProcessTerms((System.String[]) terms.ToArray(typeof(System.String)));
            }
            catch (System.IO.IOException)
            {
            }
        }
    }
}
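// Hedged usage sketch for the constructors above. GetTerms()/GetTermFrequencies() are the
// TermFreqVector accessors QueryTermVector implements; the analyzer choice is illustrative:
QueryTermVector qtv = new QueryTermVector("foo bar foo", new StandardAnalyzer());
System.String[] uniqueTerms = qtv.GetTerms();    // e.g. { "bar", "foo" }
int[] frequencies = qtv.GetTermFrequencies();    // e.g. { 1, 2 }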
/// <summary> Fills Lucene token with the current token text.</summary>
internal void GetText(Token t)
{
    t.SetTermBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
}
/// <deprecated> Will be removed in Lucene 3.0. This method is final, as it should
/// not be overridden. Delegates to the backwards compatibility layer.
/// </deprecated>
public override Token Next(Token reusableToken)
{
    return base.Next(reusableToken);
}
public virtual void TestLegalbutVeryLargeOffsets()
{
    Directory dir = NewDirectory();
    IndexWriter iw = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, null));
    Document doc = new Document();

    Token t1 = new Token("foo", 0, int.MaxValue - 500);
    if (Random().NextBoolean())
    {
        t1.Payload = new BytesRef("test");
    }
    Token t2 = new Token("foo", int.MaxValue - 500, int.MaxValue);
    TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
    // store some term vectors for the checkindex cross-check
    ft.StoreTermVectors = true;
    ft.StoreTermVectorPositions = true;
    ft.StoreTermVectorOffsets = true;

    Field field = new Field("foo", tokenStream, ft);
    doc.Add(field);
    iw.AddDocument(doc);
    iw.Dispose();
    dir.Dispose();
}
public override Token Next()
{
    if (currentRealToken == null)
    {
        Token nextRealToken = realStream.Next();
        if (nextRealToken == null)
        {
            return null;
        }

        System.String expansions = (System.String) synonyms[nextRealToken.TermText()];
        if (expansions == null)
        {
            return nextRealToken;
        }

        st = new Tokenizer(expansions, ",");
        if (st.HasMoreTokens())
        {
            currentRealToken = nextRealToken;
        }
        return currentRealToken;
    }
    else
    {
        System.String nextExpandedValue = st.NextToken();
        Token expandedToken = new Token(nextExpandedValue, currentRealToken.StartOffset(), currentRealToken.EndOffset());
        expandedToken.SetPositionIncrement(0);
        if (!st.HasMoreTokens())
        {
            currentRealToken = null;
            st = null;
        }
        return expandedToken;
    }
}
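// Hedged sketch of the data the filter above expects (the map contents are illustrative;
// Tokenizer here is the comma-splitting string tokenizer used above, not an analysis
// Tokenizer). For the input "fast car" the stream would then emit fast(+1) quick(+0)
// rapid(+0) car(+1), with the expansions sharing fast's offsets:
System.Collections.Hashtable synonyms = new System.Collections.Hashtable();
synonyms["fast"] = "quick,rapid";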
// TODO: more tests with other possibilities
private void CheckTokens(Token[] tokens)
{
    Directory dir = NewDirectory();
    RandomIndexWriter riw = new RandomIndexWriter(Random(), dir, Iwc);
    bool success = false;
    try
    {
        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
        ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
        // store some term vectors for the checkindex cross-check
        ft.StoreTermVectors = true;
        ft.StoreTermVectorPositions = true;
        ft.StoreTermVectorOffsets = true;

        Document doc = new Document();
        doc.Add(new Field("body", new CannedTokenStream(tokens), ft));
        riw.AddDocument(doc);
        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Close(riw, dir);
        }
        else
        {
            IOUtils.CloseWhileHandlingException(riw, dir);
        }
    }
}
// Called once per inverted token
internal abstract void add(Token token);
public virtual bool IsNewFragment(Token token)
{
    return false;
}
internal abstract void addTerm(Token t, RawPostingList p);
internal abstract void skippingLongTerm(Token t);
/*
 * (non-Javadoc)
 *
 * @see Lucene.Net.Analysis.TokenStream#next()
 */
public override Token Next(Token result)
{
    int posIncr = 1;
    while (true)
    {
        int tokenType = scanner.GetNextToken();
        if (tokenType == StandardTokenizerImpl.YYEOF)
        {
            return null;
        }

        if (scanner.Yylength() <= maxTokenLength)
        {
            result.Clear();
            result.SetPositionIncrement(posIncr);
            scanner.GetText(result);
            int start = scanner.Yychar();
            result.SetStartOffset(start);
            result.SetEndOffset(start + result.TermLength());

            // This 'if' should be removed in the next release. For now, it converts
            // invalid acronyms to HOST. When removed, only the 'else' part should
            // remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
            {
                if (replaceInvalidAcronym)
                {
                    result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    result.SetTermLength(result.TermLength() - 1); // remove extra '.'
                }
                else
                {
                    result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                }
            }
            else
            {
                result.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
            }
            return result;
        }
        // When we skip a too-long term, we still increment the
        // position increment
        else
        {
            posIncr++;
        }
    }
}
private Term[] TapTerms(Token[] tap)
{
    Term[] terms = new Term[tap.Length];
    for (int i = 0; i < terms.Length; i++)
    {
        terms[i] = new Term("field", tap[i].ToString());
    }
    return terms;
}
/// <summary> Low level api.
/// Returns a token stream or null if no offset info available in index.
/// This can be used to feed the highlighter with a pre-parsed token stream.
///
/// In my tests the speeds to recreate 1000 token streams using this method are:
/// - with TermVector offset only data stored - 420 milliseconds
/// - with TermVector offset AND position data stored - 271 milliseconds
/// (nb timings for TermVector with position data are based on a tokenizer with contiguous
/// positions - no overlaps or gaps)
/// The cost of not using TermPositionVector to store
/// pre-parsed content and using an analyzer to re-parse the original content:
/// - reanalyzing the original content - 980 milliseconds
///
/// The re-analyze timings will typically vary depending on -
/// 1) The complexity of the analyzer code (timings above were using a
/// stemmer/lowercaser/stopword combo)
/// 2) The number of other fields (Lucene reads ALL fields off the disk
/// when accessing just one document field - can cost dear!)
/// 3) Use of compression on field storage - could be faster because of compression (less disk IO)
/// or slower (more CPU burn) depending on the content.
/// </summary>
/// <param name="tpv">the term position vector to reconstruct the token stream from</param>
/// <param name="tokenPositionsGuaranteedContiguous">true if the token position numbers have no overlaps or gaps. If looking
/// to eke out the last drops of performance, set to true. If in doubt, set to false.
/// </param>
public static TokenStream GetTokenStream(TermPositionVector tpv, bool tokenPositionsGuaranteedContiguous)
{
    // an object used to iterate across an array of tokens
    // code to reconstruct the original sequence of Tokens
    System.String[] terms = tpv.GetTerms();
    int[] freq = tpv.GetTermFrequencies();

    int totalTokens = 0;
    for (int t = 0; t < freq.Length; t++)
    {
        totalTokens += freq[t];
    }

    Token[] tokensInOriginalOrder = new Token[totalTokens];
    System.Collections.ArrayList unsortedTokens = null;
    for (int t = 0; t < freq.Length; t++)
    {
        TermVectorOffsetInfo[] offsets = tpv.GetOffsets(t);
        if (offsets == null)
        {
            return null;
        }

        int[] pos = null;
        if (tokenPositionsGuaranteedContiguous)
        {
            // try to get the token position info to speed up assembly of tokens into sorted sequence
            pos = tpv.GetTermPositions(t);
        }

        if (pos == null)
        {
            // tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
            if (unsortedTokens == null)
            {
                unsortedTokens = new System.Collections.ArrayList();
            }
            for (int tp = 0; tp < offsets.Length; tp++)
            {
                unsortedTokens.Add(new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset()));
            }
        }
        else
        {
            // We have positions stored and a guarantee that the token position information is contiguous.
            // This may be fast BUT won't work if Tokenizers are used which create >1 token in the same
            // position or create jumps in position numbers - this code would fail under those circumstances.

            // tokens stored with positions - can use this to index straight into the sorted array
            for (int tp = 0; tp < pos.Length; tp++)
            {
                tokensInOriginalOrder[pos[tp]] = new Token(terms[t], offsets[tp].GetStartOffset(), offsets[tp].GetEndOffset());
            }
        }
    }

    // If the field has been stored without position data we must perform a sort
    if (unsortedTokens != null)
    {
        tokensInOriginalOrder = (Token[]) unsortedTokens.ToArray(typeof(Token));
        Array.Sort(tokensInOriginalOrder, new AnonymousClassComparator());
    }
    return new StoredTokenStream(tokensInOriginalOrder);
}
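// Hedged usage sketch for the method above: pull the stored vector for one document and
// fall back to null when no positions/offsets were indexed. GetTermFreqVector is the
// standard IndexReader accessor; the field name and doc id are illustrative assumptions.
static TokenStream TokenStreamForDoc(IndexReader reader, int docId)
{
    TermFreqVector tfv = reader.GetTermFreqVector(docId, "body");
    if (tfv is TermPositionVector)
    {
        // false = safe default: let GetTokenStream sort by offsets rather than trust positions
        return GetTokenStream((TermPositionVector) tfv, false);
    }
    return null; // no vector stored - the caller must re-analyze the original text
}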
internal StoredTokenStream(Token[] tokens) { this.tokens = tokens; }
// Secondary entry point (for 2nd & subsequent TermsHash),
// because token text has already been "interned" into
// textStart, so we hash by textStart
public void add(Token token, int textStart)
{
    int code = textStart;
    int hashPos = code & postingsHashMask;

    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // Locate RawPostingList in hash
    p = postingsHash[hashPos];
    if (p != null && p.textStart != textStart)
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            p = postingsHash[hashPos];
        }
        while (p != null && p.textStart != textStart);
    }

    if (p == null)
    {
        // First time we are seeing this token since we last
        // flushed the hash.

        // Refill?
        if (0 == perThread.freePostingsCount)
        {
            perThread.morePostings();
        }

        // Pull next free RawPostingList from free list
        p = perThread.freePostings[--perThread.freePostingsCount];
        System.Diagnostics.Debug.Assert(p != null);

        p.textStart = textStart;

        System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
        postingsHash[hashPos] = p;
        numPostings++;

        if (numPostings == postingsHashHalfSize)
        {
            rehashPostings(2 * postingsHashSize);
        }

        // Init stream slices
        if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        {
            intPool.nextBuffer();
        }
        if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
        {
            bytePool.NextBuffer();
        }

        intUptos = intPool.buffer;
        intUptoStart = intPool.intUpto;
        intPool.intUpto += streamCount;

        p.intStart = intUptoStart + intPool.intOffset;

        for (int i = 0; i < streamCount; i++)
        {
            int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
            intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
        }
        p.byteStart = intUptos[intUptoStart];

        consumer.newTerm(token, p);
    }
    else
    {
        intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
        intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
        consumer.addTerm(token, p);
    }
}
internal virtual bool IsDistinct(Token token) { return token.StartOffset() >= endOffset; }
// Primary entry point (for first TermsHash)
internal override void add(Token token)
{
    System.Diagnostics.Debug.Assert(!postingsCompacted);

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    char[] tokenText = token.TermBuffer();
    int tokenTextLen = token.TermLength();

    // Compute hashcode & replace any invalid UTF16 sequences
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0)
    {
        char ch = tokenText[--downto];

        if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
        {
            if (0 == downto)
            {
                // Unpaired
                ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
            }
            else
            {
                char ch2 = tokenText[downto - 1];
                if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                {
                    // OK: high followed by low. This is a valid
                    // surrogate pair.
                    code = ((code * 31) + ch) * 31 + ch2;
                    downto--;
                    continue;
                }
                else
                {
                    // Unpaired
                    ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
                }
            }
        }
        else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
        {
            // Unpaired
            ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
        }

        code = (code * 31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    p = postingsHash[hashPos];
    if (p != null && !postingEquals(tokenText, tokenTextLen))
    {
        // Conflict: keep searching different locations in
        // the hash table.
        int inc = ((code >> 8) + code) | 1;
        do
        {
            code += inc;
            hashPos = code & postingsHashMask;
            p = postingsHash[hashPos];
        }
        while (p != null && !postingEquals(tokenText, tokenTextLen));
    }

    if (p == null)
    {
        // First time we are seeing this token since we last
        // flushed the hash.
        int textLen1 = 1 + tokenTextLen;
        if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
        {
            if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
            {
                // Just skip this term, to remain as robust as
                // possible during indexing. A TokenFilter
                // can be inserted into the analyzer chain if
                // other behavior is wanted (pruning the term
                // to a prefix, throwing an exception, etc).
                if (docState.maxTermPrefix == null)
                {
                    docState.maxTermPrefix = new System.String(tokenText, 0, 30);
                }
                consumer.skippingLongTerm(token);
                return;
            }
            charPool.nextBuffer();
        }

        // Refill?
        if (0 == perThread.freePostingsCount)
        {
            perThread.morePostings();
        }

        // Pull next free RawPostingList from free list
        p = perThread.freePostings[--perThread.freePostingsCount];
        System.Diagnostics.Debug.Assert(p != null);

        char[] text = charPool.buffer;
        int textUpto = charPool.charUpto;
        p.textStart = textUpto + charPool.charOffset;
        charPool.charUpto += textLen1;
        System.Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
        text[textUpto + tokenTextLen] = (char) 0xffff;

        System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
        postingsHash[hashPos] = p;
        numPostings++;

        if (numPostings == postingsHashHalfSize)
        {
            rehashPostings(2 * postingsHashSize);
        }

        // Init stream slices
        if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        {
            intPool.nextBuffer();
        }
        if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
        {
            bytePool.NextBuffer();
        }

        intUptos = intPool.buffer;
        intUptoStart = intPool.intUpto;
        intPool.intUpto += streamCount;

        p.intStart = intUptoStart + intPool.intOffset;

        for (int i = 0; i < streamCount; i++)
        {
            int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
            intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
        }
        p.byteStart = intUptos[intUptoStart];

        consumer.newTerm(token, p);
    }
    else
    {
        intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
        intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
        consumer.addTerm(token, p);
    }

    if (doNextCall)
    {
        nextPerField.add(token, p.textStart);
    }
}
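// Both add() entry points above resolve collisions the same way: open addressing with a
// probe step derived from the hash code itself (the "| 1" keeps the step odd, so on a
// power-of-two table every slot is eventually visited). A self-contained sketch of just
// that probe logic, using 0 as the empty-slot marker (illustrative, not the real table):
static int Probe(int[] table, int key, int mask)
{
    int code = key;
    int hashPos = code & mask;
    int inc = ((code >> 8) + code) | 1; // odd step => full cycle over a power-of-two table
    while (table[hashPos] != 0 && table[hashPos] != key)
    {
        code += inc;
        hashPos = code & mask;
    }
    return hashPos; // either the slot holding key, or the empty slot to claim for it
}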
private Token MakeToken(string text, int posIncr, int startOffset, int endOffset)
{
    Token t = new Token();
    t.Append(text);
    t.PositionIncrement = posIncr;
    t.SetOffset(startOffset, endOffset);
    return t;
}
private static Token MakeToken(string text, int posIncr)
{
    Token t = new Token();
    t.Append(text);
    t.PositionIncrement = posIncr;
    return t;
}
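// These helpers pair naturally with the CannedTokenStream used by the tests in this
// section; a brief, hedged example of stacking a synonym at the same position:
Token[] tokens = new Token[] { MakeToken("fast", 1), MakeToken("quick", 0), MakeToken("car", 1) };
TokenStream ts = new CannedTokenStream(tokens); // "fast" and "quick" share one position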
public float GetTokenScore(Token token) { return 0; }
public virtual void TestZeroPosIncr()
{
    Directory dir = new RAMDirectory();

    Token[] tokens = new Token[3];
    tokens[0] = new Token();
    tokens[0].Append("a");
    tokens[0].PositionIncrement = 1;
    tokens[1] = new Token();
    tokens[1].Append("b");
    tokens[1].PositionIncrement = 0;
    tokens[2] = new Token();
    tokens[2].Append("c");
    tokens[2].PositionIncrement = 0;

    RandomIndexWriter writer = new RandomIndexWriter(Random(), dir);
    Document doc = new Document();
    doc.Add(new TextField("field", new CannedTokenStream(tokens)));
    writer.AddDocument(doc);
    doc = new Document();
    doc.Add(new TextField("field", new CannedTokenStream(tokens)));
    writer.AddDocument(doc);
    IndexReader r = writer.Reader;
    writer.Dispose();

    IndexSearcher s = NewSearcher(r);
    MultiPhraseQuery mpq = new MultiPhraseQuery();
    //mpq.setSlop(1);

    // NOTE: not great that if we do the else clause here we
    // get different scores! MultiPhraseQuery counts that
    // phrase as occurring twice per doc (it should be 1, I
    // think?). this is because MultipleTermPositions is able to
    // return the same position more than once (0, in this
    // case):
    if (true)
    {
        mpq.Add(new Term[] { new Term("field", "b"), new Term("field", "c") }, 0);
        mpq.Add(new Term[] { new Term("field", "a") }, 0);
    }
    else
    {
        mpq.Add(new Term[] { new Term("field", "a") }, 0);
        mpq.Add(new Term[] { new Term("field", "b"), new Term("field", "c") }, 0);
    }

    TopDocs hits = s.Search(mpq, 2);
    Assert.AreEqual(2, hits.TotalHits);
    Assert.AreEqual(hits.ScoreDocs[0].Score, hits.ScoreDocs[1].Score, 1e-5);
    /*
    for (int hit = 0; hit < hits.TotalHits; hit++)
    {
        ScoreDoc sd = hits.ScoreDocs[hit];
        System.out.println("  hit doc=" + sd.Doc + " score=" + sd.Score);
    }
    */
    r.Dispose();
    dir.Dispose();
}
private void InitBlock(HighlighterTest enclosingInstance)
{
    this.enclosingInstance = enclosingInstance;
    lst = new System.Collections.ArrayList();
    Token t;
    t = new Token("hi", 0, 2);
    lst.Add(t);
    t = new Token("hispeed", 0, 8);
    lst.Add(t);
    t = new Token("speed", 3, 8);
    t.SetPositionIncrement(0); // "speed" is stacked on the same position as "hispeed"
    lst.Add(t);
    t = new Token("10", 8, 10);
    lst.Add(t);
    t = new Token("foo", 11, 14);
    lst.Add(t);
    iter = lst.GetEnumerator();
}
public RepeatingTokenStream(System.String val) { t = new Token(val, 0, val.Length); }
public override Token Next(Token result)
{
    if (this.fieldName.Equals("crash") && count++ >= 4)
        throw new System.IO.IOException("I'm experiencing problems");
    return input.Next(result);
}
public void TestNegativePositions()
{
    SinkTokenizer tokens = new SinkTokenizer();
    Token t = new Token();
    t.SetTermText("a");
    t.SetPositionIncrement(0);
    tokens.Add(t);
    t.SetTermText("b");
    t.SetPositionIncrement(1);
    tokens.Add(t);
    t.SetTermText("c");
    tokens.Add(t);

    MockRAMDirectory dir = new MockRAMDirectory();
    IndexWriter w = new IndexWriter(dir, false, new WhitespaceAnalyzer(), true);
    Document doc = new Document();
    doc.Add(new Field("field", tokens));
    w.AddDocument(doc);
    w.Close();

    IndexSearcher s = new IndexSearcher(dir);
    PhraseQuery pq = new PhraseQuery();
    pq.Add(new Term("field", "a"));
    pq.Add(new Term("field", "b"));
    pq.Add(new Term("field", "c"));
    Hits hits = s.Search(pq);
    Assert.AreEqual(1, hits.Length());

    Query q = new SpanTermQuery(new Term("field", "a"));
    hits = s.Search(q);
    Assert.AreEqual(1, hits.Length());

    TermPositions tps = s.GetIndexReader().TermPositions(new Term("field", "a"));
    Assert.IsTrue(tps.Next());
    Assert.AreEqual(1, tps.Freq());
    Assert.AreEqual(-1, tps.NextPosition());

    Assert.IsTrue(_TestUtil.CheckIndex(dir));

    s.Close();
    dir.Close();
}