/// <summary>
/// Verifies CaptureState/RestoreState round-trips attribute values, that a captured
/// state can be restored into a differently-ordered AttributeSource (leaving extra
/// attributes untouched), and that restoring into a source missing a captured
/// attribute throws ArgumentException.
/// </summary>
public virtual void TestCaptureState()
{
    // init a first instance
    AttributeSource src = new AttributeSource();
    TermAttribute termAtt = (TermAttribute)src.AddAttribute(typeof(TermAttribute));
    TypeAttribute typeAtt = (TypeAttribute)src.AddAttribute(typeof(TypeAttribute));
    termAtt.SetTermBuffer("TestTerm");
    typeAtt.SetType("TestType");
    int hashCode = src.GetHashCode();
    AttributeSource.State state = src.CaptureState();

    // modify the attributes
    termAtt.SetTermBuffer("AnotherTestTerm");
    typeAtt.SetType("AnotherTestType");
    Assert.IsTrue(hashCode != src.GetHashCode(), "Hash code should be different");

    src.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.Term());
    Assert.AreEqual("TestType", typeAtt.Type());
    Assert.AreEqual(hashCode, src.GetHashCode(), "Hash code should be equal after restore");

    // restore into an exact configured copy
    AttributeSource copy = new AttributeSource();
    copy.AddAttribute(typeof(TermAttribute));
    copy.AddAttribute(typeof(TypeAttribute));
    copy.RestoreState(state);
    Assert.AreEqual(src.GetHashCode(), copy.GetHashCode(), "Both AttributeSources should have same hashCode after restore");
    Assert.AreEqual(src, copy, "Both AttributeSources should be equal after restore");

    // init a second instance (with attributes in different order and one additional attribute)
    AttributeSource src2 = new AttributeSource();
    typeAtt = (TypeAttribute)src2.AddAttribute(typeof(TypeAttribute));
    Lucene.Net.Analysis.Tokenattributes.FlagsAttribute flagsAtt = (Lucene.Net.Analysis.Tokenattributes.FlagsAttribute)src2.AddAttribute(typeof(Lucene.Net.Analysis.Tokenattributes.FlagsAttribute));
    termAtt = (TermAttribute)src2.AddAttribute(typeof(TermAttribute));
    flagsAtt.SetFlags(12345);

    src2.RestoreState(state);
    Assert.AreEqual("TestTerm", termAtt.Term());
    Assert.AreEqual("TestType", typeAtt.Type());
    Assert.AreEqual(12345, flagsAtt.GetFlags(), "FlagsAttribute should not be touched");

    // init a third instance missing one Attribute
    AttributeSource src3 = new AttributeSource();
    termAtt = (TermAttribute)src3.AddAttribute(typeof(TermAttribute));
    try
    {
        src3.RestoreState(state);
        Assert.Fail("The third instance is missing the TypeAttribute, so restoreState() should throw IllegalArgumentException");
    }
    catch (System.ArgumentException)
    {
        // Fix: the caught exception was previously bound to an unused local `iae`
        // (compiler warning CS0168); the variable is dropped since only the type matters.
    }
}
/// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
/// <param name="r">a source of text to be tokenized</param>
/// <param name="termFreqMap">a Map of terms and their frequencies</param>
/// <param name="fieldName">Used by analyzer for any special per-field analysis</param>
private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
{
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    try
    {
        int tokenCount = 0;
        // for every token
        TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
        while (ts.IncrementToken())
        {
            string word = termAtt.Term();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed)
            {
                break; // cap how much text is analyzed per call
            }
            if (IsNoiseWord(word))
            {
                continue;
            }

            // increment frequency
            Int cnt = (Int)termFreqMap[word];
            if (cnt == null)
            {
                // first sighting of this word; Int's initial count is presumably 1 -- TODO confirm
                termFreqMap[word] = new Int();
            }
            else
            {
                cnt.x++;
            }
        }
    }
    finally
    {
        // Fix: the stream was never closed, leaking analyzer resources on every call.
        ts.Close();
    }
}
/// <summary>
/// Tokenizes <paramref name="r"/> with the analyzer registered for
/// <paramref name="fieldName"/> and accumulates per-term frequencies into
/// <paramref name="termFreqMap"/>, skipping noise words and stopping once the
/// parse budget is exhausted.
/// </summary>
/// <param name="r">source of text to be tokenized</param>
/// <param name="termFreqMap">map of term to frequency counter (Int)</param>
/// <param name="fieldName">field name; selects the analyzer and drives per-field analysis</param>
protected new void AddTermFrequencies(System.IO.StreamReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
{
    var analyzer = Analyzers[fieldName];
    TokenStream ts = analyzer.TokenStream(fieldName, r);
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    int seen = 0;
    while (ts.IncrementToken())
    {
        // for every token
        System.String word = termAtt.Term();
        seen++;
        if (seen > GetMaxNumTokensParsed())
        {
            break;
        }
        if (IsNoiseWord(word))
        {
            continue;
        }

        // increment frequency
        var counter = (Int)termFreqMap[word];
        if (counter != null)
        {
            counter.x++;
        }
        else
        {
            termFreqMap[word] = new Int();
        }
    }
}
/// <summary>
/// Exercises UnaccentedWordAnalyzer on a mixed-script input: first checks the emitted
/// token stream against the expected lowercased/unaccented tokens, then builds a
/// single-document index from the same text and verifies hit counts for a set of
/// parsed queries.
/// </summary>
public void TestUnaccentedWordAnalyzer()
{
    string text = "[email protected] 123.456 ğüşıöç%ĞÜŞİÖÇ$ΑΒΓΔΕΖ#АБВГДЕ SSß";
    string[] expectedTokens = new string[] { "name", "surname", "gmail", "com", "123", "456", "gusioc", "gusioc", "αβγδεζ", "абвгде", "ssss" };

    UnaccentedWordAnalyzer analyzer = new UnaccentedWordAnalyzer();
    TokenStream ts = analyzer.TokenStream("", new System.IO.StringReader(text));
    TermAttribute termAttribute = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

    // Verify every emitted token, in order.
    int index = 0;
    while (ts.IncrementToken())
    {
        Assert.AreEqual(expectedTokens[index++], termAttribute.Term());
        System.Diagnostics.Debug.WriteLine(termAttribute.Term());
    }

    QueryParser p = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "field", analyzer);
    IndexSearcher src = CreateIndex(text, analyzer);

    TopDocs td = src.Search(p.Parse("ĞÜŞıöç"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("name"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("surname"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("NAME.surname"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("surname@gmail"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("name@gmail"), 10);
    Assert.AreEqual(0, td.totalHits);
    td = src.Search(p.Parse("456"), 10);
    Assert.AreEqual(1, td.totalHits);
    td = src.Search(p.Parse("123.456"), 10);
    Assert.AreEqual(1, td.totalHits);
}
/// <summary>
/// Emits one token per call. A single input token may stem to several terms: the first
/// stem replaces the input token in place, and the remaining stems are buffered and
/// emitted on subsequent calls at the same position (position increment 0).
/// </summary>
/// <returns>true while tokens remain (buffered stems or fresh input); false at end of input</returns>
public override Boolean IncrementToken()
{
    if (_buffer.Any())
    {
        // Drain previously-computed stems: restore the attribute state captured when
        // the original token was produced, then overwrite just the term text.
        var nextStem = _buffer.Dequeue();
        RestoreState(_savedState);
        _posIncAtt.SetPositionIncrement(0); // same position as the original token
        _termAtt.SetTermBuffer(nextStem.Stem, 0, nextStem.StemLength);
        return (true);
    }
    if (!input.IncrementToken())
    {
        return (false);
    }
    // _dedup collapses duplicate stems via UniqueStems; otherwise all stems are kept.
    var newTerms = _dedup ? _stemmer.UniqueStems(_termAtt.Term()) : _stemmer.Stem(_termAtt.Term());
    foreach (var newTerm in newTerms)
    {
        _buffer.Enqueue(newTerm);
    }
    if (_buffer.Count == 0)
    {
        // we do not know this word, return it unchanged
        return (true);
    }
    // First stem replaces the current token's term text directly.
    var stem = _buffer.Dequeue();
    _termAtt.SetTermBuffer(stem.Stem, 0, stem.StemLength);
    if (_buffer.Count > 0)
    {
        // More stems remain: capture full attribute state so later calls can emit
        // the buffered stems with identical offsets/type.
        _savedState = CaptureState();
    }
    return (true);
}
/// <summary>
/// Runs <paramref name="text"/> through the Stemmer analyzer and returns the stemmed
/// tokens joined by single spaces.
/// </summary>
/// <param name="text">raw input text</param>
/// <returns>space-separated stemmed tokens; empty string when no tokens are produced</returns>
public string StemText(string text)
{
    TokenStream stream = Stemmer.TokenStream(String.Empty, new StringReader(text));
    // Fix: the attribute instance was fetched inside the loop on every iteration;
    // it is stable across IncrementToken() calls, so fetch it once.
    TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    // Fix: repeated string concatenation in the loop was O(n^2); build with StringBuilder.
    var result = new System.Text.StringBuilder();
    while (stream.IncrementToken())
    {
        if (result.Length > 0)
        {
            result.Append(' ');
        }
        result.Append(termAttr.Term());
    }
    return result.ToString();
}
/// <summary>
/// Feeds <paramref name="text"/> through <paramref name="analyzer"/> under the
/// "contents" field and collects every emitted term.
/// </summary>
/// <param name="analyzer">analyzer used to tokenize the text</param>
/// <param name="text">raw text to analyze</param>
/// <returns>the analyzed terms, in token order</returns>
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
{
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
    TermAttribute termAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    List<string> terms = new List<string>();
    while (stream.IncrementToken())
    {
        terms.Add(termAttr.Term());
    }

    // Signal end-of-stream and release analyzer resources.
    stream.End();
    stream.Close();
    return terms;
}
/// <summary>
/// Analyzes <paramref name="text"/> for <paramref name="field"/> with the master
/// analyzer and returns the resulting terms in token order.
/// </summary>
/// <param name="field">field name passed to the analyzer</param>
/// <param name="text">raw text to analyze</param>
/// <returns>array of analyzed terms</returns>
private string[] GetAnalyzedText(string field, string text)
{
    var reader = new StringReader(text);
    var tokenStream = _masterAnalyzer.TokenStream(field, reader);
    // NOTE(review): stored into a field rather than a local -- presumably read
    // elsewhere in the class; confirm before localizing.
    _termAtt = (TermAttribute)tokenStream.AddAttribute(typeof(TermAttribute));

    var words = new List<string>();
    while (tokenStream.IncrementToken())
    {
        // Fix: the previous version also accumulated _termAtt.ToString() into a
        // parallel `tokens` list that was never read; that dead work is removed.
        words.Add(_termAtt.Term());
    }
    return words.ToArray();
}
/// <summary>
/// Tokenizes <paramref name="text"/> with <paramref name="analyzer"/> under the
/// "contents" field, writing a buffer/length/term diagnostic line to the console for
/// each token, and returns the collected terms.
/// </summary>
/// <param name="analyzer">analyzer used to tokenize the text</param>
/// <param name="text">raw text to analyze</param>
/// <returns>the analyzed terms, in token order</returns>
public static IEnumerable<string> TokensFromAnalysis(Analyzer analyzer, String text)
{
    TokenStream stream = analyzer.TokenStream("contents", new StringReader(text));
    TermAttribute tokenAttr = (TermAttribute)stream.GetAttribute(typeof(TermAttribute));

    List<string> collected = new List<string>();
    while (stream.IncrementToken())
    {
        // Diagnostic trace of the raw term buffer alongside the materialized term.
        Console.WriteLine("Buffer:={0}, Length:={1}, Term:={2}".FormatWith(tokenAttr.TermBuffer(), tokenAttr.TermLength(), tokenAttr.Term()));
        collected.Add(tokenAttr.Term());
    }

    stream.End();
    stream.Close();
    return collected;
}
/// <summary>
/// Tokenizes <paramref name="content"/> with the Lucene StandardTokenizer pipeline
/// (standard filter, lowercasing, stop-word removal using the configured stop list)
/// and returns the surviving tokens in order.
/// </summary>
/// <param name="content">raw text to tokenize</param>
/// <param name="config">supplies the stop-word list</param>
/// <returns>filtered, lowercased tokens</returns>
static List<string> TokenizeStandard(string content, TokenizeConfig config)
{
    StringReader reader = new StringReader(content);
    TokenStream pipeline = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_24, reader);
    var stophash = StopFilter.MakeStopSet(config.StopWords);
    pipeline = new StandardFilter(pipeline);
    pipeline = new LowerCaseFilter(pipeline);
    pipeline = new StopFilter(true, pipeline, stophash, true);

    // Set up lexicon/invertlexicon, featurevectors, wordappearancecount
    pipeline.Reset();
    TermAttribute termattr = (TermAttribute)pipeline.GetAttribute(typeof(TermAttribute));
    List<string> words = new List<string>();
    while (pipeline.IncrementToken())
    {
        words.Add(termattr.Term());
    }
    return words;
}
/// <summary> Simple similarity query generator.
/// Takes every unique word and forms a boolean query where all words are optional.
/// After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
/// The only caveat is the first hit returned <b>should be</b> your source document - you'll
/// need to then ignore that.
/// <p>
/// So, if you have a code fragment like this:
/// <br>
/// <code>
/// Query q = FormSimilarQuery("I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
/// </code>
/// <p>
/// The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
/// <p>
/// The philosophy behind this method is "two documents are similar if they share lots of words".
/// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
/// higher similarity score if they share more uncommon words.
/// <p>
/// This method is fail-safe in that if a long 'body' is passed in and
/// {@link BooleanQuery#add BooleanQuery.add()} (used internally) throws
/// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses},
/// the query as it is will be returned.
/// </summary>
/// <param name="body">the body of the document you want to find similar documents to</param>
/// <param name="a">the analyzer to use to parse the body</param>
/// <param name="field">the field you want to search on, probably something like "contents" or "body"</param>
/// <param name="stop">optional set of stop words to ignore</param>
/// <returns> a query with all unique words in 'body'</returns>
/// <throws> IOException this can't happen...</throws>
public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
{
    TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    BooleanQuery tmp = new BooleanQuery();
    System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups
    while (ts.IncrementToken())
    {
        String word = termAtt.Term();
        // ignore opt stop words
        if (stop != null && stop.Contains(word))
        {
            continue;
        }
        // ignore dups (fix: dropped the redundant `== true` comparison)
        if (already.Contains(word))
        {
            continue;
        }
        already.Add(word, word);
        // add to query
        TermQuery tq = new TermQuery(new Term(field, word));
        try
        {
            tmp.Add(tq, BooleanClause.Occur.SHOULD);
        }
        catch (BooleanQuery.TooManyClauses)
        {
            // fail-safe, just return what we have, not the end of the world
            break;
        }
    }
    return (tmp);
}
/// <summary>
/// Drives <paramref name="stream"/> to exhaustion, asserting that only tokens carrying
/// the PROPER_NOUN_ANNOTATION payload equal "tokenstream", and that each term matches
/// the expected entry in <c>results</c> at the same index.
/// </summary>
/// <param name="stream">token stream under test; reset before consumption</param>
private static void ConsumeStreamNewAPI(TokenStream stream)
{
    stream.Reset();
    PayloadAttribute payloadAtt = (PayloadAttribute)stream.AddAttribute(typeof(PayloadAttribute));
    TermAttribute termAtt = (TermAttribute)stream.AddAttribute(typeof(TermAttribute));

    int position = 0;
    while (stream.IncrementToken())
    {
        System.String term = termAtt.Term();
        Payload p = payloadAtt.GetPayload();
        bool properNoun = p != null && p.GetData().Length == 1 && p.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;
        if (properNoun)
        {
            Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
        }
        else
        {
            Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
        }
        Assert.AreEqual(results[position], term);
        position++;
    }
}
/// <summary>
/// Expands each analyzed term of <paramref name="f"/>'s query string into fuzzy
/// variants and pushes the best-scoring variant terms onto the global queue <c>q</c>.
/// For every distinct term: enumerate fuzzy matches, keep the top
/// MAX_VARIANTS_PER_TERM by edit-distance score, then rescale each kept score by an
/// IDF factor so the overall ranking favors rarer terms.
/// </summary>
/// <param name="reader">index reader used for term enumeration and doc frequencies</param>
/// <param name="f">bundle of field name, query string and fuzzy settings</param>
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }
    TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
    int corpusNumDocs = reader.NumDocs();
    Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
    Hashtable processedTerms = new Hashtable();
    while (ts.IncrementToken())
    {
        String term = termAtt.Term();
        // Each distinct analyzed term is expanded only once.
        if (!processedTerms.Contains(term))
        {
            processedTerms.Add(term, term);
            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
            float minScore = 0;
            Term startTerm = internSavingTemplateTerm.CreateTerm(term);
            FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
            TermEnum origEnum = reader.Terms(startTerm);
            int df = 0;
            if (startTerm.Equals(origEnum.Term()))
            {
                df = origEnum.DocFreq(); //store the df so all variants use same idf
            }
            int numVariants = 0;
            int totalVariantDocFreqs = 0;
            // Enumerate fuzzy variants; the bounded priority queue keeps only the
            // best MAX_VARIANTS_PER_TERM by similarity score.
            do
            {
                Term possibleMatch = fe.Term();
                if (possibleMatch != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = fe.Difference();
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                        variantsQ.Insert(st);
                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                    }
                }
            }
            while (fe.Next());
            if (numVariants > 0)
            {
                int avgDf = totalVariantDocFreqs / numVariants;
                if (df == 0) //no direct match we can use as df for all variants
                {
                    df = avgDf; //use avg df of all variants
                }
                // take the top variants (scored by edit distance) and reset the score
                // to include an IDF factor then add to the global queue for ranking
                // overall top query terms
                int size = variantsQ.Size();
                for (int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                    st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                    q.Insert(st);
                }
            }
        }
    }
}
/// <summary>
/// Appends a single character to the attribute's current term.
/// Rebuilds the term string on each call; fine for occasional use, not for hot loops.
/// </summary>
/// <param name="termAtt">attribute whose term buffer is extended</param>
/// <param name="ch">character to append</param>
public static void Append(this TermAttribute termAtt, char ch)
{
    // Fix: string + char concatenates directly; the temporary char[]-to-string
    // allocation (`new string(new[] { ch })`) was unnecessary.
    termAtt.SetTermBuffer(termAtt.Term() + ch);
}
/// <summary>
/// Appends <paramref name="str"/> to the attribute's current term.
/// Rebuilds the term string on each call; fine for occasional use, not for hot loops.
/// </summary>
/// <param name="termAtt">attribute whose term buffer is extended</param>
/// <param name="str">text to append</param>
public static void Append(this TermAttribute termAtt, string str)
{
    var extended = termAtt.Term() + str;
    termAtt.SetTermBuffer(extended); // TODO: Not optimal, but works
}
/// <summary>
/// Consumes <paramref name="ts"/> and asserts its full contents: the term for every
/// token, plus (when the corresponding array is non-null) start/end offsets, types and
/// position increments. Also verifies the stream calls clearAttributes() for every
/// token, that it ends exactly after <paramref name="output"/>.Length tokens, and
/// (when <paramref name="finalOffset"/> is set) the end offset after End().
/// Null array arguments skip that per-token check entirely.
/// </summary>
public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int? finalOffset)
{
    Assert.IsNotNull(output);
    // Records whether the stream called clearAttributes() between tokens.
    CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute)ts.AddAttribute(typeof(CheckClearAttributesAttribute));

    Assert.IsTrue(ts.HasAttribute(typeof(TermAttribute)), "has no TermAttribute");
    TermAttribute termAtt = (TermAttribute)ts.GetAttribute(typeof(TermAttribute));

    // Optional attributes are only fetched when the caller supplied expectations.
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(OffsetAttribute)), "has no OffsetAttribute");
        offsetAtt = (OffsetAttribute)ts.GetAttribute(typeof(OffsetAttribute));
    }
    TypeAttribute typeAtt = null;
    if (types != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(TypeAttribute)), "has no TypeAttribute");
        typeAtt = (TypeAttribute)ts.GetAttribute(typeof(TypeAttribute));
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null)
    {
        Assert.IsTrue(ts.HasAttribute(typeof(PositionIncrementAttribute)), "has no PositionIncrementAttribute");
        posIncrAtt = (PositionIncrementAttribute)ts.GetAttribute(typeof(PositionIncrementAttribute));
    }

    ts.Reset();
    for (int i = 0; i < output.Length; i++)
    {
        // extra safety to enforce, that the state is not preserved and also assign bogus values
        ts.ClearAttributes();
        termAtt.SetTermBuffer("bogusTerm");
        if (offsetAtt != null)
        {
            offsetAtt.SetOffset(14584724, 24683243);
        }
        if (typeAtt != null)
        {
            typeAtt.SetType("bogusType");
        }
        if (posIncrAtt != null)
        {
            posIncrAtt.SetPositionIncrement(45987657);
        }

        checkClearAtt.GetAndResetClearCalled(); // reset it, because we called clearAttribute() before
        Assert.IsTrue(ts.IncrementToken(), "token " + i + " does not exist");
        Assert.IsTrue(checkClearAtt.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

        Assert.AreEqual(output[i], termAtt.Term(), "term " + i);
        if (startOffsets != null)
        {
            Assert.AreEqual(startOffsets[i], offsetAtt.StartOffset(), "startOffset " + i);
        }
        if (endOffsets != null)
        {
            Assert.AreEqual(endOffsets[i], offsetAtt.EndOffset(), "endOffset " + i);
        }
        if (types != null)
        {
            Assert.AreEqual(types[i], typeAtt.Type(), "type " + i);
        }
        if (posIncrements != null)
        {
            Assert.AreEqual(posIncrements[i], posIncrAtt.GetPositionIncrement(), "posIncrement " + i);
        }
    }
    // Stream must be exhausted exactly at output.Length tokens.
    Assert.IsFalse(ts.IncrementToken(), "end of stream");
    ts.End();
    if (finalOffset.HasValue)
    {
        Assert.AreEqual(finalOffset, offsetAtt.EndOffset(), "finalOffset ");
    }
    ts.Close();
}