/// <summary>
/// Runs <paramref name="document"/> through <c>LuceneService.NGrammer</c> and
/// returns the text of every token the resulting stream emits, in emission order.
/// </summary>
/// <param name="TextFieldKey">Field name passed to the analyzer when requesting the token stream.</param>
/// <param name="document">Text to tokenize.</param>
/// <returns>A new list holding one string per emitted token.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>LuceneService.NGrammer</c> is null.
/// </exception>
public List <string> GetNGrams(string TextFieldKey, string document)
        {
            // Read the shared analyzer once so the null check and the use see the same instance.
            var ngrammer = LuceneService.NGrammer;
            if (ngrammer == null)
            {
                throw new InvalidOperationException("No ngrammer");
            }

            List <string> ngrams = new List <string>();

            // 'using' guarantees the stream is disposed even when tokenization throws part-way through.
            using (TokenStream stream = ngrammer.GetTokenStream(TextFieldKey, new StringReader(document)))
            {
                var charTermAttribute = stream.AddAttribute <ICharTermAttribute>();
                stream.Reset();

                while (stream.IncrementToken())
                {
                    ngrams.Add(charTermAttribute.ToString());
                }

                // End-of-stream is signalled only after the last token has been consumed.
                stream.End();
            }

            return ngrams;
        }
Esempio n. 2
0
        /// <summary>
        /// Tokenizes <paramref name="document"/> with <c>LuceneService.Analyzer</c> (using
        /// <c>ProjectInfo.TextFieldKey</c> as the field name) and collects the length of each token's text.
        /// </summary>
        /// <param name="document">Text to tokenize.</param>
        /// <returns>
        /// A dictionary with exactly one entry: the key is the number of tokens emitted and the
        /// value is the list of token text lengths in emission order.
        /// </returns>
        public static Dictionary <int, List <int> > GetTokenDataForDoc(string document)
        {
            List <int> lengths    = new List <int>();
            var        tokenCount = 0;

            // 'using' guarantees the stream is disposed even when tokenization throws part-way through.
            using (TokenStream stream = LuceneService.Analyzer.GetTokenStream(ProjectInfo.TextFieldKey, new StringReader(document)))
            {
                var charTermAttribute = stream.AddAttribute <ICharTermAttribute>();

                stream.Reset();

                while (stream.IncrementToken())
                {
                    String term = charTermAttribute.ToString();
                    tokenCount++;
                    lengths.Add(term.Length);
                }

                // End-of-stream is signalled only after the last token has been consumed.
                stream.End();
            }

            Dictionary <int, List <int> > res = new Dictionary <int, List <int> >();
            res.Add(tokenCount, lengths);
            return res;
        }
        /// <summary>
        /// Consumes <paramref name="ts"/> and asserts that it yields exactly
        /// <c>output.Length</c> tokens matching the expected values, then ends and closes the stream.
        /// </summary>
        /// <param name="ts">Stream under test; it is reset, consumed, ended and closed here.</param>
        /// <param name="output">Expected term text per token; must not be null.</param>
        /// <param name="startOffsets">Expected start offset per token, or null to skip the check.</param>
        /// <param name="endOffsets">Expected end offset per token, or null to skip the check.</param>
        /// <param name="types">Expected type per token, or null to skip the check.</param>
        /// <param name="posIncrements">Expected position increment per token, or null to skip the check.</param>
        /// <param name="finalOffset">Expected end offset after <c>End()</c>, or null to skip the check.</param>
        public static void AssertTokenStreamContents(TokenStream ts, String[] output, Int32[] startOffsets, Int32[] endOffsets, String[] types, Int32[] posIncrements, Int32? finalOffset) {
            Assert.IsNotNull(output);
            var clearTracker = (CheckClearAttributesAttribute) ts.AddAttribute(typeof (CheckClearAttributesAttribute));

            Assert.IsTrue(ts.HasAttribute(typeof (TermAttribute)), "has no TermAttribute");
            var termAttribute = (TermAttribute) ts.GetAttribute(typeof (TermAttribute));

            // Each optional attribute is looked up only when a check that needs it was requested.
            bool offsetsRequested = startOffsets != null || endOffsets != null || finalOffset.HasValue;
            OffsetAttribute offsetAttribute = null;
            if (offsetsRequested) {
                Assert.IsTrue(ts.HasAttribute(typeof (OffsetAttribute)), "has no OffsetAttribute");
                offsetAttribute = (OffsetAttribute) ts.GetAttribute(typeof (OffsetAttribute));
            }

            TypeAttribute typeAttribute = null;
            if (types != null) {
                Assert.IsTrue(ts.HasAttribute(typeof (TypeAttribute)), "has no TypeAttribute");
                typeAttribute = (TypeAttribute) ts.GetAttribute(typeof (TypeAttribute));
            }

            PositionIncrementAttribute positionAttribute = null;
            if (posIncrements != null) {
                Assert.IsTrue(ts.HasAttribute(typeof (PositionIncrementAttribute)), "has no PositionIncrementAttribute");
                positionAttribute = (PositionIncrementAttribute) ts.GetAttribute(typeof (PositionIncrementAttribute));
            }

            ts.Reset();
            for (int tokenIndex = 0; tokenIndex < output.Length; tokenIndex++) {
                // Poison every attribute before advancing so that state carried over
                // from the previous token cannot satisfy the assertions below.
                ts.ClearAttributes();
                termAttribute.SetTermBuffer("bogusTerm");
                if (offsetAttribute != null) {
                    offsetAttribute.SetOffset(14584724, 24683243);
                }
                if (typeAttribute != null) {
                    typeAttribute.SetType("bogusType");
                }
                if (positionAttribute != null) {
                    positionAttribute.SetPositionIncrement(45987657);
                }

                // Discard the flag set by our own ClearAttributes() call above.
                clearTracker.GetAndResetClearCalled();
                Assert.IsTrue(ts.IncrementToken(), "token " + tokenIndex + " does not exist");
                Assert.IsTrue(clearTracker.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[tokenIndex], termAttribute.Term(), "term " + tokenIndex);
                if (startOffsets != null) {
                    Assert.AreEqual(startOffsets[tokenIndex], offsetAttribute.StartOffset(), "startOffset " + tokenIndex);
                }
                if (endOffsets != null) {
                    Assert.AreEqual(endOffsets[tokenIndex], offsetAttribute.EndOffset(), "endOffset " + tokenIndex);
                }
                if (types != null) {
                    Assert.AreEqual(types[tokenIndex], typeAttribute.Type(), "type " + tokenIndex);
                }
                if (posIncrements != null) {
                    Assert.AreEqual(posIncrements[tokenIndex], positionAttribute.GetPositionIncrement(), "posIncrement " + tokenIndex);
                }
            }

            // No further tokens may follow the expected ones.
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset != null) {
                Assert.AreEqual(finalOffset.Value, offsetAttribute.EndOffset(), "finalOffset ");
            }
            ts.Close();
        }
Esempio n. 4
0
        /// <summary>
        /// Drains a token stream into a single string. A space is inserted before a
        /// term whenever the text built so far is non-empty. The stream is disposed
        /// before returning.
        /// TODO: rewrite tests not to use string comparison.
        /// </summary>
        /// <param name="in">Token stream to consume.</param>
        /// <returns>The concatenated term text.</returns>
        private static string tsToString(TokenStream @in)
        {
            ICharTermAttribute term   = @in.AddAttribute <ICharTermAttribute>();
            StringBuilder      joined = new StringBuilder();

            // Poison the term before the first token so stale state cannot leak
            // into the result.
            @in.ClearAttributes();
            term.SetEmpty().Append("bogusTerm");
            @in.Reset();

            while (@in.IncrementToken())
            {
                string current = term.ToString();
                if (joined.Length != 0)
                {
                    joined.Append(' ');
                }
                joined.Append(current);

                // Poison again so the next token must overwrite the term itself.
                @in.ClearAttributes();
                term.SetEmpty().Append("bogusTerm");
            }

            @in.Dispose();
            return joined.ToString();
        }
        /// <summary>
        /// Consumes <paramref name="ts"/> and asserts that it yields exactly
        /// <c>output.Length</c> tokens matching the expected values, then ends and closes the stream.
        /// </summary>
        /// <param name="ts">Stream under test; it is reset, consumed, ended and closed here.</param>
        /// <param name="output">Expected term text per token; must not be null.</param>
        /// <param name="startOffsets">Expected start offset per token, or null to skip the check.</param>
        /// <param name="endOffsets">Expected end offset per token, or null to skip the check.</param>
        /// <param name="types">Expected type per token, or null to skip the check.</param>
        /// <param name="posIncrements">Expected position increment per token, or null to skip the check.</param>
        /// <param name="finalOffset">Expected end offset after <c>End()</c>, or null to skip the check.</param>
        public static void AssertTokenStreamContents(TokenStream ts, System.String[] output, int[] startOffsets, int[] endOffsets, System.String[] types, int[] posIncrements, int?finalOffset)
        {
            Assert.IsNotNull(output);
            ICheckClearAttributesAttribute clearTracker = ts.AddAttribute <ICheckClearAttributesAttribute>();

            Assert.IsTrue(ts.HasAttribute <ITermAttribute>(), "has no TermAttribute");
            ITermAttribute termAttribute = ts.GetAttribute <ITermAttribute>();

            // Each optional attribute is looked up only when a check that needs it was requested.
            bool offsetsRequested = startOffsets != null || endOffsets != null || finalOffset.HasValue;
            IOffsetAttribute offsetAttribute = null;
            if (offsetsRequested)
            {
                Assert.IsTrue(ts.HasAttribute <IOffsetAttribute>(), "has no OffsetAttribute");
                offsetAttribute = ts.GetAttribute <IOffsetAttribute>();
            }

            ITypeAttribute typeAttribute = null;
            if (types != null)
            {
                Assert.IsTrue(ts.HasAttribute <ITypeAttribute>(), "has no TypeAttribute");
                typeAttribute = ts.GetAttribute <ITypeAttribute>();
            }

            IPositionIncrementAttribute positionAttribute = null;
            if (posIncrements != null)
            {
                Assert.IsTrue(ts.HasAttribute <IPositionIncrementAttribute>(), "has no PositionIncrementAttribute");
                positionAttribute = ts.GetAttribute <IPositionIncrementAttribute>();
            }

            ts.Reset();
            for (int tokenIndex = 0; tokenIndex < output.Length; tokenIndex++)
            {
                // Poison every attribute before advancing so that state carried over
                // from the previous token cannot satisfy the assertions below.
                ts.ClearAttributes();
                termAttribute.SetTermBuffer("bogusTerm");
                if (offsetAttribute != null)
                {
                    offsetAttribute.SetOffset(14584724, 24683243);
                }
                if (typeAttribute != null)
                {
                    typeAttribute.Type = "bogusType";
                }
                if (positionAttribute != null)
                {
                    positionAttribute.PositionIncrement = 45987657;
                }

                // Discard the flag set by our own ClearAttributes() call above.
                clearTracker.GetAndResetClearCalled();
                Assert.IsTrue(ts.IncrementToken(), "token " + tokenIndex + " does not exist");
                Assert.IsTrue(clearTracker.GetAndResetClearCalled(), "clearAttributes() was not called correctly in TokenStream chain");

                Assert.AreEqual(output[tokenIndex], termAttribute.Term, "term " + tokenIndex);
                if (startOffsets != null)
                {
                    Assert.AreEqual(startOffsets[tokenIndex], offsetAttribute.StartOffset, "startOffset " + tokenIndex);
                }
                if (endOffsets != null)
                {
                    Assert.AreEqual(endOffsets[tokenIndex], offsetAttribute.EndOffset, "endOffset " + tokenIndex);
                }
                if (types != null)
                {
                    Assert.AreEqual(types[tokenIndex], typeAttribute.Type, "type " + tokenIndex);
                }
                if (posIncrements != null)
                {
                    Assert.AreEqual(posIncrements[tokenIndex], positionAttribute.PositionIncrement, "posIncrement " + tokenIndex);
                }
            }

            // No further tokens may follow the expected ones.
            Assert.IsFalse(ts.IncrementToken(), "end of stream");
            ts.End();
            if (finalOffset != null)
            {
                Assert.AreEqual(finalOffset, offsetAttribute.EndOffset, "finalOffset ");
            }
            ts.Close();
        }
        /// <summary>
        /// Reads every token from the stream and concatenates the term text,
        /// putting a space in front of a term whenever the accumulated text is
        /// non-empty. The stream is disposed before the result is returned.
        /// TODO: rewrite tests not to use string comparison.
        /// </summary>
        /// <param name="in">Token stream to consume.</param>
        /// <returns>The concatenated term text.</returns>
        private static string tsToString(TokenStream @in)
        {
            ICharTermAttribute termText = @in.AddAttribute<ICharTermAttribute>();
            StringBuilder text = new StringBuilder();

            // Start from a poisoned term so leftover state would show up in the output.
            @in.ClearAttributes();
            termText.SetEmpty().Append("bogusTerm");
            @in.Reset();

            while (true)
            {
                if (!@in.IncrementToken())
                {
                    break;
                }

                if (text.Length != 0)
                {
                    text.Append(' ');
                }
                text.Append(termText.ToString());

                // Re-poison so the next token has to set the term itself.
                @in.ClearAttributes();
                termText.SetEmpty().Append("bogusTerm");
            }

            @in.Dispose();
            return text.ToString();
        }