Example #1
0
        /// <summary>
        /// Verifies that a <c>WhitespaceTokenizer</c> reading through an
        /// <c>HTMLStripCharFilter</c> reports token offsets that point into the original
        /// (unstripped) input string.
        /// </summary>
        public void TestOffsetsWithTokenizer()
        {
            const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

            Tokenizer t = new WhitespaceTokenizer(new HTMLStripCharFilter(CharReader.Get(new StringReader(input))));
            OffsetAttribute att = ((OffsetAttribute)t.GetAttribute(typeof(OffsetAttribute)));

            // Every IncrementToken() result is asserted: if the stream ended early the
            // attribute would still hold the previous token's offsets and the checks
            // below would be made against stale data.
            Assert.IsTrue(t.IncrementToken(), "expected token 'test1'");
            Assert.AreEqual(0, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            Assert.IsTrue(t.IncrementToken(), "expected token 'testlink'");
            Assert.AreEqual(20, att.StartOffset());
            Assert.AreEqual(8, att.EndOffset() - att.StartOffset());

            Assert.IsTrue(t.IncrementToken(), "expected first token 'test2'");
            Assert.AreEqual(33, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());

            Assert.IsTrue(t.IncrementToken(), "expected token 'test3'");
            Assert.AreEqual(39, att.StartOffset());
            Assert.AreEqual(5, att.EndOffset() - att.StartOffset());
        }
Example #2
0
        /// <summary>
        /// Verifies that offsets reported by the tokenizer, once passed through
        /// <c>CorrectOffset</c> of the HTML-stripping filter, point at the token
        /// positions in the original input, and that <c>LengthInSource</c> matches the
        /// token lengths.
        /// </summary>
        public void IncrementsOffsetCorrectlyWithAnotherReader2()
        {
            const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

            CharFilter filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
            Tokenizer  t      = new Tokenizer(filter);

            string token;

            // A zero token type is treated as "no more tokens" (same convention as the
            // sibling offset test), so each read is asserted to have produced a token
            // before its offset and length are inspected.
            Assert.True(t.NextToken(out token) != 0, "expected token 'test1'");
            Assert.Equal(0, filter.CorrectOffset(t.Offset));
            Assert.Equal(5, t.LengthInSource);

            Assert.True(t.NextToken(out token) != 0, "expected token 'testlink'");
            Assert.Equal(20, filter.CorrectOffset(t.Offset));
            Assert.Equal(8, t.LengthInSource);

            Assert.True(t.NextToken(out token) != 0, "expected token 'test2'");
            Assert.Equal(33, filter.CorrectOffset(t.Offset));
            Assert.Equal(5, t.LengthInSource);

            Assert.True(t.NextToken(out token) != 0, "expected token 'test3'");
            Assert.Equal(39, filter.CorrectOffset(t.Offset));
            Assert.Equal(5, t.LengthInSource);
        }
Example #3
0
        /// <summary>
        /// Runs the <c>htmlStripReaderTest.html</c> fixture through
        /// <c>HTMLStripCharFilter</c> and checks the stripped text: no literal
        /// <c>&amp;lt;</c> entity remains, the word "forrest"/"Forrest" is gone, and the
        /// trimmed text starts with "Welcome to Solr" and ends with "Foundation.".
        /// </summary>
        public void TestHTML()
        {
            string str;

            // The file-backed reader is disposed as soon as the filtered text has been
            // fully read, so the test does not leave the fixture file open.
            using (var fileReader = new StreamReader(GetTestFile("htmlStripReaderTest.html")))
            {
                var reader  = new HTMLStripCharFilter(CharReader.Get(fileReader));
                var builder = new StringBuilder();
                int ch;

                while ((ch = reader.Read()) != -1)
                {
                    builder.Append((char)ch);
                }
                str = builder.ToString();
            }

            var trimmed = str.Trim();

            Assert.IsTrue(str.IndexOf("&lt;") == -1, "Entity not properly escaped");
            Assert.IsTrue(str.IndexOf("forrest") == -1 && str.IndexOf("Forrest") == -1, "Forrest should have been stripped out");
            Assert.IsTrue(trimmed.StartsWith("Welcome to Solr"), "File should start with 'Welcome to Solr' after trimming");

            Assert.IsTrue(trimmed.EndsWith("Foundation."), "File should end with 'Foundation.' after trimming");
        }
Example #4
0
        /// <summary>
        /// Feeds a deliberately malformed HTML fragment through
        /// <c>HTMLStripCharFilter</c> and compares the complete output with the
        /// expected ("gold") string.
        /// </summary>
        public void TestMalformedHTML()
        {
            const string test = "a <a hr<ef=aa<a>> </close</a>";
            const string gold = "a <a hr<ef=aa > </close ";

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var output = new StringBuilder();

            // Drain the filter one character at a time until it signals end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            var result = output.ToString();
            assertTrue(result + " is not equal to " + gold + "<EOS>", result == gold);
        }
Example #5
0
        /// <summary>
        /// Reads <paramref name="input"/> through <c>HTMLStripCharFilter</c> and, for
        /// every 'X' the filter emits, asserts that the corrected offset of that
        /// character equals the position of the next 'X' in the original string.
        /// </summary>
        /// <param name="input">Text containing 'X' marker characters.</param>
        public void doTestOffsets(String input)
        {
            var filter     = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
            int markerPos  = -1;    // position of the most recently matched 'X' in the input

            // readerPos counts characters emitted by the filter so far.
            for (int readerPos = 0; ; readerPos++)
            {
                int c = filter.Read();
                if (c == -1)
                {
                    break;
                }

                var corrected = filter.CorrectOffset(readerPos);
                if (c != 'X')
                {
                    continue;
                }

                markerPos = input.IndexOf('X', markerPos + 1);
                assertEquals(markerPos, corrected);
            }
        }
Example #6
0
        /// <summary>
        /// Reads <paramref name="test"/> through <c>HTMLStripCharFilter</c> and asserts
        /// that the filter's output is identical to the input.
        /// </summary>
        /// <param name="test">Text to push through the filter; also the expected output.</param>
        /// <param name="assertMsg">Message reported when the output differs.</param>
        private static void processBuffer(String test, String assertMsg)
        {
            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var output = new StringBuilder();

            // Drain the filter until it signals end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            Assert.AreEqual(test, output.ToString(), assertMsg);
        }
Example #7
0
        /// <summary>
        /// Checks that the named entity <c>&amp;Gamma;</c> is emitted by
        /// <c>HTMLStripCharFilter</c> as the single character U+0393.
        /// </summary>
        public void TestGamma()
        {
            const string test = "&Gamma;";
            const string gold = "\u0393";

            // Set passed as the filter's second constructor argument.
            var set = new HashSet <String>();
            set.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), set);
            var output = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            var result = output.ToString();
            Assert.IsTrue(result == gold, result + " is not equal to " + gold + "<EOS>");
        }
Example #8
0
        /// <summary>
        /// Checks the filter's output for a mix of named and numeric character
        /// entities against the expected ("gold") string.
        /// </summary>
        public void TestMoreEntities()
        {
            const string test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
            const string gold = "  <junk/>   ! @ and ’";

            // Set passed as the filter's second constructor argument.
            var set = new HashSet <String>();
            set.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), set);
            var output = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            var result = output.ToString();
            Assert.IsTrue(result == gold, result + " is not equal to " + gold);
        }
Example #9
0
        /// <summary>
        /// Checks that an HTML comment opened with three dashes is stripped, leaving
        /// the expected two-space output.
        /// </summary>
        public void TestComment()
        {
            const string test = "<!--- three dashes, still a valid comment ---> ";
            const string gold = "  ";

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var output = new StringBuilder();

            // Drain the filter until it signals end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            assertTrue(output.ToString() + " is not equal to " + gold + "<EOS>", output.ToString().Equals(gold));
        }
Example #10
0
        /// <summary>
        /// Checks the filter's output for named, decimal and hexadecimal character
        /// entities against the expected ("gold") string.
        /// </summary>
        public void TestEntities()
        {
            const string test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
            const string gold = "  <foo> \u00DCbermensch = \u0393 bar \u0393";

            // Set passed as the filter's second constructor argument.
            var set = new HashSet <String>();
            set.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), set);
            var output = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            var result = output.ToString();
            Assert.IsTrue(result == gold, result + " is not equal to " + gold + "<EOS>");
        }
Example #11
0
        /// <summary>
        /// Checks that tags named in the set passed to the filter ("reserved") survive
        /// at the expected positions in the output while another tag ("other") is
        /// removed.
        /// </summary>
        public void TestReserved()
        {
            const string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";

            var set = new HashSet <String>();
            set.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), set);
            var output = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }
            var result = output.ToString();

            // Each lookup is performed immediately before its own assertion so that an
            // earlier failure is reported before a later search is attempted.
            int opening = result.IndexOf("reserved");
            assertTrue("Escaped tag not preserved: " + opening, opening == 9);

            int closing = result.IndexOf("reserved", 15);
            assertTrue("Escaped tag not preserved: " + closing, closing == 38);

            int selfClosing = result.IndexOf("reserved", 41);
            assertTrue("Escaped tag not preserved: " + selfClosing, selfClosing == 54);

            assertTrue("Other tag should be removed", result.IndexOf("other") == -1);
        }
Example #12
0
        /// <summary>
        /// Tokenizes HTML-stripped input and verifies that every token has the expected
        /// (uncorrected) offset and a source length of 4, and that exactly the expected
        /// number of tokens is produced.
        /// </summary>
        public void IncrementsOffsetCorrectlyWithAnotherReader()
        {
            int[] expectedOffsets = { 0, 5, 10, 15 };
            int   curPos          = 0;

            string    token = string.Empty;
            Tokenizer t     =
                new Tokenizer(
                    new HTMLStripCharFilter(CharReader.Get(new System.IO.StringReader(@"test<a href=""foo"">test</a>test test"))));

            while (true)
            {
                Tokenizer.TokenType token_type = t.NextToken(out token);
                if (token_type == 0)
                {
                    // A zero token type ends the loop (no more tokens).
                    break;
                }

                // Fail with a clear assertion, not an IndexOutOfRangeException, when the
                // tokenizer yields more tokens than the table anticipates.
                Assert.True(curPos < expectedOffsets.Length, "tokenizer produced more tokens than expected");

                Assert.Equal(expectedOffsets[curPos++], t.Offset);
                Assert.Equal(4, t.LengthInSource);
            }

            // Without this check the test would pass even if no token was produced.
            Assert.Equal(expectedOffsets.Length, curPos);
        }
Example #13
0
        /// <summary>
        /// Strips HTML from Hebrew text, comparing the output with the expected
        /// ("gold") string character by character and then as a whole, and finally
        /// runs the offset check on a Hebrew sentence containing 'X' markers.
        /// </summary>
        public void TestHebrewScenarios()
        {
            const string html = "<div class=\"foo\">בדיקה ראשונה</div> וכאן נוסיף גם <a href=\"#bar\">לינק</a> ועכשיו " +
                                "גם <a alt=\"לינק מסובך עם תיאור\" href=\"http://lucene.apache.org/\">לינק מסובך יותר</a>. " +
                                " <!-- הערה אחת ויחידה -->";
            const string gold = " בדיקה ראשונה  וכאן נוסיף גם  לינק  ועכשיו " +
                                "גם  לינק מסובך יותר .   ";

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
            var output = new StringBuilder();
            var index  = 0;

            // Compare as we go so a mismatch reports the exact position and the text
            // produced up to that point.
            for (int c = filter.Read(); c != -1; c = filter.Read(), index++)
            {
                var actual = (char)c;
                output.Append(actual);
                Assert.IsTrue(actual == gold[index], "\"" + actual + "\"" + " at position: " + index + " does not equal: \"" + gold[index] + "\". Buffer so far: " + output + "<EOB>");
            }
            Assert.AreEqual(gold, output.ToString());

            doTestOffsets("שלום X מה X שלומך חבר");
        }
Example #14
0
        /// <summary>
        /// Strips tags, entities and a comment from an HTML fragment, comparing the
        /// output with the expected ("gold") string character by character and then
        /// as a whole.
        /// </summary>
        public void Test()
        {
            const string html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
                                "another <a href=\"http://lucene.apache.org/\">link</a>. " +
                                "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
            const string gold = " this is some text  here is a  link  and " +
                                "another  link . " +
                                "This is an entity: & plus a <.  Here is an &.  ";

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
            var output = new StringBuilder();
            var index  = 0;

            // Compare as we go so a mismatch reports the exact position and the text
            // produced up to that point.
            for (int c = filter.Read(); c != -1; c = filter.Read(), index++)
            {
                var actual = (char)c;
                output.Append(actual);
                Assert.IsTrue(actual == gold[index], "\"" + actual + "\"" + " at position: " + index + " does not equal: \"" + gold[index]
                              + "\". Buffer so far: " + output + "<EOB>");
            }
            Assert.AreEqual(gold, output.ToString());
        }
Example #15
0
 /// <summary>
 /// Easy-use constructor that takes a <see cref="System.IO.TextReader"/>. The reader is
 /// wrapped with <c>CharReader.Get</c> before being passed to the base constructor.
 /// </summary>
 /// <param name="normMap">Character mapping stored in the <c>normMap</c> field.</param>
 /// <param name="in_Renamed">Source reader to wrap.</param>
 public MappingCharFilter(NormalizeCharMap normMap, System.IO.TextReader in_Renamed) : base(CharReader.Get(in_Renamed))
 {
     this.normMap = normMap;
 }
Example #16
0
        /// <summary>
        /// Builds the token stream of the base analyzer over <paramref name="reader"/>
        /// after wrapping it in an <c>HTMLStripCharFilter</c>.
        /// </summary>
        /// <param name="fieldName">Field name forwarded unchanged to the base implementation.</param>
        /// <param name="reader">Source text; wrapped via <c>CharReader.Get</c> and the HTML-stripping filter.</param>
        /// <returns>The token stream returned by the base implementation for the filtered reader.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            return base.TokenStream(fieldName, new HTMLStripCharFilter(CharReader.Get(reader)));
        }
        /// <summary>
        /// Processes one line of <paramref name="document"/>: scans its characters to update the
        /// comment / string / bracket tracking state kept on this instance, then (unless the line
        /// is skipped) rewrites <c>document.Text</c> with the computed indentation prefix.
        /// </summary>
        /// <param name="document">Accessor for the current line (text, line number, read-only flag).</param>
        /// <param name="settings">Indentation options; <c>LeaveEmptyLines</c> and <c>IndentString</c> are read here.</param>
        public void Step(IDocumentAccessor document, IndentationSettings settings)
        {
            var line = document.Text;

            // Completely empty lines are left untouched when the settings ask for it.
            if (settings.LeaveEmptyLines && line.Length == 0)
            {
                return;
            }
            line = line.TrimStart();

            var indent = new StringBuilder();

            // Whitespace-only line: outside a block comment, replace it with the current
            // block's inner indent (plus one-line-block and continuation indents).
            if (line.Length == 0)
            {
                if (this._blockComment)
                {
                    return;
                }
                indent.Append(this._block.InnerIndent);
                indent.Append(settings.IndentString.Repeat(this._block.OneLineBlock));
                if (this._block.Continuation)
                {
                    indent.Append(settings.IndentString);
                }
                // Only write when the text actually changes.
                if (document.Text != indent.ToString())
                {
                    document.Text = indent.ToString();
                }
                return;
            }

            // If TrimEnd() reports true, re-read the line from the document.
            if (document.TrimEnd())
            {
                line = document.Text.TrimStart();
            }

            // Snapshot the state as it was at the start of this line; the indentation
            // decision below is based on it, not on the state after scanning the line.
            var oldBlock       = this._block;
            var startInComment = this._blockComment;

            // Per-line state is reset; _blockComment and _inString carry over between lines.
            this._lineComment = false;
            this._escape      = false;

            this._lastNonCommentChar = '\n';

            var reader = new CharReader(line);

            var cha  = ' ';
            var prev = '\0';
            var next = '\n';

            // Ensures Indent(settings) is applied at most once per line, however many
            // brackets are opened on it.
            var indented = false;

            while (reader.IsRemainChar)
            {
                cha  = reader.Get();
                prev = reader.Backward;
                next = reader.Ahead;

                // Nothing after the start of a line comment is scanned.
                if (this._lineComment)
                {
                    break;
                }
                // The character following a backslash inside a string is skipped.
                if (this._escape)
                {
                    this._escape = false;
                    continue;
                }

                // First pass over the character: comment / string / escape tracking.
                switch (cha)
                {
                case '/':
                    // "*/" closes a block comment.
                    if (this._blockComment && prev == '*')
                    {
                        this._blockComment = false;
                    }
                    if (!this._inString)
                    {
                        // "//" starts a line comment, "/*" starts a block comment.
                        if (!this._blockComment && next == '/')
                        {
                            this._lineComment = true;
                        }
                        if (!this._lineComment && next == '*')
                        {
                            this._blockComment = true;
                        }
                    }
                    break;

                case '"':
                    // NOTE(review): this branch only assigns when _inString is already true, and
                    // _escape is always false at this point (escaped characters are skipped by the
                    // 'continue' above), so the assignment leaves _inString true. Nothing in this
                    // method sets _inString to true either — confirm string tracking is handled
                    // elsewhere or that this is intended.
                    if (!(this._lineComment || this._blockComment))
                    {
                        if (this._inString)
                        {
                            this._inString = !this._escape;
                        }
                    }
                    break;

                case '\\':
                    if (this._inString)
                    {
                        this._escape = true;
                    }
                    break;

                default:
                    break;
                }

                // Inside a comment or string: flush any pending word into LastLiteral and
                // skip the bracket handling below.
                if (this._lineComment || this._blockComment || this._inString)
                {
                    if (this._wordBuilder.Length > 0)
                    {
                        this._block.LastLiteral = this._wordBuilder.ToString();
                    }
                    this._wordBuilder.Length = 0;
                    continue;
                }

                // Accumulate letters/digits into the current word; any other character
                // ends the word and records it as the block's LastLiteral.
                if (char.IsLetterOrDigit(cha))
                {
                    this._wordBuilder.Append(cha);
                }
                else
                {
                    if (this._wordBuilder.Length > 0)
                    {
                        this._block.LastLiteral = this._wordBuilder.ToString();
                    }
                    this._wordBuilder.Length = 0;
                }

                // Second pass over the character: bracket nesting.
                switch (cha)
                {
                case '(':
                case '{':
                case '[':
                    // Push the current block, then continue with _block as the new (nested) block.
                    this._block.ResetOneLineBlock();
                    this._blocks.Push(this._block);
                    this._block.StartLine = document.LineNumber;
                    if (!indented)
                    {
                        this._block.Indent(settings);
                        indented = true;
                    }
                    this._block.Bracket = cha;
                    break;

                case ')':
                case '}':
                case ']':
                    // Unwind the stack until the block opened by the matching bracket is current
                    // (or the stack runs out), then pop once more to return to its parent.
                    var openBracket = StringChecker.GetOpenBracket(cha);
                    while (this._block.Bracket != openBracket)
                    {
                        if (this._blocks.Count == 0)
                        {
                            break;
                        }
                        this._block = this._blocks.Pop();
                    }
                    if (this._blocks.Count == 0)
                    {
                        break;
                    }
                    this._block = this._blocks.Pop();
                    this._block.Continuation = false;
                    this._block.ResetOneLineBlock();
                    break;
                }

                if (!char.IsWhiteSpace(cha))
                {
                    this._lastNonCommentChar = cha;
                }
            }

            // Flush a word that runs to the end of the line.
            if (this._wordBuilder.Length > 0)
            {
                this._block.LastLiteral = this._wordBuilder.ToString();
            }
            this._wordBuilder.Length = 0;

            // Lines left as they are: block-comment continuation lines not starting with '*',
            // lines starting with "//" followed by a tab, and lines that are exactly "//".
            if ((startInComment && line[0] != '*') ||
                document.Text.StartsWith("//\t", StringComparison.Ordinal) ||
                (document.Text == "//"))
            {
                return;
            }

            // A line that starts with a closing bracket takes the outer indent of the block
            // that was current at the start of the line; every other line takes its inner indent.
            if ("]})".Contains(line[0]))
            {
                indent.Append(oldBlock.OuterIndent);
                oldBlock.ResetOneLineBlock();
                oldBlock.Continuation = false;
            }
            else
            {
                indent.Append(oldBlock.InnerIndent);
            }

            // Read-only lines are never rewritten. Instead, when the conditions below hold,
            // the line's existing leading whitespace is adopted as the current block's InnerIndent.
            if (document.IsReadOnly)
            {
                if (!oldBlock.Continuation && oldBlock.OneLineBlock == 0 &&
                    oldBlock.StartLine == this._block.StartLine &&
                    this._block.StartLine < document.LineNumber && this._lastNonCommentChar != ':')
                {
                    indent.Length = 0;
                    line          = document.Text;
                    // Copy the leading whitespace run of the raw line.
                    for (int i = 0; i < line.Length; ++i)
                    {
                        if (!char.IsWhiteSpace(line[i]))
                        {
                            break;
                        }
                        indent.Append(line[i]);
                    }

                    // Drop one trailing space for lines that began inside a block comment
                    // (the counterpart of the single space appended for such lines below).
                    if (startInComment && indent.Length > 0 && indent[indent.Length - 1] == ' ')
                    {
                        indent.Length -= 1;
                    }
                    this._block.InnerIndent = indent.ToString();
                }
                return;
            }

            // Block-comment continuation lines get one extra space of indentation.
            if (startInComment)
            {
                indent.Append(' ');
            }

            // Rewrite the line only if its current leading whitespace differs from the
            // computed indent.
            if (indent.Length != (document.Text.Length - line.Length) ||
                !document.Text.StartsWith(indent.ToString(), StringComparison.Ordinal) ||
                char.IsWhiteSpace(document.Text[indent.Length]))
            {
                document.Text = indent.ToString() + line;
            }
        }