//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
 //ORIGINAL LINE: public static void assertHTMLStripsTo(java.io.Reader input, String gold, java.util.Set<String> escapedTags) throws Exception
 /// <summary>
 /// Reads <paramref name="input"/> through an <c>HTMLStripCharFilter</c> one character at a
 /// time and asserts that the collected output equals <paramref name="gold"/>.
 /// </summary>
 /// <param name="input">Reader supplying the text to run through the filter.</param>
 /// <param name="gold">Expected filter output.</param>
 /// <param name="escapedTags">
 /// Forwarded to the two-argument filter constructor; when null the one-argument
 /// constructor is used instead.
 /// </param>
 public static void assertHTMLStripsTo(Reader input, string gold, ISet<string> escapedTags)
 {
     HTMLStripCharFilter reader;
     if (null == escapedTags)
     {
       reader = new HTMLStripCharFilter(input);
     }
     else
     {
       reader = new HTMLStripCharFilter(input, escapedTags);
     }
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try
     {
       while ((ch = reader.read()) != -1)
       {
         builder.Append((char)ch);
       }
     }
     catch (Exception e)
     {
       if (gold.Equals(builder.ToString()))
       {
         // Bare rethrow keeps the original stack trace; "throw e" would reset it.
         throw;
       }
       // Output read so far differs from gold: report the mismatch, keeping the cause attached.
       throw new Exception("('" + builder.ToString() + "' is not equal to '" + gold + "').  " + e.Message, e);
     }
     assertEquals("'" + builder.ToString() + "' is not equal to '" + gold + "'", gold, builder.ToString());
 }
Exemple #2
0
        /// <summary>
        /// Drains <paramref name="input"/> through an <see cref="HTMLStripCharFilter"/> and asserts
        /// that the characters read equal <paramref name="gold"/>.
        /// </summary>
        /// <param name="input">Source text to run through the filter.</param>
        /// <param name="gold">Expected filter output.</param>
        /// <param name="escapedTags">
        /// Forwarded to the two-argument filter constructor; when null the single-argument
        /// constructor is used instead.
        /// </param>
        public static void AssertHTMLStripsTo(TextReader input, string gold, ISet <string> escapedTags)
        {
            HTMLStripCharFilter filter = (escapedTags == null)
                ? new HTMLStripCharFilter(input)
                : new HTMLStripCharFilter(input, escapedTags);
            StringBuilder stripped = new StringBuilder();

            try
            {
                // Reading stops on the first non-positive value returned by Read().
                for (int c = filter.Read(); c > 0; c = filter.Read())
                {
                    stripped.Append((char)c);
                }
            }
            catch (Exception e)
            {
                if (!gold.Equals(stripped.ToString(), StringComparison.Ordinal))
                {
                    // Output had already diverged from gold: report that, keeping the cause attached.
                    throw new Exception("('" + stripped.ToString() + "' is not equal to '" + gold + "').  " + e.Message, e);
                }
                throw; // output matched so far: surface the original failure with its stack intact
            }

            string actual = stripped.ToString();
            assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
        }
Exemple #3
0
        /// <summary>
        /// Tokenizes a small HTML snippet through an <c>HTMLStripCharFilter</c> and checks, for each
        /// of the first four tokens, the filter-corrected offset and the token's length in the source.
        /// </summary>
        public void IncrementsOffsetCorrectlyWithAnotherReader2()
        {
            const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

            CharFilter filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
            Tokenizer  t      = new Tokenizer(filter);

            // Expected values per token, in reading order.
            int[] expectedOffsets = { 0, 20, 33, 39 };
            int[] expectedLengths = { 5, 8, 5, 5 };

            for (int i = 0; i < expectedOffsets.Length; i++)
            {
                // The token text itself is not inspected; only offset and length are checked.
                string token;
                t.NextToken(out token);

                Assert.Equal(expectedOffsets[i], filter.CorrectOffset(t.Offset));
                Assert.Equal(expectedLengths[i], t.LengthInSource);
            }
        }
Exemple #4
0
        /// <summary>
        /// Reads <paramref name="input"/> through an <see cref="HTMLStripCharFilter"/> one character
        /// at a time and asserts that the collected output equals <paramref name="gold"/>.
        /// </summary>
        /// <param name="input">Source text to run through the filter.</param>
        /// <param name="gold">Expected filter output.</param>
        /// <param name="escapedTags">
        /// Forwarded to the two-argument filter constructor; when null the single-argument
        /// constructor is used instead.
        /// </param>
        public static void AssertHTMLStripsTo(TextReader input, string gold, ISet <string> escapedTags)
        {
            HTMLStripCharFilter reader;

            if (null == escapedTags)
            {
                reader = new HTMLStripCharFilter(input);
            }
            else
            {
                reader = new HTMLStripCharFilter(input, escapedTags);
            }
            int           ch      = 0;
            StringBuilder builder = new StringBuilder();

            try
            {
                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }
            catch (Exception e)
            {
                if (gold.Equals(builder.ToString()))
                {
                    // Bare rethrow preserves the original stack trace; "throw e" would reset it (CA2200).
                    throw;
                }
                // Output read so far differs from gold: report the mismatch, keeping the cause attached.
                throw new Exception("('" + builder.ToString() + "' is not equal to '" + gold + "').  " + e.Message, e);
            }
            assertEquals("'" + builder.ToString() + "' is not equal to '" + gold + "'", gold, builder.ToString());
        }
Exemple #5
0
        /// <summary>
        /// Feeds the filter inputs longer than <c>HTMLStripCharFilter.InitialBufferSize</c>:
        /// text after an empty processing instruction, an oversized comment, an oversized
        /// processing instruction and an oversized tag, asserting the expected output of each.
        /// </summary>
        public virtual void TestBufferOverflow()
        {
            StringBuilder html = new StringBuilder(HTMLStripCharFilter.InitialBufferSize + 50);

            // Case 1: the input is expected to come back unchanged.
            html.Append("ah<?> ??????");
            AppendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            var        utf8    = html.ToString().GetBytes(Encoding.UTF8);
            TextReader wrapped = new HTMLStripCharFilter(new StreamReader(new MemoryStream(utf8)));
            AssertHTMLStripsTo(wrapped, html.ToString(), null);

            // Case 2: an oversized comment followed by "foo".
            html.Length = 0;
            html.Append("<!--");                                                //comments
            AppendChars(html, 3 * HTMLStripCharFilter.InitialBufferSize + 500); //comments have two lookaheads
            html.Append("-->foo");
            AssertHTMLStripsTo(html.ToString(), "foo", null);

            // Case 3: an oversized processing instruction.
            html.Length = 0;
            html.Append("<?");
            AppendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("?>");
            AssertHTMLStripsTo(html.ToString(), "", null);

            // Case 4: an oversized self-closing tag.
            html.Length = 0;
            html.Append("<b ");
            AppendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("/>");
            AssertHTMLStripsTo(html.ToString(), "", null);
        }
Exemple #6
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void testBufferOverflow() throws Exception
        /// <summary>
        /// Feeds the filter inputs longer than <c>HTMLStripCharFilter.InitialBufferSize</c>:
        /// text after an empty processing instruction, an oversized comment, an oversized
        /// processing instruction and an oversized tag, asserting the expected output of each.
        /// </summary>
        public virtual void testBufferOverflow()
        {
            StringBuilder html = new StringBuilder(HTMLStripCharFilter.InitialBufferSize + 50);

            // Case 1: the input is expected to come back unchanged.
            html.Append("ah<?> ??????");
            appendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            var    source = new StringReader(html.ToString());
            Reader reader = new HTMLStripCharFilter(new System.IO.StreamReader(source));
            assertHTMLStripsTo(reader, html.ToString(), null);

            // Case 2: an oversized comment followed by "foo".
            html.Length = 0;
            html.Append("<!--");                                                //comments
            appendChars(html, 3 * HTMLStripCharFilter.InitialBufferSize + 500); //comments have two lookaheads
            html.Append("-->foo");
            assertHTMLStripsTo(html.ToString(), "foo", null);

            // Case 3: an oversized processing instruction.
            html.Length = 0;
            html.Append("<?");
            appendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("?>");
            assertHTMLStripsTo(html.ToString(), "", null);

            // Case 4: an oversized self-closing tag.
            html.Length = 0;
            html.Append("<b ");
            appendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("/>");
            assertHTMLStripsTo(html.ToString(), "", null);
        }
	  /// <summary>
	  /// Wraps <paramref name="input"/> in an <c>HTMLStripCharFilter</c>, passing
	  /// <c>escapedTags</c> to the constructor only when it is non-null.
	  /// </summary>
	  /// <param name="input">Reader to wrap.</param>
	  /// <returns>The newly constructed filter.</returns>
	  public override HTMLStripCharFilter create(Reader input)
	  {
		return (escapedTags == null)
		  ? new HTMLStripCharFilter(input)
		  : new HTMLStripCharFilter(input, escapedTags);
	  }
Exemple #8
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: static void assertLegalOffsets(String in) throws Exception
        /// <summary>
        /// Reads <paramref name="in"/> through an <c>HTMLStripCharFilter</c> and asserts that the
        /// corrected offset of every output position is no greater than the input length.
        /// </summary>
        /// <param name="in">Document text to run through the filter.</param>
        internal static void assertLegalOffsets(string @in)
        {
            int docLength = @in.Length;
            HTMLStripCharFilter filter = new HTMLStripCharFilter(new System.IO.StreamReader(new StringReader(@in)));

            // One output position per character read; the character value itself is not needed.
            for (int position = 0; filter.read() != -1; position++)
            {
                int corrected = filter.correctOffset(position);
                assertTrue("invalid offset correction: " + position + "->" + corrected + " for doc of length: " + docLength, corrected <= docLength);
            }
        }
        /// <summary>
        /// Reads <paramref name="in"/> (round-tripped through UTF-8) through an
        /// <see cref="HTMLStripCharFilter"/> and asserts that the corrected offset of every output
        /// position is no greater than the input string's length.
        /// </summary>
        /// <param name="in">Document text to run through the filter.</param>
        internal static void AssertLegalOffsets(string @in)
        {
            int docLength = @in.Length;
            var utf8Source = new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)));
            HTMLStripCharFilter filter = new HTMLStripCharFilter(utf8Source);

            // One output position per character read; reading stops on the first non-positive value.
            for (int position = 0; filter.Read() > 0; position++)
            {
                int corrected = filter.CorrectOffset(position);
                assertTrue("invalid offset correction: " + position + "->" + corrected + " for doc of length: " + docLength, corrected <= docLength);
            }
        }
Exemple #10
0
        /// <summary>
        /// Wraps <paramref name="input"/> in an <see cref="HTMLStripCharFilter"/>, passing
        /// <c>escapedTags</c> to the constructor only when it is non-null.
        /// </summary>
        /// <param name="input">Reader to wrap.</param>
        /// <returns>The newly constructed filter.</returns>
        public override TextReader Create(TextReader input)
        {
            if (escapedTags != null)
            {
                return new HTMLStripCharFilter(input, escapedTags);
            }

            return new HTMLStripCharFilter(input);
        }
Exemple #11
0
        /// <summary>
        /// Runs <paramref name="html"/> through an <see cref="HTMLStripCharFilter"/> and returns
        /// everything the filter produces as a single string.
        /// </summary>
        /// <param name="html">Markup to strip.</param>
        /// <returns>The concatenated filter output.</returns>
        private string HtmlToPlain(string html)
        {
            using (TextReader stripper = new HTMLStripCharFilter(new StringReader(html)))
            {
                var plain  = new StringBuilder();
                var buffer = new char[1024];

                while (true)
                {
                    int read = stripper.Read(buffer, 0, buffer.Length);
                    if (read <= 0)
                    {
                        break; // nothing more to read
                    }
                    plain.Append(buffer, 0, read);
                }

                return plain.ToString();
            }
        }
        /// <summary>
        /// Strips the "MS-Word 14 generated.htm" test resource and asserts that the trimmed
        /// output equals "This is a test".
        /// </summary>
        public virtual void TestMSWord14GeneratedHTML()
        {
            string        gold    = "This is a test";
            StringBuilder builder = new StringBuilder();

            // Dispose the resource stream and the filter even when reading throws.
            using (System.IO.Stream stream = this.GetType().getResourceAsStream("MS-Word 14 generated.htm"))
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
            {
                int ch = 0;

                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }
            // Compare trim()'d output to gold
            assertEquals("'" + builder.ToString().Trim() + "' is not equal to '" + gold + "'", gold, builder.ToString().Trim());
        }
Exemple #13
0
        /// <summary>
        /// Builds a random space-separated document (arbitrary Unicode, realistic Unicode, or
        /// simple strings, chosen at random) and reads it to the end through an
        /// <see cref="HTMLStripCharFilter"/>. There are no assertions; the read just has to complete.
        /// </summary>
        public virtual void TestRandomText()
        {
            const int minNumWords   = 10;
            const int maxNumWords   = 10000;
            const int minWordLength = 3;
            const int maxWordLength = 20;

            StringBuilder text = new StringBuilder();

            // Draw order matters for reproducibility: word count first, then the generator choice.
            int numWords  = TestUtil.NextInt32(Random, minNumWords, maxNumWords);
            int generator = TestUtil.NextInt32(Random, 0, 4);

            if (generator == 0)
            {
                for (int i = 0; i < numWords; ++i)
                {
                    text.Append(TestUtil.RandomUnicodeString(Random, maxWordLength));
                    text.Append(' ');
                }
            }
            else if (generator == 1)
            {
                for (int i = 0; i < numWords; ++i)
                {
                    text.Append(TestUtil.RandomRealisticUnicodeString(Random, minWordLength, maxWordLength));
                    text.Append(' ');
                }
            }
            else
            {
                // Every other value of the selector falls through to simple strings.
                for (int i = 0; i < numWords; ++i)
                {
                    text.Append(TestUtil.RandomSimpleString(Random));
                    text.Append(' ');
                }
            }

            TextReader reader = new HTMLStripCharFilter(new StringReader(text.ToString()));
            int        ch;

            do
            {
                ch = reader.Read();
            }
            while (ch > 0);
        }
Exemple #14
0
        /// <summary>
        /// Reads a small HTML document through an <c>HTMLStripCharFilter</c> in 1024-character
        /// chunks and compares the concatenated output with an expected string.
        /// </summary>
        public void Reader()
        {
            const string markup = "<html>test1 test2</html>";

            HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(markup));
            StringBuilder       output = new StringBuilder();
            char[]              buffer = new char[1024];

            for (int read = filter.Read(buffer, 0, buffer.Length); read > 0; read = filter.Read(buffer, 0, buffer.Length))
            {
                output.Append(buffer, 0, read);
            }

            // NOTE(review): the expected value "test" does not obviously match the input text
            // ("test1 test2") — confirm this is the intended expectation.
            Assert.Equal("test", output.ToString());
        }
        /// <summary>
        /// Strips the "htmlStripReaderTest.html" test resource and checks that the output has no
        /// "&amp;lt;" entity and no "forrest"/"Forrest", and that (after trimming) it starts with
        /// "Welcome to Solr" and ends with "Foundation.".
        /// </summary>
        public virtual void TestHTML()
        {
            StringBuilder builder = new StringBuilder();

            // Dispose the resource stream and the filter even when reading throws.
            using (System.IO.Stream stream = this.GetType().getResourceAsStream("htmlStripReaderTest.html"))
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
            {
                int ch = -1;
                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }
            string str = builder.ToString();
            assertTrue("Entity not properly escaped", str.IndexOf("&lt;", StringComparison.Ordinal) == -1); //there is one > in the text
            assertTrue("Forrest should have been stripped out", str.IndexOf("forrest", StringComparison.Ordinal) == -1 && str.IndexOf("Forrest", StringComparison.Ordinal) == -1);
            assertTrue("File should start with 'Welcome to Solr' after trimming", str.Trim().StartsWith("Welcome to Solr", StringComparison.Ordinal));

            // Message corrected: this check is about how the text ends, not how it starts.
            assertTrue("File should end with 'Foundation.' after trimming", str.Trim().EndsWith("Foundation.", StringComparison.Ordinal));

        }
Exemple #16
0
        /// <summary>
        /// Strips the "htmlStripReaderTest.html" test file and checks that the output has no
        /// "&amp;lt;" entity and no "forrest"/"Forrest", and that (after trimming) it starts with
        /// "Welcome to Solr" and ends with "Foundation.".
        /// </summary>
        public void TestHTML()
        {
            var reader  = new HTMLStripCharFilter(CharReader.Get(new StreamReader(GetTestFile("htmlStripReaderTest.html"))));
            var builder = new StringBuilder();
            var ch      = -1;

            while ((ch = reader.Read()) != -1)
            {
                builder.Append((char)ch);
            }
            var str     = builder.ToString();
            var trimmed = str.Trim();

            // Ordinal comparisons: these are exact-text checks and must not vary with the current culture.
            Assert.IsTrue(str.IndexOf("&lt;", System.StringComparison.Ordinal) == -1, "Entity not properly escaped");            //there is one > in the text
            Assert.IsTrue(str.IndexOf("forrest", System.StringComparison.Ordinal) == -1 && str.IndexOf("Forrest", System.StringComparison.Ordinal) == -1, "Forrest should have been stripped out");
            Assert.IsTrue(trimmed.StartsWith("Welcome to Solr", System.StringComparison.Ordinal), "File should start with 'Welcome to Solr' after trimming");

            // Message corrected: this check is about how the text ends, not how it starts.
            Assert.IsTrue(trimmed.EndsWith("Foundation.", System.StringComparison.Ordinal), "File should end with 'Foundation.' after trimming");
        }
Exemple #17
0
        /// <summary>
        /// Runs a malformed HTML fragment through an <c>HTMLStripCharFilter</c> and asserts the
        /// exact expected output.
        /// </summary>
        public void TestMalformedHTML()
        {
            const string test = "a <a hr<ef=aa<a>> </close</a>";
            const string gold = "a <a hr<ef=aa > </close ";

            var filter   = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var stripped = new StringBuilder();

            for (var c = filter.Read(); c != -1; c = filter.Read())
            {
                stripped.Append((char)c);
            }

            var result  = stripped.ToString();
            var matches = result.Equals(gold);
            assertTrue(result + " is not equal to " + gold + "<EOS>", matches);
        }
Exemple #18
0
        /// <summary>
        /// Strips the "htmlStripReaderTest.html" test resource and checks that the output has no
        /// "&amp;lt;" entity and no "forrest"/"Forrest", and that (after trimming) it starts with
        /// "Welcome to Solr" and ends with "Foundation.".
        /// </summary>
        public virtual void TestHTML()
        {
            StringBuilder builder = new StringBuilder();

            // Dispose the resource stream and the filter even when reading throws.
            using (System.IO.Stream stream = this.GetType().getResourceAsStream("htmlStripReaderTest.html"))
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
            {
                int ch = -1;

                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }
            string str = builder.ToString();

            assertTrue("Entity not properly escaped", str.IndexOf("&lt;", StringComparison.Ordinal) == -1); //there is one > in the text
            assertTrue("Forrest should have been stripped out", str.IndexOf("forrest", StringComparison.Ordinal) == -1 && str.IndexOf("Forrest", StringComparison.Ordinal) == -1);
            assertTrue("File should start with 'Welcome to Solr' after trimming", str.Trim().StartsWith("Welcome to Solr", StringComparison.Ordinal));

            // Message corrected: this check is about how the text ends, not how it starts.
            assertTrue("File should end with 'Foundation.' after trimming", str.Trim().EndsWith("Foundation.", StringComparison.Ordinal));
        }
        //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
        //ORIGINAL LINE: public void doTestOffsets(String in) throws Exception
        /// <summary>
        /// Reads <paramref name="in"/> through an <c>HTMLStripCharFilter</c> and, for every 'X' in
        /// the output, asserts that the corrected offset equals the index of the next 'X' in the
        /// original string.
        /// </summary>
        /// <param name="in">Input text containing 'X' marker characters.</param>
        public virtual void doTestOffsets(string @in)
        {
            HTMLStripCharFilter filter = new HTMLStripCharFilter(new System.IO.StreamReader(new StringReader(@in)));
            int markerIndex = -1; // index of the most recently matched 'X' in the original string

            // outputPos counts characters produced by the filter.
            for (int outputPos = 0, c = filter.read(); c != -1; outputPos++, c = filter.read())
            {
                int corrected = filter.correctOffset(outputPos);

                if (c == 'X')
                {
                    markerIndex = @in.IndexOf('X', markerIndex + 1);
                    assertEquals(markerIndex, corrected);
                }
            }
        }
Exemple #20
0
        /// <summary>
        /// Reads <paramref name="test"/> through an <c>HTMLStripCharFilter</c> and asserts that the
        /// output is identical to the input.
        /// </summary>
        /// <param name="test">Text to run through the filter.</param>
        /// <param name="assertMsg">Message passed to the equality assertion.</param>
        private static void processBuffer(String test, String assertMsg)
        {
            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var output = new StringBuilder();

            for (var c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            Assert.AreEqual(test, output.ToString(), assertMsg);
        }
Exemple #21
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public void doTestOffsets(String in) throws Exception
        /// <summary>
        /// Reads <paramref name="in"/> through an <c>HTMLStripCharFilter</c> and, for every 'X' in
        /// the output, asserts that the corrected offset equals the index of the next 'X' in the
        /// original string.
        /// </summary>
        /// <param name="in">Input text containing 'X' marker characters.</param>
        public virtual void doTestOffsets(string @in)
        {
            HTMLStripCharFilter filter = new HTMLStripCharFilter(new System.IO.StreamReader(new StringReader(@in)));
            int outputPos   = 0;  // position in the filter's output
            int markerIndex = -1; // index of the most recently matched 'X' in the original string
            int current;

            while ((current = filter.read()) != -1)
            {
                int corrected = filter.correctOffset(outputPos);
                outputPos++;

                if (current != 'X')
                {
                    continue;
                }

                markerIndex = @in.IndexOf('X', markerIndex + 1);
                assertEquals(markerIndex, corrected);
            }
        }
Exemple #22
0
        /// <summary>
        /// Reads <paramref name="in"/> (round-tripped through UTF-8) through an
        /// <see cref="HTMLStripCharFilter"/> and, for every 'X' in the output, asserts that the
        /// corrected offset equals the index of the next 'X' in the original string.
        /// </summary>
        /// <param name="in">Input text containing 'X' marker characters.</param>
        public virtual void DoTestOffsets(string @in)
        {
            var utf8Source = new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)));
            HTMLStripCharFilter filter = new HTMLStripCharFilter(utf8Source);
            int markerIndex = -1; // index of the most recently matched 'X' in the original string

            // outputPos counts characters produced by the filter; reading stops on a non-positive value.
            for (int outputPos = 0, c = filter.Read(); c > 0; outputPos++, c = filter.Read())
            {
                int corrected = filter.CorrectOffset(outputPos);

                if (c == 'X')
                {
                    markerIndex = @in.IndexOf('X', markerIndex + 1);
                    assertEquals(markerIndex, corrected);
                }
            }
        }
Exemple #23
0
        /// <summary>
        /// Reads <paramref name="input"/> through an <c>HTMLStripCharFilter</c> and, for every 'X'
        /// in the output, asserts that the corrected offset equals the index of the next 'X' in
        /// the original string.
        /// </summary>
        /// <param name="input">Input text containing 'X' marker characters.</param>
        public void doTestOffsets(String input)
        {
            var filter      = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
            int outputPos   = 0;  // position in the filter's output
            int markerIndex = -1; // index of the most recently matched 'X' in the original string
            int current;

            while ((current = filter.Read()) != -1)
            {
                var corrected = filter.CorrectOffset(outputPos);
                outputPos++;

                if (current != 'X')
                {
                    continue;
                }

                markerIndex = input.IndexOf('X', markerIndex + 1);
                assertEquals(markerIndex, corrected);
            }
        }
Exemple #24
0
        /// <summary>
        /// Runs text containing named and numeric character entities through an
        /// <c>HTMLStripCharFilter</c> (constructed with the tag set {"reserved"}) and asserts the
        /// exact decoded output.
        /// </summary>
        public void TestMoreEntities()
        {
            const string test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
            const string gold = "  <junk/>   ! @ and ’";

            var tags = new HashSet <String>();
            tags.Add("reserved");

            var filter  = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tags);
            var decoded = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                decoded.Append((char)c);
            }

            var result = decoded.ToString();
            Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold);
        }
Exemple #25
0
        /// <summary>
        /// Asserts that the filter turns the entity "&amp;Gamma;" into the single character U+0393.
        /// </summary>
        public void TestGamma()
        {
            const string test = "&Gamma;";
            const string gold = "\u0393";

            var tags = new HashSet <String>();
            tags.Add("reserved");

            var filter  = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tags);
            var decoded = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                decoded.Append((char)c);
            }

            var result = decoded.ToString();
            Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold + "<EOS>");
        }
Exemple #26
0
        /// <summary>
        /// Asserts that a comment opened with three dashes, followed by one space, is reduced by
        /// the filter to exactly two spaces.
        /// </summary>
        public void TestComment()
        {
            const string test = "<!--- three dashes, still a valid comment ---> ";
            const string gold = "  ";

            var filter   = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var stripped = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                stripped.Append((char)c);
            }

            bool matches = stripped.ToString().Equals(gold);
            assertTrue(stripped.ToString() + " is not equal to " + gold + "<EOS>", matches);
        }
Exemple #27
0
        /// <summary>
        /// Runs text mixing named, decimal and hexadecimal character entities through an
        /// <c>HTMLStripCharFilter</c> (constructed with the tag set {"reserved"}) and asserts the
        /// exact decoded output.
        /// </summary>
        public void TestEntities()
        {
            const string test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
            const string gold = "  <foo> \u00DCbermensch = \u0393 bar \u0393";

            var tags = new HashSet <String>();
            tags.Add("reserved");

            var filter  = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tags);
            var decoded = new StringBuilder();

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                decoded.Append((char)c);
            }

            var result = decoded.ToString();
            Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold + "<EOS>");
        }
Exemple #28
0
        /// <summary>
        /// Constructs the filter with the tag set {"reserved"} and asserts that "reserved" is found
        /// in the output at indices 9, 38 and 54, while "other" does not appear at all.
        /// </summary>
        public virtual void TestReserved()
        {
            const string markup = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";

            ISet <string> tags = new HashSet <string> {
                "reserved"
            };
            TextReader    filter   = new HTMLStripCharFilter(new StringReader(markup), tags);
            StringBuilder stripped = new StringBuilder();

            for (int c = filter.Read(); c > 0; c = filter.Read())
            {
                stripped.Append((char)c);
            }
            string result = stripped.ToString();

            // Each search runs immediately before its own assertion so that the order of
            // failures is unchanged.
            int opening = result.IndexOf("reserved", StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + opening, opening == 9);

            int closing = result.IndexOf("reserved", 15, StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + closing, closing == 38);

            int selfClosing = result.IndexOf("reserved", 41, StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + selfClosing, selfClosing == 54);

            bool otherAbsent = result.IndexOf("other", StringComparison.Ordinal) == -1;
            assertTrue("Other tag should be removed", otherAbsent);
        }
Exemple #29
0
        /// <summary>
        /// Constructs the filter with the tag set {"reserved"} and asserts that "reserved" is found
        /// in the output at indices 9, 38 and 54, while "other" does not appear at all.
        /// </summary>
        public void TestReserved()
        {
            const string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
            var          set  = new HashSet <String> {
                "reserved"
            };
            var reader  = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), set);
            var builder = new StringBuilder();
            int ch      = 0;

            while ((ch = reader.Read()) != -1)
            {
                builder.Append((char)ch);
            }
            var result = builder.ToString();

            // Ordinal searches: IndexOf(string) without a comparison uses the current culture,
            // which can shift the reported indices. Each search runs immediately before its own
            // assertion so a too-short result still fails on the first check.
            int first = result.IndexOf("reserved", System.StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + first, first == 9);

            int second = result.IndexOf("reserved", 15, System.StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + second, second == 38);

            int third = result.IndexOf("reserved", 41, System.StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + third, third == 54);

            assertTrue("Other tag should be removed", result.IndexOf("other", System.StringComparison.Ordinal) == -1);
        }
Exemple #30
0
        /// <summary>
        /// Strips a document containing tags, entities, a bare ampersand and a comment, comparing
        /// the output with the expected text character by character and then as a whole.
        /// </summary>
        public void Test()
        {
            const string html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
                                "another <a href=\"http://lucene.apache.org/\">link</a>. " +
                                "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
            const string gold = " this is some text  here is a  link  and " +
                                "another  link . " +
                                "This is an entity: & plus a <.  Here is an &.  ";
            var reader    = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
            var builder   = new StringBuilder();
            var ch        = -1;
            var goldArray = gold.ToCharArray();
            var position  = 0;

            while ((ch = reader.Read()) != -1)
            {
                var theChar = (char)ch;
                builder.Append(theChar);

                // Guard the index first: output longer than gold should fail as an assertion with
                // context, not as an IndexOutOfRangeException while building the message below.
                Assert.IsTrue(position < goldArray.Length, "Output is longer than the expected " + goldArray.Length + " characters. Buffer so far: " + builder + "<EOB>");

                Assert.IsTrue(theChar == goldArray[position], "\"" + theChar + "\"" + " at position: " + position + " does not equal: \"" + goldArray[position]
                              + "\". Buffer so far: " + builder + "<EOB>");
                position++;
            }
            Assert.AreEqual(gold, builder.ToString());
        }
Exemple #31
0
        /// <summary>
        /// Strips a Hebrew document containing tags, attributes and a comment, comparing the
        /// output with the expected text character by character and then as a whole, and finally
        /// runs <c>doTestOffsets</c> on a Hebrew string with 'X' markers.
        /// </summary>
        public void TestHebrewScenarios()
        {
            const string html = "<div class=\"foo\">בדיקה ראשונה</div> וכאן נוסיף גם <a href=\"#bar\">לינק</a> ועכשיו " +
                                "גם <a alt=\"לינק מסובך עם תיאור\" href=\"http://lucene.apache.org/\">לינק מסובך יותר</a>. " +
                                " <!-- הערה אחת ויחידה -->";
            const string gold = " בדיקה ראשונה  וכאן נוסיף גם  לינק  ועכשיו " +
                                "גם  לינק מסובך יותר .   ";
            var reader    = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
            var builder   = new StringBuilder();
            var ch        = -1;
            var goldArray = gold.ToCharArray();
            var position  = 0;

            while ((ch = reader.Read()) != -1)
            {
                var theChar = (char)ch;
                builder.Append(theChar);

                // Guard the index first: output longer than gold should fail as an assertion with
                // context, not as an IndexOutOfRangeException while building the message below.
                Assert.IsTrue(position < goldArray.Length, "Output is longer than the expected " + goldArray.Length + " characters. Buffer so far: " + builder + "<EOB>");

                Assert.IsTrue(theChar == goldArray[position], "\"" + theChar + "\"" + " at position: " + position + " does not equal: \"" + goldArray[position] + "\". Buffer so far: " + builder + "<EOB>");
                position++;
            }
            Assert.AreEqual(gold, builder.ToString());

            doTestOffsets("שלום X מה X שלומך חבר");
        }
        /// <summary>
        /// Wraps <paramref name="reader"/> in an <c>HTMLStripCharFilter</c> and passes the wrapped
        /// reader to the base implementation.
        /// </summary>
        /// <param name="fieldName">Field name forwarded unchanged to the base implementation.</param>
        /// <param name="reader">Reader supplying the text.</param>
        /// <returns>The token stream produced by the base implementation.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            return base.TokenStream(fieldName, new HTMLStripCharFilter(CharReader.Get(reader)));
        }
        //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
        //ORIGINAL LINE: public void testBufferOverflow() throws Exception
        /// <summary>
        /// Feeds the filter inputs longer than <c>HTMLStripCharFilter.InitialBufferSize</c>:
        /// text after an empty processing instruction, an oversized comment, an oversized
        /// processing instruction and an oversized tag, asserting the expected output of each.
        /// </summary>
        public virtual void testBufferOverflow()
        {
            StringBuilder html = new StringBuilder(HTMLStripCharFilter.InitialBufferSize + 50);

            // Case 1: the input is expected to come back unchanged.
            html.Append("ah<?> ??????");
            appendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            var source = new StringReader(html.ToString());
            Reader reader = new HTMLStripCharFilter(new System.IO.StreamReader(source));
            assertHTMLStripsTo(reader, html.ToString(), null);

            // Case 2: an oversized comment followed by "foo".
            html.Length = 0;
            html.Append("<!--"); //comments
            appendChars(html, 3 * HTMLStripCharFilter.InitialBufferSize + 500); //comments have two lookaheads
            html.Append("-->foo");
            assertHTMLStripsTo(html.ToString(), "foo", null);

            // Case 3: an oversized processing instruction.
            html.Length = 0;
            html.Append("<?");
            appendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("?>");
            assertHTMLStripsTo(html.ToString(), "", null);

            // Case 4: an oversized self-closing tag.
            html.Length = 0;
            html.Append("<b ");
            appendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("/>");
            assertHTMLStripsTo(html.ToString(), "", null);
        }
        /// <summary>
        /// Feeds the filter inputs longer than <c>HTMLStripCharFilter.InitialBufferSize</c>:
        /// text after an empty processing instruction, an oversized comment, an oversized
        /// processing instruction and an oversized tag, asserting the expected output of each.
        /// </summary>
        public virtual void TestBufferOverflow()
        {
            StringBuilder html = new StringBuilder(HTMLStripCharFilter.InitialBufferSize + 50);

            // Case 1: the input is expected to come back unchanged.
            html.Append("ah<?> ??????");
            AppendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            var utf8 = html.ToString().GetBytes(Encoding.UTF8);
            TextReader wrapped = new HTMLStripCharFilter(new StreamReader(new MemoryStream(utf8)));
            AssertHTMLStripsTo(wrapped, html.ToString(), null);

            // Case 2: an oversized comment followed by "foo".
            html.Length = 0;
            html.Append("<!--"); //comments
            AppendChars(html, 3 * HTMLStripCharFilter.InitialBufferSize + 500); //comments have two lookaheads
            html.Append("-->foo");
            AssertHTMLStripsTo(html.ToString(), "foo", null);

            // Case 3: an oversized processing instruction.
            html.Length = 0;
            html.Append("<?");
            AppendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("?>");
            AssertHTMLStripsTo(html.ToString(), "", null);

            // Case 4: an oversized self-closing tag.
            html.Length = 0;
            html.Append("<b ");
            AppendChars(html, HTMLStripCharFilter.InitialBufferSize + 500);
            html.Append("/>");
            AssertHTMLStripsTo(html.ToString(), "", null);
        }
 /// <summary>
 /// Strips the embedded resource "MS-Word 14 generated.htm" and checks that the
 /// trimmed output is exactly "This is a test".
 /// </summary>
 public virtual void TestMSWord14GeneratedHTML()
 {
     string gold = "This is a test";
     StringBuilder builder = new StringBuilder();

     // Dispose the resource stream and the filter even when reading throws.
     using (System.IO.Stream stream = this.GetType().getResourceAsStream("MS-Word 14 generated.htm"))
     using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
     {
         int ch;
         // TextReader.Read() signals end of input with -1; testing "> 0" would also
         // stop on a U+0000 character and silently truncate the output.
         while ((ch = reader.Read()) != -1)
         {
             builder.Append((char)ch);
         }
     }

     // Compare trimmed output to gold
     string actual = builder.ToString().Trim();
     assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
 }
        /// <summary>
        /// Reads <paramref name="in"/> through the filter and, for every 'X' that comes out,
        /// asserts that the corrected offset of that output position equals the position of
        /// the next not-yet-matched 'X' in the original string.
        /// </summary>
        /// <param name="in">Markup to strip; its 'X' characters serve as offset markers.</param>
        public virtual void DoTestOffsets(string @in)
        {
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)))))
            {
                int ch;
                int off = 0; // offset in the reader
                int strOff = -1; // offset in the original string

                // TextReader.Read() signals end of input with -1; testing "> 0" would also
                // stop on a U+0000 character and skip the remaining markers.
                while ((ch = reader.Read()) != -1)
                {
                    int correctedOff = reader.CorrectOffset(off);

                    if (ch == 'X')
                    {
                        strOff = @in.IndexOf('X', strOff + 1);
                        assertEquals(strOff, correctedOff);
                    }

                    off++;
                }
            }
        }
 /// <summary>
 /// Builds a random, space-separated document and drains it through the filter.
 /// The test makes no assertions; it passes when reading to the end does not throw.
 /// </summary>
 public virtual void TestRandomText()
 {
     StringBuilder text = new StringBuilder();
     int minNumWords = 10;
     int maxNumWords = 10000;
     int minWordLength = 3;
     int maxWordLength = 20;
     // Keep the order of the Random() draws (word count, then generator choice)
     // so a given seed produces the same document as before.
     int numWords = TestUtil.NextInt(Random(), minNumWords, maxNumWords);
     switch (TestUtil.NextInt(Random(), 0, 4))
     {
         case 0:
             {
                 for (int wordNum = 0; wordNum < numWords; ++wordNum)
                 {
                     text.Append(TestUtil.RandomUnicodeString(Random(), maxWordLength));
                     text.Append(' ');
                 }
                 break;
             }
         case 1:
             {
                 for (int wordNum = 0; wordNum < numWords; ++wordNum)
                 {
                     text.Append(TestUtil.RandomRealisticUnicodeString(Random(), minWordLength, maxWordLength));
                     text.Append(' ');
                 }
                 break;
             }
         default:
             { // every remaining selector value: words from TestUtil.RandomSimpleString
                 for (int wordNum = 0; wordNum < numWords; ++wordNum)
                 {
                     text.Append(TestUtil.RandomSimpleString(Random()));
                     text.Append(' ');
                 }
                 break;
             }
     }

     using (TextReader reader = new HTMLStripCharFilter(new StringReader(text.ToString())))
     {
         // TextReader.Read() signals end of input with -1. Testing "> 0" would end the
         // drain at the first U+0000 character (NOTE(review): the random generators
         // presumably can emit one - confirm), leaving the rest of the text unread.
         while (reader.Read() != -1)
         {
             // Intentionally empty: the characters themselves are discarded.
         }
     }
 }
 /// <summary>
 /// Reads <paramref name="in"/> through the filter and asserts, for every output position,
 /// that the corrected offset does not exceed the length of the original string.
 /// </summary>
 /// <param name="in">Text to strip and validate offsets for.</param>
 internal static void AssertLegalOffsets(string @in)
 {
     int length = @in.Length;
     using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)))))
     {
         int off = 0;
         // TextReader.Read() signals end of input with -1; testing "> 0" would also stop
         // on a U+0000 character and leave later offsets unchecked.
         while (reader.Read() != -1)
         {
             int correction = reader.CorrectOffset(off);
             assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length, correction <= length);
             off++;
         }
     }
 }
 //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
 //ORIGINAL LINE: static void assertLegalOffsets(String in) throws Exception
 /// <summary>
 /// Reads <paramref name="in"/> through the filter and asserts, for every output position,
 /// that the corrected offset does not exceed the length of the original string.
 /// </summary>
 /// <param name="in">Text to strip and validate offsets for.</param>
 internal static void assertLegalOffsets(string @in)
 {
     int length = @in.Length;
     // StreamReader has no constructor that accepts another reader, so the text is
     // encoded to UTF-8 bytes and exposed as a stream for it to read.
     using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(new System.IO.MemoryStream(Encoding.UTF8.GetBytes(@in)))))
     {
         int off = 0;
         // -1 is the end-of-input marker of TextReader.Read().
         while (reader.Read() != -1)
         {
             int correction = reader.CorrectOffset(off);
             assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length, correction <= length);
             off++;
         }
     }
 }
 /// <summary>
 /// Strips a document while passing "reserved" as an escaped tag, then checks that
 /// "reserved" is found at indexes 9, 38 and 54 of the output and that "other" is absent.
 /// </summary>
 public virtual void TestReserved()
 {
     string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
     ISet<string> set = new HashSet<string> { "reserved" };
     StringBuilder builder = new StringBuilder();

     using (TextReader reader = new HTMLStripCharFilter(new StringReader(test), set))
     {
         int ch;
         // TextReader.Read() signals end of input with -1, not with a non-positive value.
         while ((ch = reader.Read()) != -1)
         {
             builder.Append((char)ch);
         }
     }

     string result = builder.ToString();

     // Each lookup runs immediately before its own assertion so that a failure is
     // reported for the first expectation that does not hold.
     int firstIndex = result.IndexOf("reserved", StringComparison.Ordinal);
     assertTrue("Escaped tag not preserved: " + firstIndex, firstIndex == 9);

     int secondIndex = result.IndexOf("reserved", 15, StringComparison.Ordinal);
     assertTrue("Escaped tag not preserved: " + secondIndex, secondIndex == 38);

     int thirdIndex = result.IndexOf("reserved", 41, StringComparison.Ordinal);
     assertTrue("Escaped tag not preserved: " + thirdIndex, thirdIndex == 54);

     assertTrue("Other tag should be removed", result.IndexOf("other", StringComparison.Ordinal) == -1);
 }