예제 #1
0
        /// <summary>
        /// Reads <paramref name="in"/> through an <see cref="HTMLStripCharFilter"/> one character at a
        /// time and asserts, for every output position, that the corrected offset does not exceed the
        /// length of the input string.
        /// </summary>
        /// <param name="in">Text to filter; it is converted to UTF-8 bytes and read back through a <see cref="StreamReader"/>.</param>
        internal static void AssertLegalOffsets(string @in)
        {
            int length = @in.Length;

            // Dispose the filter when done so the reader chain is released deterministically
            // (presumably disposing the filter also closes the wrapped StreamReader/MemoryStream).
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)))))
            {
                int off = 0;

                // -1 is the end-of-input value of TextReader.Read(). Testing for it, rather than "> 0",
                // keeps a U+0000 character in the output from ending the check early.
                while (reader.Read() != -1)
                {
                    int correction = reader.CorrectOffset(off);
                    assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length, correction <= length);
                    off++;
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Strips the markup from the "MS-Word 14 generated.htm" resource and checks that the
        /// trimmed output is exactly "This is a test".
        /// </summary>
        public virtual void TestMSWord14GeneratedHTML()
        {
            string        gold    = "This is a test";
            StringBuilder builder = new StringBuilder();

            // Dispose the resource stream and the reader chain once the document has been read.
            using (System.IO.Stream stream = this.GetType().getResourceAsStream("MS-Word 14 generated.htm"))
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
            {
                int ch;

                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }

            // Compare the trimmed output to gold; trim once and reuse it for both message and check.
            string actual = builder.ToString().Trim();
            assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
        }
예제 #3
0
        /// <summary>
        /// Runs <paramref name="html"/> through an <see cref="HTMLStripCharFilter"/> and returns
        /// everything the filter emits.
        /// </summary>
        /// <param name="html">Markup to filter.</param>
        /// <returns>The concatenated characters read from the filter.</returns>
        private string HtmlToPlain(string html)
        {
            using (TextReader filter = new HTMLStripCharFilter(new StringReader(html)))
            {
                StringBuilder plain  = new StringBuilder();
                char[]        buffer = new char[1024];

                // Drain the filter in buffer-sized chunks until a read yields no characters.
                while (true)
                {
                    int count = filter.Read(buffer, 0, buffer.Length);
                    if (count <= 0)
                    {
                        break;
                    }

                    plain.Append(buffer, 0, count);
                }

                return plain.ToString();
            }
        }
예제 #4
0
        /// <summary>
        /// Smoke test: builds a random document of space-separated words (one of three random-string
        /// generators is chosen per run) and drains it through an <see cref="HTMLStripCharFilter"/>.
        /// There are no assertions; the test only fails if reading throws.
        /// </summary>
        public virtual void TestRandomText()
        {
            StringBuilder text          = new StringBuilder();
            int           minNumWords   = 10;
            int           maxNumWords   = 10000;
            int           minWordLength = 3;
            int           maxWordLength = 20;
            int           numWords      = TestUtil.NextInt32(Random, minNumWords, maxNumWords);

            switch (TestUtil.NextInt32(Random, 0, 4))
            {
            case 0:
                for (int wordNum = 0; wordNum < numWords; ++wordNum)
                {
                    text.Append(TestUtil.RandomUnicodeString(Random, maxWordLength));
                    text.Append(' ');
                }
                break;

            case 1:
                for (int wordNum = 0; wordNum < numWords; ++wordNum)
                {
                    text.Append(TestUtil.RandomRealisticUnicodeString(Random, minWordLength, maxWordLength));
                    text.Append(' ');
                }
                break;

            default:
                // any other value: simple (presumably ASCII-only) strings
                for (int wordNum = 0; wordNum < numWords; ++wordNum)
                {
                    text.Append(TestUtil.RandomSimpleString(Random));
                    text.Append(' ');
                }
                break;
            }

            // Dispose the filter once it has been drained.
            using (TextReader reader = new HTMLStripCharFilter(new StringReader(text.ToString())))
            {
                // Stop only at the end-of-input value (-1). A "> 0" test would also stop at a U+0000
                // character, leaving the rest of the random text unread.
                while (reader.Read() != -1)
                {
                }
            }
        }
        /// <summary>
        /// Strips the "htmlStripReaderTest.html" resource and checks the result: no "&amp;lt;" entity
        /// remains, neither "forrest" nor "Forrest" appears, and the trimmed text starts with
        /// "Welcome to Solr" and ends with "Foundation.".
        /// </summary>
        public virtual void TestHTML()
        {
            StringBuilder builder = new StringBuilder();

            // Dispose the resource stream and the reader chain once the document has been read.
            using (System.IO.Stream stream = this.GetType().getResourceAsStream("htmlStripReaderTest.html"))
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
            {
                int ch;
                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }

            string str = builder.ToString();
            string trimmed = str.Trim();

            assertTrue("Entity not properly escaped", str.IndexOf("&lt;", StringComparison.Ordinal) == -1); //there is one > in the text
            assertTrue("Forrest should have been stripped out", str.IndexOf("forrest", StringComparison.Ordinal) == -1 && str.IndexOf("Forrest", StringComparison.Ordinal) == -1);
            assertTrue("File should start with 'Welcome to Solr' after trimming", trimmed.StartsWith("Welcome to Solr", StringComparison.Ordinal));

            // This assertion checks the END of the text, so the message says "end".
            assertTrue("File should end with 'Foundation.' after trimming", trimmed.EndsWith("Foundation.", StringComparison.Ordinal));
        }
예제 #6
0
        /// <summary>
        /// Reads a small HTML snippet through an <see cref="HTMLStripCharFilter"/> using the buffered
        /// <c>Read(char[], int, int)</c> overload and compares the collected text with an expected string.
        /// </summary>
        public void Reader()
        {
            var filter = new HTMLStripCharFilter(new StringReader("<html>test1 test2</html>"));
            var text   = new StringBuilder();
            var buffer = new char[1024];

            // Keep pulling chunks until a read reports no characters.
            while (true)
            {
                int count = filter.Read(buffer, 0, buffer.Length);
                if (count <= 0)
                {
                    break;
                }

                text.Append(buffer, 0, count);
            }

            // NOTE(review): the input body is "test1 test2" but the expected value is "test" — confirm the expected string.
            Assert.Equal("test", text.ToString());
        }
예제 #7
0
        /// <summary>
        /// Strips the "htmlStripReaderTest.html" test file and checks the result: no "&amp;lt;" entity
        /// remains, neither "forrest" nor "Forrest" appears, and the trimmed text starts with
        /// "Welcome to Solr" and ends with "Foundation.".
        /// </summary>
        public void TestHTML()
        {
            var builder = new StringBuilder();

            // Dispose the StreamReader opened on the test file once the whole document has been read.
            using (var file = new StreamReader(GetTestFile("htmlStripReaderTest.html")))
            {
                var reader = new HTMLStripCharFilter(CharReader.Get(file));
                int ch;

                while ((ch = reader.Read()) != -1)
                {
                    builder.Append((char)ch);
                }
            }

            var str     = builder.ToString();
            var trimmed = str.Trim();

            Assert.IsTrue(str.IndexOf("&lt;") == -1, "Entity not properly escaped");            //there is one > in the text
            Assert.IsTrue(str.IndexOf("forrest") == -1 && str.IndexOf("Forrest") == -1, "Forrest should have been stripped out");
            Assert.IsTrue(trimmed.StartsWith("Welcome to Solr"), "File should start with 'Welcome to Solr' after trimming");

            // This assertion checks the END of the text, so the message says "end".
            Assert.IsTrue(trimmed.EndsWith("Foundation."), "File should end with 'Foundation.' after trimming");
        }
예제 #8
0
        /// <summary>
        /// Feeds a deliberately malformed fragment through the filter and asserts that the output
        /// equals the expected ("gold") string.
        /// </summary>
        public void TestMalformedHTML()
        {
            const string test = "a <a hr<ef=aa<a>> </close</a>";
            const string gold = "a <a hr<ef=aa > </close ";

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var output = new StringBuilder();

            // Read character by character until the filter reports end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            string result  = output.ToString();
            bool   matches = result.Equals(gold);

            assertTrue(result + " is not equal to " + gold + "<EOS>", matches);
        }
예제 #9
0
        /// <summary>
        /// Strips the "htmlStripReaderTest.html" resource and checks the result: no "&amp;lt;" entity
        /// remains, neither "forrest" nor "Forrest" appears, and the trimmed text starts with
        /// "Welcome to Solr" and ends with "Foundation.".
        /// </summary>
        public virtual void TestHTML()
        {
            StringBuilder builder = new StringBuilder();

            // Dispose the resource stream and the reader chain once the document has been read.
            using (System.IO.Stream stream = this.GetType().getResourceAsStream("htmlStripReaderTest.html"))
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
            {
                int ch;

                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }

            string str     = builder.ToString();
            string trimmed = str.Trim();

            assertTrue("Entity not properly escaped", str.IndexOf("&lt;", StringComparison.Ordinal) == -1); //there is one > in the text
            assertTrue("Forrest should have been stripped out", str.IndexOf("forrest", StringComparison.Ordinal) == -1 && str.IndexOf("Forrest", StringComparison.Ordinal) == -1);
            assertTrue("File should start with 'Welcome to Solr' after trimming", trimmed.StartsWith("Welcome to Solr", StringComparison.Ordinal));

            // This assertion checks the END of the text, so the message says "end".
            assertTrue("File should end with 'Foundation.' after trimming", trimmed.EndsWith("Foundation.", StringComparison.Ordinal));
        }
예제 #10
0
        /// <summary>
        /// Reads <paramref name="in"/> through an <see cref="HTMLStripCharFilter"/> and, for every 'X'
        /// the filter emits, asserts that the corrected offset of that output position equals the
        /// index of the next 'X' in the original string.
        /// </summary>
        /// <param name="in">Input text containing 'X' marker characters.</param>
        public virtual void DoTestOffsets(string @in)
        {
            // Dispose the filter when done so the reader chain is released deterministically.
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)))))
            {
                int ch;
                int off    = 0;  // offset in the reader
                int strOff = -1; // offset in the original string

                // -1 is the end-of-input value of TextReader.Read(); testing for it (instead of "> 0")
                // means a U+0000 character in the output does not cut the check short.
                while ((ch = reader.Read()) != -1)
                {
                    int correctedOff = reader.CorrectOffset(off);

                    if (ch == 'X')
                    {
                        // Advance to the next marker in the input and compare positions.
                        strOff = @in.IndexOf('X', strOff + 1);
                        assertEquals(strOff, correctedOff);
                    }

                    off++;
                }
            }
        }
예제 #11
0
        /// <summary>
        /// Reads <paramref name="test"/> through an <see cref="HTMLStripCharFilter"/> and asserts that
        /// the output is identical to the input.
        /// </summary>
        /// <param name="test">Text to filter; also the expected output.</param>
        /// <param name="assertMsg">Message passed to the equality assertion.</param>
        private static void processBuffer(String test, String assertMsg)
        {
            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var output = new StringBuilder();

            // Read character by character until the filter reports end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            Assert.AreEqual(test, output.ToString(), assertMsg);
        }
예제 #12
0
        /// <summary>
        /// Reads <paramref name="input"/> through an <see cref="HTMLStripCharFilter"/> and, for every
        /// 'X' the filter emits, asserts that the corrected offset of that output position equals the
        /// index of the next 'X' in the original string.
        /// </summary>
        /// <param name="input">Input text containing 'X' marker characters.</param>
        public void doTestOffsets(String input)
        {
            var filter    = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
            int outputPos = 0;   // position in the filtered output
            int inputPos  = -1;  // position of the last marker found in the input

            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                // Correct the current output position, then move on to the next one.
                var corrected = filter.CorrectOffset(outputPos++);

                if (c != 'X')
                {
                    continue;
                }

                inputPos = input.IndexOf('X', inputPos + 1);
                assertEquals(inputPos, corrected);
            }
        }
예제 #13
0
        /// <summary>
        /// Checks that the named entity "&amp;Gamma;" comes out of the filter as U+0393.
        /// </summary>
        public void TestGamma()
        {
            const string test = "&Gamma;";
            const string gold = "\u0393";

            // Tag-name set handed to the filter as its second constructor argument.
            var tagNames = new HashSet <String>();
            tagNames.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);
            var output = new StringBuilder();

            // Read character by character until the filter reports end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            string result  = output.ToString();
            bool   matches = result.Equals(gold);

            Assert.IsTrue(matches, result + " is not equal to " + gold + "<EOS>");
        }
예제 #14
0
        /// <summary>
        /// Checks that a mix of named, decimal and hexadecimal character references is decoded to
        /// the expected ("gold") text.
        /// </summary>
        public void TestEntities()
        {
            const string test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
            const string gold = "  <foo> \u00DCbermensch = \u0393 bar \u0393";

            // Tag-name set handed to the filter as its second constructor argument.
            var tagNames = new HashSet <String>();
            tagNames.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);
            var output = new StringBuilder();

            // Read character by character until the filter reports end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            string result  = output.ToString();
            bool   matches = result.Equals(gold);

            Assert.IsTrue(matches, result + " is not equal to " + gold + "<EOS>");
        }
예제 #15
0
        /// <summary>
        /// Checks a second batch of named and decimal character references against the expected
        /// ("gold") text.
        /// </summary>
        public void TestMoreEntities()
        {
            const string test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
            const string gold = "  <junk/>   ! @ and ’";

            // Tag-name set handed to the filter as its second constructor argument.
            var tagNames = new HashSet <String>();
            tagNames.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);
            var output = new StringBuilder();

            // Read character by character until the filter reports end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            string result  = output.ToString();
            bool   matches = result.Equals(gold);

            Assert.IsTrue(matches, result + " is not equal to " + gold);
        }
예제 #16
0
        /// <summary>
        /// Checks that a comment opened with three dashes is handled as a comment: the expected
        /// output for the input is two spaces.
        /// </summary>
        public void TestComment()
        {
            const string test = "<!--- three dashes, still a valid comment ---> ";
            const string gold = "  ";

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
            var output = new StringBuilder();

            // Read character by character until the filter reports end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            string actual = output.ToString();

            assertTrue(actual + " is not equal to " + gold + "<EOS>", actual.Equals(gold));
        }
예제 #17
0
        /// <summary>
        /// Passes a set containing "reserved" as the filter's second constructor argument and checks
        /// that the word "reserved" is found at output offsets 9, 38 and 54 while "other" does not
        /// appear in the output at all.
        /// </summary>
        public virtual void TestReserved()
        {
            string        test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
            ISet <string> set  = new HashSet <string>();

            set.Add("reserved");
            StringBuilder builder = new StringBuilder();

            // Dispose the filter once the input has been read.
            using (TextReader reader = new HTMLStripCharFilter(new StringReader(test), set))
            {
                int ch;

                while ((ch = reader.Read()) > 0)
                {
                    builder.Append((char)ch);
                }
            }
            string result = builder.ToString();

            // Each search runs once, immediately before its assertion, so a failed earlier assertion
            // is reported before a later search start index is ever evaluated.
            int first = result.IndexOf("reserved", StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + first, first == 9);

            int second = result.IndexOf("reserved", 15, StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + second, second == 38);

            int third = result.IndexOf("reserved", 41, StringComparison.Ordinal);
            assertTrue("Escaped tag not preserved: " + third, third == 54);

            assertTrue("Other tag should be removed", result.IndexOf("other", StringComparison.Ordinal) == -1);
        }
예제 #18
0
        /// <summary>
        /// Passes a set containing "reserved" as the filter's second constructor argument and checks
        /// that the word "reserved" is found at output offsets 9, 38 and 54 while "other" does not
        /// appear in the output at all.
        /// </summary>
        public void TestReserved()
        {
            const string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";

            var tagNames = new HashSet <String>();
            tagNames.Add("reserved");

            var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);
            var output = new StringBuilder();

            // Read character by character until the filter reports end of input (-1).
            for (int c = filter.Read(); c != -1; c = filter.Read())
            {
                output.Append((char)c);
            }

            var result = output.ToString();

            // Each search is performed right before its own assertion.
            int first = result.IndexOf("reserved");
            assertTrue("Escaped tag not preserved: " + first, first == 9);

            int second = result.IndexOf("reserved", 15);
            assertTrue("Escaped tag not preserved: " + second, second == 38);

            int third = result.IndexOf("reserved", 41);
            assertTrue("Escaped tag not preserved: " + third, third == 54);

            bool otherGone = result.IndexOf("other") == -1;
            assertTrue("Other tag should be removed", otherGone);
        }
예제 #19
0
        /// <summary>
        /// Filters an HTML fragment with Hebrew text, comparing the output to the expected ("gold")
        /// string character by character and then as a whole, and finally runs
        /// <c>doTestOffsets</c> on a Hebrew sentence containing 'X' markers.
        /// </summary>
        public void TestHebrewScenarios()
        {
            const string html = "<div class=\"foo\">בדיקה ראשונה</div> וכאן נוסיף גם <a href=\"#bar\">לינק</a> ועכשיו " +
                                "גם <a alt=\"לינק מסובך עם תיאור\" href=\"http://lucene.apache.org/\">לינק מסובך יותר</a>. " +
                                " <!-- הערה אחת ויחידה -->";
            const string gold = " בדיקה ראשונה  וכאן נוסיף גם  לינק  ועכשיו " +
                                "גם  לינק מסובך יותר .   ";
            var reader    = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
            var builder   = new StringBuilder();
            var ch        = -1;
            var goldArray = gold.ToCharArray();
            var position  = 0;

            while ((ch = reader.Read()) != -1)
            {
                var theChar = (char)ch;
                builder.Append(theChar);

                // Fail with a readable message if the filter emits more characters than gold holds;
                // indexing goldArray past its end would throw IndexOutOfRangeException instead.
                Assert.IsTrue(position < goldArray.Length, "Unexpected extra output \"" + theChar + "\" at position: " + position + ". Buffer so far: " + builder + "<EOB>");
                Assert.IsTrue(theChar == goldArray[position], "\"" + theChar + "\"" + " at position: " + position + " does not equal: \"" + goldArray[position] + "\". Buffer so far: " + builder + "<EOB>");
                position++;
            }

            // Whole-string comparison also catches output that is shorter than gold.
            Assert.AreEqual(gold, builder.ToString());

            doTestOffsets("שלום X מה X שלומך חבר");
        }
예제 #20
0
        /// <summary>
        /// Filters an HTML fragment containing tags, links, entities, a bare ampersand and a comment,
        /// comparing the output to the expected ("gold") string character by character and then as a whole.
        /// </summary>
        public void Test()
        {
            const string html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
                                "another <a href=\"http://lucene.apache.org/\">link</a>. " +
                                "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
            const string gold = " this is some text  here is a  link  and " +
                                "another  link . " +
                                "This is an entity: & plus a <.  Here is an &.  ";
            var reader    = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
            var builder   = new StringBuilder();
            var ch        = -1;
            var goldArray = gold.ToCharArray();
            var position  = 0;

            while ((ch = reader.Read()) != -1)
            {
                var theChar = (char)ch;
                builder.Append(theChar);

                // Fail with a readable message if the filter emits more characters than gold holds;
                // indexing goldArray past its end would throw IndexOutOfRangeException instead.
                Assert.IsTrue(position < goldArray.Length, "Unexpected extra output \"" + theChar + "\" at position: " + position + ". Buffer so far: " + builder + "<EOB>");
                Assert.IsTrue(theChar == goldArray[position], "\"" + theChar + "\"" + " at position: " + position + " does not equal: \"" + goldArray[position]
                              + "\". Buffer so far: " + builder + "<EOB>");
                position++;
            }

            // Whole-string comparison also catches output that is shorter than gold.
            Assert.AreEqual(gold, builder.ToString());
        }
 /// <summary>
 /// Strips the markup from the "MS-Word 14 generated.htm" resource and checks that the
 /// trimmed output is exactly "This is a test".
 /// </summary>
 public virtual void TestMSWord14GeneratedHTML()
 {
     string gold = "This is a test";
     StringBuilder builder = new StringBuilder();

     // Dispose the resource stream and the reader chain once the document has been read.
     using (System.IO.Stream stream = this.GetType().getResourceAsStream("MS-Word 14 generated.htm"))
     using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new System.IO.StreamReader(stream, Encoding.UTF8)))
     {
         int ch;
         while ((ch = reader.Read()) > 0)
         {
             builder.Append((char)ch);
         }
     }

     // Compare the trimmed output to gold; trim once and reuse it for both message and check.
     string actual = builder.ToString().Trim();
     assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
 }
 /// <summary>
 /// Smoke test: builds a random document of space-separated words (one of three random-string
 /// generators is chosen per run) and drains it through an <see cref="HTMLStripCharFilter"/>.
 /// There are no assertions; the test only fails if reading throws.
 /// </summary>
 public virtual void TestRandomText()
 {
     StringBuilder text = new StringBuilder();
     int minNumWords = 10;
     int maxNumWords = 10000;
     int minWordLength = 3;
     int maxWordLength = 20;
     int numWords = TestUtil.NextInt(Random(), minNumWords, maxNumWords);
     switch (TestUtil.NextInt(Random(), 0, 4))
     {
         case 0:
             for (int wordNum = 0; wordNum < numWords; ++wordNum)
             {
                 text.Append(TestUtil.RandomUnicodeString(Random(), maxWordLength));
                 text.Append(' ');
             }
             break;
         case 1:
             for (int wordNum = 0; wordNum < numWords; ++wordNum)
             {
                 text.Append(TestUtil.RandomRealisticUnicodeString(Random(), minWordLength, maxWordLength));
                 text.Append(' ');
             }
             break;
         default:
             // any other value: simple (presumably ASCII-only) strings
             for (int wordNum = 0; wordNum < numWords; ++wordNum)
             {
                 text.Append(TestUtil.RandomSimpleString(Random()));
                 text.Append(' ');
             }
             break;
     }

     // Dispose the filter once it has been drained.
     using (TextReader reader = new HTMLStripCharFilter(new StringReader(text.ToString())))
     {
         // Stop only at the end-of-input value (-1). A "> 0" test would also stop at a U+0000
         // character, leaving the rest of the random text unread.
         while (reader.Read() != -1)
         {
         }
     }
 }
 /// <summary>
 /// Reads <paramref name="in"/> through an <see cref="HTMLStripCharFilter"/> one character at a
 /// time and asserts, for every output position, that the corrected offset does not exceed the
 /// length of the input string.
 /// </summary>
 /// <param name="in">Text to filter; it is converted to UTF-8 bytes and read back through a <see cref="StreamReader"/>.</param>
 internal static void AssertLegalOffsets(string @in)
 {
     int length = @in.Length;

     // Dispose the filter when done so the reader chain is released deterministically
     // (presumably disposing the filter also closes the wrapped StreamReader/MemoryStream).
     using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)))))
     {
         int off = 0;

         // -1 is the end-of-input value of TextReader.Read(). Testing for it, rather than "> 0",
         // keeps a U+0000 character in the output from ending the check early.
         while (reader.Read() != -1)
         {
             int correction = reader.CorrectOffset(off);
             assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length, correction <= length);
             off++;
         }
     }
 }
        /// <summary>
        /// Reads <paramref name="in"/> through an <see cref="HTMLStripCharFilter"/> and, for every 'X'
        /// the filter emits, asserts that the corrected offset of that output position equals the
        /// index of the next 'X' in the original string.
        /// </summary>
        /// <param name="in">Input text containing 'X' marker characters.</param>
        public virtual void DoTestOffsets(string @in)
        {
            // Dispose the filter when done so the reader chain is released deterministically.
            using (HTMLStripCharFilter reader = new HTMLStripCharFilter(new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8)))))
            {
                int ch;
                int off = 0; // offset in the reader
                int strOff = -1; // offset in the original string

                // -1 is the end-of-input value of TextReader.Read(); testing for it (instead of "> 0")
                // means a U+0000 character in the output does not cut the check short.
                while ((ch = reader.Read()) != -1)
                {
                    int correctedOff = reader.CorrectOffset(off);

                    if (ch == 'X')
                    {
                        // Advance to the next marker in the input and compare positions.
                        strOff = @in.IndexOf('X', strOff + 1);
                        assertEquals(strOff, correctedOff);
                    }

                    off++;
                }
            }
        }
 /// <summary>
 /// Passes a set containing "reserved" as the filter's second constructor argument and checks
 /// that the word "reserved" is found at output offsets 9, 38 and 54 while "other" does not
 /// appear in the output at all.
 /// </summary>
 public virtual void TestReserved()
 {
     string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
     ISet<string> set = new HashSet<string>();
     set.Add("reserved");
     StringBuilder builder = new StringBuilder();

     // Dispose the filter once the input has been read.
     using (TextReader reader = new HTMLStripCharFilter(new StringReader(test), set))
     {
         int ch;
         while ((ch = reader.Read()) > 0)
         {
             builder.Append((char)ch);
         }
     }
     string result = builder.ToString();

     // Each search runs once, immediately before its assertion, so a failed earlier assertion
     // is reported before a later search start index is ever evaluated.
     int first = result.IndexOf("reserved", StringComparison.Ordinal);
     assertTrue("Escaped tag not preserved: " + first, first == 9);

     int second = result.IndexOf("reserved", 15, StringComparison.Ordinal);
     assertTrue("Escaped tag not preserved: " + second, second == 38);

     int third = result.IndexOf("reserved", 41, StringComparison.Ordinal);
     assertTrue("Escaped tag not preserved: " + third, third == 54);

     assertTrue("Other tag should be removed", result.IndexOf("other", StringComparison.Ordinal) == -1);
 }