/// <summary>
/// Feeds <paramref name="in"/> through an <c>HTMLStripCharFilter</c> and asserts, for every
/// character position produced by the filter, that the corrected offset does not exceed
/// the length of the input string.
/// </summary>
/// <param name="in">Text to filter. It is encoded to UTF-8 bytes and read back through a <c>StreamReader</c>.</param>
internal static void AssertLegalOffsets(string @in)
{
    int length = @in.Length;

    // Dispose the underlying reader (and its stream) once the check is finished.
    using (StreamReader source = new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8))))
    {
        HTMLStripCharFilter reader = new HTMLStripCharFilter(source);
        int off = 0;

        // End of input is signalled by -1. Testing for "> 0" would also stop at a U+0000
        // character and silently skip the rest of the document.
        while (reader.Read() != -1)
        {
            int correction = reader.CorrectOffset(off);
            assertTrue("invalid offset correction: " + off + "->" + correction + " for doc of length: " + length, correction <= length);
            off++;
        }
    }
}
/// <summary>
/// Strips the resource "MS-Word 14 generated.htm" with an <c>HTMLStripCharFilter</c> and
/// asserts that the trimmed output equals "This is a test".
/// </summary>
public virtual void TestMSWord14GeneratedHTML()
{
    string gold = "This is a test";
    StringBuilder builder = new StringBuilder();

    // Both the resource stream and the reader wrapping it are released when the block exits.
    using (System.IO.Stream stream = this.GetType().getResourceAsStream("MS-Word 14 generated.htm"))
    using (System.IO.StreamReader source = new System.IO.StreamReader(stream, Encoding.UTF8))
    {
        HTMLStripCharFilter reader = new HTMLStripCharFilter(source);
        int ch;

        // -1 marks end of input; "> 0" would also stop on a U+0000 character.
        while ((ch = reader.Read()) != -1)
        {
            builder.Append((char)ch);
        }
    }

    // Compare trimmed output to gold (trim once, reuse for message and comparison).
    string actual = builder.ToString().Trim();
    assertEquals("'" + actual + "' is not equal to '" + gold + "'", gold, actual);
}
/// <summary>
/// Runs <paramref name="html"/> through an <c>HTMLStripCharFilter</c> and returns everything
/// the filter produces as a single string.
/// </summary>
/// <param name="html">Markup to be filtered.</param>
/// <returns>The concatenated output of the filter.</returns>
private string HtmlToPlain(string html)
{
    using (TextReader filter = new HTMLStripCharFilter(new StringReader(html)))
    {
        StringBuilder plain = new StringBuilder();
        char[] buffer = new char[1024];

        // Pull 1024-char chunks until a read yields no characters.
        for (int count = filter.Read(buffer, 0, buffer.Length);
             count > 0;
             count = filter.Read(buffer, 0, buffer.Length))
        {
            plain.Append(buffer, 0, count);
        }

        return plain.ToString();
    }
}
/// <summary>
/// Builds a random, space-separated document of 10 to 10000 words and reads it to the end
/// through an <c>HTMLStripCharFilter</c>. The test makes no assertions; it only requires
/// that the filter consumes the text without throwing.
/// </summary>
public virtual void TestRandomText()
{
    const int minNumWords = 10;
    const int maxNumWords = 10000;
    const int minWordLength = 3;
    const int maxWordLength = 20;

    StringBuilder text = new StringBuilder();
    int numWords = TestUtil.NextInt32(Random, minNumWords, maxNumWords);

    // The kind of word is chosen once per document; the per-word calls on Random happen
    // in the same order as with one loop per kind.
    int wordKind = TestUtil.NextInt32(Random, 0, 4);
    for (int wordNum = 0; wordNum < numWords; ++wordNum)
    {
        switch (wordKind)
        {
            case 0:
                text.Append(TestUtil.RandomUnicodeString(Random, maxWordLength));
                break;
            case 1:
                text.Append(TestUtil.RandomRealisticUnicodeString(Random, minWordLength, maxWordLength));
                break;
            default:
                // every other value: simple strings
                text.Append(TestUtil.RandomSimpleString(Random));
                break;
        }
        text.Append(' ');
    }

    using (TextReader reader = new HTMLStripCharFilter(new StringReader(text.ToString())))
    {
        // Drain to the real end of input (-1). A "> 0" test would stop at the first
        // U+0000 in the random text and leave the rest of the document unread.
        while (reader.Read() != -1)
        {
        }
    }
}
/// <summary>
/// Strips the resource "htmlStripReaderTest.html" with an <c>HTMLStripCharFilter</c> and checks
/// that no '&lt;' remains, that "forrest"/"Forrest" is absent, and that the trimmed output
/// starts with "Welcome to Solr" and ends with "Foundation.".
/// </summary>
public virtual void TestHTML()
{
    StringBuilder builder = new StringBuilder();

    // Release the resource stream and its reader once the content has been read.
    using (System.IO.Stream stream = this.GetType().getResourceAsStream("htmlStripReaderTest.html"))
    using (System.IO.StreamReader source = new System.IO.StreamReader(stream, Encoding.UTF8))
    {
        HTMLStripCharFilter reader = new HTMLStripCharFilter(source);
        int ch;

        // -1 marks end of input; "> 0" would also stop on a U+0000 character.
        while ((ch = reader.Read()) != -1)
        {
            builder.Append((char)ch);
        }
    }

    string str = builder.ToString();
    string trimmed = str.Trim();

    // Only '<' is checked: there is one '>' in the text.
    assertTrue("Entity not properly escaped", str.IndexOf("<", StringComparison.Ordinal) == -1);
    assertTrue("Forrest should have been stripped out", str.IndexOf("forrest", StringComparison.Ordinal) == -1 && str.IndexOf("Forrest", StringComparison.Ordinal) == -1);
    assertTrue("File should start with 'Welcome to Solr' after trimming", trimmed.StartsWith("Welcome to Solr", StringComparison.Ordinal));
    // The message previously said "start with" although the check is EndsWith.
    assertTrue("File should end with 'Foundation.' after trimming", trimmed.EndsWith("Foundation.", StringComparison.Ordinal));
}
/// <summary>
/// Reads a small HTML snippet through an <c>HTMLStripCharFilter</c> using the buffered
/// <c>Read(char[], int, int)</c> overload and compares the collected output with "test".
/// </summary>
public void Reader()
{
    HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader("<html>test1 test2</html>"));
    StringBuilder collected = new StringBuilder();
    char[] buffer = new char[1024];

    // Keep reading chunks until a read yields no characters.
    while (true)
    {
        int count = filter.Read(buffer, 0, buffer.Length);
        if (count <= 0)
        {
            break;
        }
        collected.Append(buffer, 0, count);
    }

    // NOTE(review): the expected value is "test" although the input text is "test1 test2";
    // confirm this expectation against the filter's actual output.
    Assert.Equal("test", collected.ToString());
}
/// <summary>
/// Strips the test file "htmlStripReaderTest.html" with an <c>HTMLStripCharFilter</c> and checks
/// that no '&lt;' remains, that "forrest"/"Forrest" is absent, and that the trimmed output
/// starts with "Welcome to Solr" and ends with "Foundation.".
/// </summary>
public void TestHTML()
{
    var builder = new StringBuilder();

    // Dispose the file reader once the content has been consumed.
    using (var source = new StreamReader(GetTestFile("htmlStripReaderTest.html")))
    {
        var reader = new HTMLStripCharFilter(CharReader.Get(source));
        int ch;
        while ((ch = reader.Read()) != -1)
        {
            builder.Append((char)ch);
        }
    }

    var str = builder.ToString();
    var trimmed = str.Trim();

    // Ordinal comparisons: these are literal marker strings, not linguistic text.
    // Only '<' is checked: there is one '>' in the text.
    Assert.IsTrue(str.IndexOf("<", System.StringComparison.Ordinal) == -1, "Entity not properly escaped");
    Assert.IsTrue(str.IndexOf("forrest", System.StringComparison.Ordinal) == -1 && str.IndexOf("Forrest", System.StringComparison.Ordinal) == -1, "Forrest should have been stripped out");
    Assert.IsTrue(trimmed.StartsWith("Welcome to Solr", System.StringComparison.Ordinal), "File should start with 'Welcome to Solr' after trimming");
    // The message previously said "start with" although the check is EndsWith.
    Assert.IsTrue(trimmed.EndsWith("Foundation.", System.StringComparison.Ordinal), "File should end with 'Foundation.' after trimming");
}
/// <summary>
/// Passes a deliberately malformed HTML fragment through an <c>HTMLStripCharFilter</c> and
/// asserts that the output equals the expected literal.
/// </summary>
public void TestMalformedHTML()
{
    const string input = "a <a hr<ef=aa<a>> </close</a>";
    const string expected = "a <a hr<ef=aa > </close ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
    var output = new StringBuilder();

    // Read one character at a time until end of input (-1).
    for (var next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    var actual = output.ToString();
    assertTrue(actual + " is not equal to " + expected + "<EOS>", actual.Equals(expected));
}
/// <summary>
/// Reads <paramref name="in"/> through an <c>HTMLStripCharFilter</c> and, for every 'X' the
/// filter emits, asserts that the corrected offset equals the position of the next 'X' in
/// the original string.
/// </summary>
/// <param name="in">Input text; 'X' characters act as markers whose offsets are verified.</param>
public virtual void DoTestOffsets(string @in)
{
    // Dispose the underlying reader (and its stream) once the check is finished.
    using (StreamReader source = new StreamReader(new MemoryStream(@in.GetBytes(Encoding.UTF8))))
    {
        HTMLStripCharFilter reader = new HTMLStripCharFilter(source);
        int ch;
        int off = 0; // offset in the reader
        int strOff = -1; // offset in the original string

        // -1 marks end of input; "> 0" would also stop on a U+0000 character.
        while ((ch = reader.Read()) != -1)
        {
            int correctedOff = reader.CorrectOffset(off);
            if (ch == 'X')
            {
                // Advance to the next marker in the unfiltered input.
                strOff = @in.IndexOf('X', strOff + 1);
                assertEquals(strOff, correctedOff);
            }
            off++;
        }
    }
}
/// <summary>
/// Reads <paramref name="test"/> through an <c>HTMLStripCharFilter</c> and asserts that the
/// filter's output is identical to the input.
/// </summary>
/// <param name="test">Text that is expected to pass through the filter unchanged.</param>
/// <param name="assertMsg">Message reported when the output differs from the input.</param>
private static void processBuffer(String test, String assertMsg)
{
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
    var output = new StringBuilder();

    // Copy the filter output character by character until end of input (-1).
    for (var next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    Assert.AreEqual(test, output.ToString(), assertMsg);
}
/// <summary>
/// Reads <paramref name="input"/> through an <c>HTMLStripCharFilter</c> and, for every 'X' the
/// filter emits, asserts that the corrected offset equals the position of the next 'X' in
/// the original string.
/// </summary>
/// <param name="input">Input text; 'X' characters act as markers whose offsets are verified.</param>
public void doTestOffsets(String input)
{
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
    var markerIndex = -1; // position of the last marker found in the original string
    int next;

    // readerOffset counts the characters produced by the filter so far.
    for (var readerOffset = 0; (next = filter.Read()) != -1; readerOffset++)
    {
        var corrected = filter.CorrectOffset(readerOffset);
        if (next != 'X')
        {
            continue;
        }

        markerIndex = input.IndexOf('X', markerIndex + 1);
        assertEquals(markerIndex, corrected);
    }
}
/// <summary>
/// Filters a single Gamma character through an <c>HTMLStripCharFilter</c> constructed with the
/// tag set { "reserved" } and asserts that the output equals "\u0393".
/// </summary>
public void TestGamma()
{
    // NOTE(review): as written, the input literal is the Gamma character itself rather than
    // an entity reference; confirm this is the intended test data.
    const string input = "Γ";
    const string expected = "\u0393";

    var escapedTags = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)), escapedTags);
    var output = new StringBuilder();

    // Read one character at a time until end of input (-1).
    for (var next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    var actual = output.ToString();
    Assert.IsTrue(actual.Equals(expected), actual + " is not equal to " + expected + "<EOS>");
}
/// <summary>
/// Filters a short text through an <c>HTMLStripCharFilter</c> constructed with the tag set
/// { "reserved" } and asserts that the output equals the expected literal.
/// </summary>
public void TestEntities()
{
    // NOTE(review): as written, the input literal contains no '&' entity references;
    // confirm this is the intended test data.
    const string input = " <foo> Übermensch = Γ bar Γ";
    const string expected = " <foo> \u00DCbermensch = \u0393 bar \u0393";

    var escapedTags = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)), escapedTags);
    var output = new StringBuilder();

    // Read one character at a time until end of input (-1).
    for (var next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    var actual = output.ToString();
    Assert.IsTrue(actual.Equals(expected), actual + " is not equal to " + expected + "<EOS>");
}
/// <summary>
/// Filters a short text through an <c>HTMLStripCharFilter</c> constructed with the tag set
/// { "reserved" } and asserts that the output equals the expected literal.
/// </summary>
public void TestMoreEntities()
{
    // NOTE(review): as written, the input and expected literals are identical and contain
    // no '&' entity references; confirm this is the intended test data.
    const string input = " <junk/> ! @ and ’";
    const string expected = " <junk/> ! @ and ’";

    var escapedTags = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)), escapedTags);
    var output = new StringBuilder();

    // Read one character at a time until end of input (-1).
    for (var next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    var actual = output.ToString();
    Assert.IsTrue(actual.Equals(expected), actual + " is not equal to " + expected);
}
/// <summary>
/// Filters an HTML comment that opens and closes with three dashes, followed by a space,
/// and asserts that the output of the <c>HTMLStripCharFilter</c> equals the expected literal.
/// </summary>
public void TestComment()
{
    const string input = "<!--- three dashes, still a valid comment ---> ";
    const string expected = " ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
    var output = new StringBuilder();

    // Read one character at a time until end of input (-1).
    for (var next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    assertTrue(output.ToString() + " is not equal to " + expected + "<EOS>", output.ToString().Equals(expected));
}
/// <summary>
/// Filters markup containing &lt;reserved&gt; and &lt;other&gt; tags through an
/// <c>HTMLStripCharFilter</c> constructed with the tag set { "reserved" }, then asserts that
/// "reserved" occurs in the output at indexes 9, 38 and 54 and that "other" does not occur.
/// </summary>
public virtual void TestReserved()
{
    string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
    ISet<string> set = new HashSet<string>();
    set.Add("reserved");

    StringBuilder builder = new StringBuilder();
    using (TextReader reader = new HTMLStripCharFilter(new StringReader(test), set))
    {
        int ch;

        // -1 marks end of input; "> 0" would also stop on a U+0000 character.
        while ((ch = reader.Read()) != -1)
        {
            builder.Append((char)ch);
        }
    }

    string result = builder.ToString();

    // Each position is looked up once and reused for both the message and the check.
    int first = result.IndexOf("reserved", StringComparison.Ordinal);
    int second = result.IndexOf("reserved", 15, StringComparison.Ordinal);
    int third = result.IndexOf("reserved", 41, StringComparison.Ordinal);

    assertTrue("Escaped tag not preserved: " + first, first == 9);
    assertTrue("Escaped tag not preserved: " + second, second == 38);
    assertTrue("Escaped tag not preserved: " + third, third == 54);
    assertTrue("Other tag should be removed", result.IndexOf("other", StringComparison.Ordinal) == -1);
}
/// <summary>
/// Filters markup containing &lt;reserved&gt; and &lt;other&gt; tags through an
/// <c>HTMLStripCharFilter</c> constructed with the tag set { "reserved" }, then asserts that
/// "reserved" occurs in the output at indexes 9, 38 and 54 and that "other" does not occur.
/// </summary>
public void TestReserved()
{
    const string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
    var set = new HashSet<String> { "reserved" };
    var reader = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), set);
    var builder = new StringBuilder();

    int ch;
    while ((ch = reader.Read()) != -1)
    {
        builder.Append((char)ch);
    }

    var result = builder.ToString();

    // Ordinal search: IndexOf(string) without a StringComparison is culture-sensitive,
    // which is not wanted when asserting exact character positions.
    int first = result.IndexOf("reserved", StringComparison.Ordinal);
    int second = result.IndexOf("reserved", 15, StringComparison.Ordinal);
    int third = result.IndexOf("reserved", 41, StringComparison.Ordinal);

    assertTrue("Escaped tag not preserved: " + first, first == 9);
    assertTrue("Escaped tag not preserved: " + second, second == 38);
    assertTrue("Escaped tag not preserved: " + third, third == 54);
    assertTrue("Other tag should be removed", result.IndexOf("other", StringComparison.Ordinal) == -1);
}
/// <summary>
/// Filters Hebrew text mixed with HTML tags and a comment through an
/// <c>HTMLStripCharFilter</c>, comparing the output with the expected text character by
/// character and then as a whole. Finally runs <c>doTestOffsets</c> on a Hebrew string
/// containing 'X' markers.
/// </summary>
public void TestHebrewScenarios()
{
    const string html = "<div class=\"foo\">בדיקה ראשונה</div> וכאן נוסיף גם <a href=\"#bar\">לינק</a> ועכשיו " +
                        "גם <a alt=\"לינק מסובך עם תיאור\" href=\"http://lucene.apache.org/\">לינק מסובך יותר</a>. " +
                        " <!-- הערה אחת ויחידה -->";
    const string gold = " בדיקה ראשונה וכאן נוסיף גם לינק ועכשיו " +
                        "גם לינק מסובך יותר . ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
    var output = new StringBuilder();
    var index = 0;

    // Compare each emitted character with the expected one at the same position so that
    // a failure reports where the two texts diverge.
    for (var next = filter.Read(); next != -1; next = filter.Read(), index++)
    {
        var current = (char)next;
        output.Append(current);
        Assert.IsTrue(current == gold[index], "\"" + current + "\"" + " at position: " + index + " does not equal: \"" + gold[index] + "\". Buffer so far: " + output + "<EOB>");
    }

    Assert.AreEqual(gold, output.ToString());

    doTestOffsets("שלום X מה X שלומך חבר");
}
/// <summary>
/// Filters a snippet with tags, links and a comment through an <c>HTMLStripCharFilter</c>,
/// comparing the output with the expected text character by character and then as a whole.
/// </summary>
public void Test()
{
    const string html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
                        "another <a href=\"http://lucene.apache.org/\">link</a>. " +
                        "This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
    const string gold = " this is some text here is a link and " +
                        "another link . " +
                        "This is an entity: & plus a <. Here is an &. ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
    var output = new StringBuilder();
    var index = 0;

    // Compare each emitted character with the expected one at the same position so that
    // a failure reports where the two texts diverge.
    for (var next = filter.Read(); next != -1; next = filter.Read(), index++)
    {
        var current = (char)next;
        output.Append(current);
        Assert.IsTrue(current == gold[index], "\"" + current + "\"" + " at position: " + index + " does not equal: \"" + gold[index] + "\". Buffer so far: " + output + "<EOB>");
    }

    Assert.AreEqual(gold, output.ToString());
}
/// <summary>
/// Builds a random, space-separated document of 10 to 10000 words and reads it to the end
/// through an <c>HTMLStripCharFilter</c>. The test makes no assertions; it only requires
/// that the filter consumes the text without throwing.
/// </summary>
public virtual void TestRandomText()
{
    const int minNumWords = 10;
    const int maxNumWords = 10000;
    const int minWordLength = 3;
    const int maxWordLength = 20;

    StringBuilder text = new StringBuilder();
    int numWords = TestUtil.NextInt(Random(), minNumWords, maxNumWords);

    // The kind of word is chosen once per document; the per-word calls on Random() happen
    // in the same order as with one loop per kind.
    int wordKind = TestUtil.NextInt(Random(), 0, 4);
    for (int wordNum = 0; wordNum < numWords; ++wordNum)
    {
        switch (wordKind)
        {
            case 0:
                text.Append(TestUtil.RandomUnicodeString(Random(), maxWordLength));
                break;
            case 1:
                text.Append(TestUtil.RandomRealisticUnicodeString(Random(), minWordLength, maxWordLength));
                break;
            default:
                // every other value: simple strings
                text.Append(TestUtil.RandomSimpleString(Random()));
                break;
        }
        text.Append(' ');
    }

    using (TextReader reader = new HTMLStripCharFilter(new StringReader(text.ToString())))
    {
        // Drain to the real end of input (-1). A "> 0" test would stop at the first
        // U+0000 in the random text and leave the rest of the document unread.
        while (reader.Read() != -1)
        {
        }
    }
}
/// <summary>
/// Filters markup containing &lt;reserved&gt; and &lt;other&gt; tags through an
/// <c>HTMLStripCharFilter</c> constructed with the tag set { "reserved" }, then asserts that
/// "reserved" occurs in the output at indexes 9, 38 and 54 and that "other" does not occur.
/// </summary>
public virtual void TestReserved()
{
    string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
    ISet<string> set = new HashSet<string>();
    set.Add("reserved");

    StringBuilder builder = new StringBuilder();
    using (TextReader reader = new HTMLStripCharFilter(new StringReader(test), set))
    {
        int ch;

        // -1 marks end of input; "> 0" would also stop on a U+0000 character.
        while ((ch = reader.Read()) != -1)
        {
            builder.Append((char)ch);
        }
    }

    string result = builder.ToString();

    // Each position is looked up once and reused for both the message and the check.
    int first = result.IndexOf("reserved", StringComparison.Ordinal);
    int second = result.IndexOf("reserved", 15, StringComparison.Ordinal);
    int third = result.IndexOf("reserved", 41, StringComparison.Ordinal);

    assertTrue("Escaped tag not preserved: " + first, first == 9);
    assertTrue("Escaped tag not preserved: " + second, second == 38);
    assertTrue("Escaped tag not preserved: " + third, third == 54);
    assertTrue("Other tag should be removed", result.IndexOf("other", StringComparison.Ordinal) == -1);
}