/// <summary>
/// Tokenizes input that is read through an <c>HTMLStripCharFilter</c> and checks, for each
/// of the four tokens, that the token offset mapped back through <c>CorrectOffset</c> and
/// the token's <c>LengthInSource</c> match the positions in the original, unstripped string.
/// </summary>
public void IncrementsOffsetCorrectlyWithAnotherReader2()
{
    const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

    CharFilter filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
    Tokenizer t = new Tokenizer(filter);

    // Only required to satisfy the out parameter; the token text itself is not asserted.
    string token;

    // "test1" sits at the very start of the input.
    t.NextToken(out token);
    Assert.Equal(0, filter.CorrectOffset(t.Offset));
    Assert.Equal(5, t.LengthInSource);

    // "testlink" follows "test1 " (6 chars) and the 14-char opening anchor tag.
    t.NextToken(out token);
    Assert.Equal(20, filter.CorrectOffset(t.Offset));
    Assert.Equal(8, t.LengthInSource);

    // "test2" comes after "testlink" (ends at 28), the 4-char closing tag and one space.
    t.NextToken(out token);
    Assert.Equal(33, filter.CorrectOffset(t.Offset));
    Assert.Equal(5, t.LengthInSource);

    // "test3" is the last word, one space after "test2".
    t.NextToken(out token);
    Assert.Equal(39, filter.CorrectOffset(t.Offset));
    Assert.Equal(5, t.LengthInSource);
}
/// <summary>
/// Reads the one-character sample through an <c>HTMLStripCharFilter</c> constructed with a
/// tag set containing "reserved" and checks that the collected output equals U+0393.
/// </summary>
public void TestGamma()
{
    // NOTE(review): the input literal is already the Greek capital gamma itself; it may
    // originally have been an HTML entity - confirm against the upstream test.
    const string test = "Γ";
    const string gold = "\u0393";

    var tagSet = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagSet);

    // Drain the filter one character at a time until it reports end of input (-1).
    var output = new StringBuilder();
    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    string result = output.ToString();
    Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold + "<EOS>");
}
/// <summary>
/// Runs the "htmlStripReaderTest.html" test file through an <c>HTMLStripCharFilter</c> and
/// checks the collected text: it must not contain "&lt;", must not contain "forrest" or
/// "Forrest", and after trimming must start with "Welcome to Solr" and end with "Foundation.".
/// </summary>
public void TestHTML()
{
    string str;

    // The StreamReader owns a file handle, so dispose it deterministically once the
    // filter has been drained instead of leaving it to the finalizer.
    using (var fileReader = new StreamReader(GetTestFile("htmlStripReaderTest.html")))
    {
        var reader = new HTMLStripCharFilter(CharReader.Get(fileReader));
        var builder = new StringBuilder();

        int ch;
        while ((ch = reader.Read()) != -1)
        {
            builder.Append((char)ch);
        }

        str = builder.ToString();
    }

    // Ordinal comparisons: these are exact-text checks, not linguistic ones.
    // There is one > in the text, so only "<" is checked here.
    Assert.IsTrue(str.IndexOf("<", System.StringComparison.Ordinal) == -1, "Entity not properly escaped");
    Assert.IsTrue(
        str.IndexOf("forrest", System.StringComparison.Ordinal) == -1
            && str.IndexOf("Forrest", System.StringComparison.Ordinal) == -1,
        "Forrest should have been stripped out");

    var trimmed = str.Trim();
    Assert.IsTrue(trimmed.StartsWith("Welcome to Solr", System.StringComparison.Ordinal), "File should start with 'Welcome to Solr' after trimming");
    Assert.IsTrue(trimmed.EndsWith("Foundation.", System.StringComparison.Ordinal), "File should end with 'Foundation.' after trimming");
}
/// <summary>
/// Reads a mixed HTML sample (tags, links, entity text and a comment) through an
/// <c>HTMLStripCharFilter</c>, comparing every character against the expected text as it
/// is produced, and finally compares the complete output with the expected string.
/// </summary>
public void Test()
{
    const string html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
                        "another <a href=\"http://lucene.apache.org/\">link</a>. " +
                        "This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
    const string gold = " this is some text here is a link and " +
                        "another link . " +
                        "This is an entity: & plus a <. Here is an &. ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
    var seen = new StringBuilder();
    var expectedChars = gold.ToCharArray();
    var index = 0;

    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        var actual = (char)next;
        seen.Append(actual);

        // Compare character by character so a mismatch is reported with its position
        // and everything read up to that point.
        var expected = expectedChars[index];
        var failure = "\"" + actual + "\"" + " at position: " + index
                      + " does not equal: \"" + expected
                      + "\". Buffer so far: " + seen + "<EOB>";
        Assert.IsTrue(actual == expected, failure);

        index++;
    }

    Assert.AreEqual(gold, seen.ToString());
}
/// <summary>
/// Reads a sample containing angle-bracketed text, an umlaut and Greek capital gammas
/// through an <c>HTMLStripCharFilter</c> constructed with a tag set containing "reserved",
/// and checks that the collected output equals the expected string.
/// </summary>
public void TestEntities()
{
    // NOTE(review): the input already contains literal characters rather than HTML
    // entities; it may have been decoded at some point - confirm against upstream.
    const string test = " <foo> Übermensch = Γ bar Γ";
    const string gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";

    var tagSet = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagSet);

    // Drain the filter until it reports end of input (-1).
    var output = new StringBuilder();
    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    string result = output.ToString();
    Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold + "<EOS>");
}
/// <summary>
/// Reads a Hebrew HTML sample (tags, attributes with Hebrew values and a comment) through
/// an <c>HTMLStripCharFilter</c>, comparing each produced character with the expected text,
/// then compares the whole output and finally runs <c>doTestOffsets</c> on a Hebrew string
/// containing 'X' markers.
/// </summary>
public void TestHebrewScenarios()
{
    const string html = "<div class=\"foo\">בדיקה ראשונה</div> וכאן נוסיף גם <a href=\"#bar\">לינק</a> ועכשיו " +
                        "גם <a alt=\"לינק מסובך עם תיאור\" href=\"http://lucene.apache.org/\">לינק מסובך יותר</a>. " +
                        " <!-- הערה אחת ויחידה -->";
    const string gold = " בדיקה ראשונה וכאן נוסיף גם לינק ועכשיו " +
                        "גם לינק מסובך יותר . ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
    var seen = new StringBuilder();
    var expectedChars = gold.ToCharArray();
    var index = 0;

    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        var actual = (char)next;
        seen.Append(actual);

        // Report the first mismatch together with its position and the text read so far.
        var expected = expectedChars[index];
        var failure = "\"" + actual + "\"" + " at position: " + index
                      + " does not equal: \"" + expected
                      + "\". Buffer so far: " + seen + "<EOB>";
        Assert.IsTrue(actual == expected, failure);

        index++;
    }

    Assert.AreEqual(gold, seen.ToString());

    doTestOffsets("שלום X מה X שלומך חבר");
}
/// <summary>
/// Reads <paramref name="input"/> through an <c>HTMLStripCharFilter</c> and, for every 'X'
/// character that comes out, asserts that the corrected offset of the current read position
/// equals the index of the corresponding 'X' in the original string.
/// </summary>
/// <param name="input">Text to run through the filter; 'X' characters act as markers.</param>
public void doTestOffsets(String input)
{
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));

    int markerIndex = -1; // index of the most recently matched 'X' in the original string
    int next;

    // readerOffset counts characters delivered by the filter so far.
    for (int readerOffset = 0; (next = filter.Read()) != -1; readerOffset++)
    {
        var corrected = filter.CorrectOffset(readerOffset);
        if (next != 'X')
        {
            continue;
        }

        // Advance to the next 'X' in the source, searching past the previous one.
        markerIndex = input.IndexOf('X', markerIndex + 1);
        assertEquals(markerIndex, corrected);
    }
}
/// <summary>
/// Reads an HTML comment opened and closed with three dashes, followed by one space,
/// through an <c>HTMLStripCharFilter</c> and checks that the output is a single space.
/// </summary>
public void TestComment()
{
    const string test = "<!--- three dashes, still a valid comment ---> ";
    const string gold = " ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));

    // Drain the filter until it reports end of input (-1).
    var output = new StringBuilder();
    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    string failureMessage = output.ToString() + " is not equal to " + gold + "<EOS>";
    bool matches = output.ToString().Equals(gold);
    assertTrue(failureMessage, matches);
}
/// <summary>
/// Reads <paramref name="test"/> through an <c>HTMLStripCharFilter</c> and asserts that the
/// filter's output is equal to the input string itself.
/// </summary>
/// <param name="test">Text that is fed to the filter and expected back unchanged.</param>
/// <param name="assertMsg">Message passed to the assertion for reporting on failure.</param>
private static void processBuffer(String test, String assertMsg)
{
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
    var output = new StringBuilder();

    // Drain the filter until it reports end of input (-1).
    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    Assert.AreEqual(test, output.ToString(), assertMsg);
}
/// <summary>
/// Reads a sample with broken and nested tag fragments through an
/// <c>HTMLStripCharFilter</c> and checks that the collected output equals the expected
/// string.
/// </summary>
public void TestMalformedHTML()
{
    const string test = "a <a hr<ef=aa<a>> </close</a>";
    const string gold = "a <a hr<ef=aa > </close ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));

    // Drain the filter until it reports end of input (-1).
    var output = new StringBuilder();
    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    string result = output.ToString();
    string failureMessage = result + " is not equal to " + gold + "<EOS>";
    assertTrue(failureMessage, result.Equals(gold));
}
/// <summary>
/// Reads a sample containing &lt;reserved&gt; and &lt;other&gt; tags through an
/// <c>HTMLStripCharFilter</c> constructed with a tag set containing "reserved". Checks that
/// "reserved" is found in the output at indexes 9, 38 and 54 and that "other" does not
/// appear at all.
/// </summary>
public void TestReserved()
{
    const string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";

    var tagSet = new HashSet<String> {"reserved"};
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagSet);

    // Drain the filter until it reports end of input (-1).
    var output = new StringBuilder();
    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    string result = output.ToString();

    // Each lookup is done right before its own assertion so that an earlier failure is
    // reported before a later search is even attempted.
    int first = result.IndexOf("reserved");
    assertTrue("Escaped tag not preserved: " + first, first == 9);

    int second = result.IndexOf("reserved", 15);
    assertTrue("Escaped tag not preserved: " + second, second == 38);

    int third = result.IndexOf("reserved", 41);
    assertTrue("Escaped tag not preserved: " + third, third == 54);

    assertTrue("Other tag should be removed", result.IndexOf("other") == -1);
}
/// <summary>
/// Reads a sample containing an angle-bracketed pseudo tag, punctuation and a typographic
/// apostrophe through an <c>HTMLStripCharFilter</c> constructed with a tag set containing
/// "reserved", and checks that the collected output equals the expected string.
/// </summary>
public void TestMoreEntities()
{
    // NOTE(review): input and expected text are identical literals here; the input may
    // originally have used HTML entities - confirm against the upstream test.
    const string test = " <junk/> ! @ and ’";
    const string gold = " <junk/> ! @ and ’";

    var tagSet = new HashSet<String> {"reserved"};
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagSet);

    // Drain the filter until it reports end of input (-1).
    var output = new StringBuilder();
    for (int next = filter.Read(); next != -1; next = filter.Read())
    {
        output.Append((char)next);
    }

    string result = output.ToString();
    Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold);
}
/// <summary>
/// Wraps <paramref name="reader"/> in an <c>HTMLStripCharFilter</c> and passes the wrapped
/// reader to the base class's <c>TokenStream</c>.
/// </summary>
/// <param name="fieldName">Field name, forwarded unchanged to the base implementation.</param>
/// <param name="reader">Source text reader to be wrapped by the filter.</param>
/// <returns>The token stream returned by the base implementation for the wrapped reader.</returns>
public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    return base.TokenStream(fieldName, new HTMLStripCharFilter(CharReader.Get(reader)));
}
/// <summary>
/// Wraps <paramref name="reader"/> in an <c>HTMLStripCharFilter</c> and passes the wrapped
/// reader to the base class's <c>TokenStream</c>.
/// </summary>
/// <param name="fieldName">Field name, forwarded unchanged to the base implementation.</param>
/// <param name="reader">Source text reader to be wrapped by the filter.</param>
/// <returns>The token stream returned by the base implementation for the wrapped reader.</returns>
public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
{
    // NOTE(review): this delegates to base.TokenStream rather than base.ReusableTokenStream,
    // so presumably nothing is reused between calls - confirm that this is intentional.
    return base.TokenStream(fieldName, new HTMLStripCharFilter(CharReader.Get(reader)));
}