/// <summary>
/// Checks that a <c>WhitespaceTokenizer</c> reading through an <c>HTMLStripCharFilter</c>
/// reports token offsets that are positions in the original, un-stripped input string.
/// </summary>
public void TestOffsetsWithTokenizer()
{
    const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";
    Tokenizer t = new WhitespaceTokenizer(new HTMLStripCharFilter(CharReader.Get(new StringReader(input))));
    OffsetAttribute att = (OffsetAttribute)t.GetAttribute(typeof(OffsetAttribute));

    // Start offset and length of "test1", "testlink", "test2" and "test3" within `input`.
    int[] expectedStarts = { 0, 20, 33, 39 };
    int[] expectedLengths = { 5, 8, 5, 5 };

    for (int i = 0; i < expectedStarts.Length; i++)
    {
        // Fail loudly if the stream ends early instead of re-checking stale attribute values.
        Assert.IsTrue(t.IncrementToken(), "Token stream ended before token #" + i);
        Assert.AreEqual(expectedStarts[i], att.StartOffset());
        Assert.AreEqual(expectedLengths[i], att.EndOffset() - att.StartOffset());
    }
}
/// <summary>
/// Checks that offsets reported by the tokenizer, once passed through
/// <c>filter.CorrectOffset</c>, point at the token positions in the original HTML input,
/// and that <c>LengthInSource</c> matches each token's length.
/// </summary>
public void IncrementsOffsetCorrectlyWithAnotherReader2()
{
    const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";
    CharFilter filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
    Tokenizer t = new Tokenizer(filter);
    string token;

    // Start offset and length of "test1", "testlink", "test2" and "test3" within `input`.
    int[] expectedOffsets = { 0, 20, 33, 39 };
    int[] expectedLengths = { 5, 8, 5, 5 };

    for (int i = 0; i < expectedOffsets.Length; i++)
    {
        t.NextToken(out token);
        Assert.Equal(expectedOffsets[i], filter.CorrectOffset(t.Offset));
        Assert.Equal(expectedLengths[i], t.LengthInSource);
    }
}
/// <summary>
/// Runs the test file <c>htmlStripReaderTest.html</c> through an <c>HTMLStripCharFilter</c>
/// and checks a few properties of the stripped text: no <c>&lt;</c> remains, the word
/// "forrest"/"Forrest" is gone, and the trimmed text starts and ends with known phrases.
/// </summary>
public void TestHTML()
{
    var builder = new StringBuilder();

    // Dispose the file-backed reader deterministically so the test does not leak a file handle.
    using (var file = new StreamReader(GetTestFile("htmlStripReaderTest.html")))
    {
        var reader = new HTMLStripCharFilter(CharReader.Get(file));
        int ch;
        while ((ch = reader.Read()) != -1)
        {
            builder.Append((char)ch);
        }
    }

    var str = builder.ToString();
    Assert.IsTrue(str.IndexOf("<") == -1, "Entity not properly escaped"); // there is one > in the text
    Assert.IsTrue(str.IndexOf("forrest") == -1 && str.IndexOf("Forrest") == -1, "Forrest should have been stripped out");
    Assert.IsTrue(str.Trim().StartsWith("Welcome to Solr"), "File should start with 'Welcome to Solr' after trimming");
    Assert.IsTrue(str.Trim().EndsWith("Foundation."), "File should end with 'Foundation.' after trimming");
}
/// <summary>
/// Feeds a malformed HTML fragment through an <c>HTMLStripCharFilter</c> and compares the
/// characters it yields with the expected ("gold") string.
/// </summary>
public void TestMalformedHTML()
{
    const string test = "a <a hr<ef=aa<a>> </close</a>";
    const string gold = "a <a hr<ef=aa > </close ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
    var stripped = new StringBuilder();

    // Drain the filter one character at a time until it signals end of input with -1.
    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        stripped.Append((char)c);
    }

    var result = stripped.ToString();
    assertTrue(result + " is not equal to " + gold + "<EOS>", result.Equals(gold));
}
/// <summary>
/// Reads <paramref name="input"/> through an <c>HTMLStripCharFilter</c> and, for every
/// <c>'X'</c> the filter yields, asserts that the corrected offset of the current read
/// position equals the index of the next <c>'X'</c> in the original string.
/// </summary>
/// <param name="input">Text containing <c>'X'</c> marker characters to check.</param>
public void doTestOffsets(String input)
{
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
    int markerIndex = -1; // index of the most recently matched 'X' in the original string
    int c;

    // readPos counts characters consumed from the filter so far.
    for (int readPos = 0; (c = filter.Read()) != -1; readPos++)
    {
        var corrected = filter.CorrectOffset(readPos);
        if (c != 'X')
        {
            continue;
        }

        markerIndex = input.IndexOf('X', markerIndex + 1);
        assertEquals(markerIndex, corrected);
    }
}
/// <summary>
/// Reads <paramref name="test"/> through an <c>HTMLStripCharFilter</c> and asserts that the
/// filter's output is equal to the input.
/// </summary>
/// <param name="test">Text to pass through the filter.</param>
/// <param name="assertMsg">Message reported when output and input differ.</param>
private static void processBuffer(String test, String assertMsg)
{
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
    var output = new StringBuilder();

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        output.Append((char)c);
    }

    Assert.AreEqual(test, output.ToString(), assertMsg);
}
/// <summary>
/// Passes a single Greek capital gamma through an <c>HTMLStripCharFilter</c> (constructed
/// with a tag set containing "reserved") and expects U+0393 in the output.
/// </summary>
public void TestGamma()
{
    // NOTE(review): the input literal appears to be the already-decoded character, the same
    // as gold, rather than an entity reference - confirm this is what the test intends.
    const string test = "Γ";
    const string gold = "\u0393";

    var tags = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tags);
    var output = new StringBuilder();

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        output.Append((char)c);
    }

    var result = output.ToString();
    bool matches = result.Equals(gold);
    Assert.IsTrue(matches, result + " is not equal to " + gold + "<EOS>");
}
/// <summary>
/// Passes a short text with punctuation characters through an <c>HTMLStripCharFilter</c>
/// (constructed with a tag set containing "reserved") and compares the output with gold.
/// </summary>
public void TestMoreEntities()
{
    // NOTE(review): input and gold are identical here and contain no entity references -
    // confirm the literals match what the test is meant to exercise.
    const string test = " <junk/> ! @ and ’";
    const string gold = " <junk/> ! @ and ’";

    var tags = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tags);
    var output = new StringBuilder();

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        output.Append((char)c);
    }

    var result = output.ToString();
    bool matches = result.Equals(gold);
    Assert.IsTrue(matches, result + " is not equal to " + gold);
}
/// <summary>
/// Checks that an HTML comment opened with three dashes, followed by one space, is reduced
/// to a single space by <c>HTMLStripCharFilter</c>.
/// </summary>
public void TestComment()
{
    const string test = "<!--- three dashes, still a valid comment ---> ";
    const string gold = " ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));
    var output = new StringBuilder();

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        output.Append((char)c);
    }

    var result = output.ToString();
    assertTrue(result + " is not equal to " + gold + "<EOS>", result.Equals(gold));
}
/// <summary>
/// Passes text containing non-ASCII characters through an <c>HTMLStripCharFilter</c>
/// (constructed with a tag set containing "reserved") and compares the output with gold.
/// </summary>
public void TestEntities()
{
    // NOTE(review): the input contains literal characters (and a literal "<foo>") rather
    // than entity references - confirm the literals match what the test intends.
    const string test = " <foo> Übermensch = Γ bar Γ";
    const string gold = " <foo> \u00DCbermensch = \u0393 bar \u0393";

    var tags = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tags);
    var output = new StringBuilder();

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        output.Append((char)c);
    }

    var result = output.ToString();
    bool matches = result.Equals(gold);
    Assert.IsTrue(matches, result + " is not equal to " + gold + "<EOS>");
}
/// <summary>
/// Checks that tags named in the set passed to <c>HTMLStripCharFilter</c> ("reserved")
/// stay in the output at the expected positions while another tag ("other") is removed.
/// </summary>
public void TestReserved()
{
    const string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";

    var tags = new HashSet<String> { "reserved" };
    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tags);
    var output = new StringBuilder();

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        output.Append((char)c);
    }

    var result = output.ToString();

    // Each lookup is done right before its own assertion so an earlier failure is reported first.
    int openingTag = result.IndexOf("reserved");
    assertTrue("Escaped tag not preserved: " + openingTag, openingTag == 9);

    int closingTag = result.IndexOf("reserved", 15);
    assertTrue("Escaped tag not preserved: " + closingTag, closingTag == 38);

    int selfClosingTag = result.IndexOf("reserved", 41);
    assertTrue("Escaped tag not preserved: " + selfClosingTag, selfClosingTag == 54);

    assertTrue("Other tag should be removed", result.IndexOf("other") == -1);
}
/// <summary>
/// Tokenizes HTML-stripped text and checks that every token has length 4 in the source and
/// starts at the expected (uncorrected) offset, and that exactly the expected number of
/// tokens is produced.
/// </summary>
public void IncrementsOffsetCorrectlyWithAnotherReader()
{
    int[] expectedOffsets = { 0, 5, 10, 15 };
    int curPos = 0;
    string token;

    Tokenizer t = new Tokenizer(
        new HTMLStripCharFilter(CharReader.Get(new System.IO.StringReader(@"test<a href=""foo"">test</a>test test"))));

    while (true)
    {
        Tokenizer.TokenType tokenType = t.NextToken(out token);
        if (tokenType == 0)
        {
            // The loop treats a zero token type as "no more tokens".
            break;
        }

        Assert.Equal(expectedOffsets[curPos++], t.Offset);
        Assert.Equal(4, t.LengthInSource);
    }

    // Without this check a tokenizer that yields fewer tokens (or none) would pass vacuously.
    Assert.Equal(expectedOffsets.Length, curPos);
}
/// <summary>
/// Strips an HTML fragment containing Hebrew text and compares the filter output with the
/// expected string character by character, then runs <c>doTestOffsets</c> on a Hebrew
/// sentence containing <c>'X'</c> markers.
/// </summary>
public void TestHebrewScenarios()
{
    const string html = "<div class=\"foo\">בדיקה ראשונה</div> וכאן נוסיף גם <a href=\"#bar\">לינק</a> ועכשיו " +
                        "גם <a alt=\"לינק מסובך עם תיאור\" href=\"http://lucene.apache.org/\">לינק מסובך יותר</a>. " +
                        " <!-- הערה אחת ויחידה -->";
    const string gold = " בדיקה ראשונה וכאן נוסיף גם לינק ועכשיו " +
                        "גם לינק מסובך יותר . ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
    var seen = new StringBuilder();
    var position = 0;

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        char actual = (char)c;
        seen.Append(actual);

        // Compare as we go so a failure reports the first mismatching position.
        char expected = gold[position];
        Assert.IsTrue(actual == expected,
            "\"" + actual + "\"" + " at position: " + position + " does not equal: \"" + expected
            + "\". Buffer so far: " + seen + "<EOB>");
        position++;
    }

    Assert.AreEqual(gold, seen.ToString());
    doTestOffsets("שלום X מה X שלומך חבר");
}
/// <summary>
/// Strips an HTML fragment containing tags and a comment, and compares the filter output
/// with the expected string character by character and then as a whole.
/// </summary>
public void Test()
{
    const string html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
                        "another <a href=\"http://lucene.apache.org/\">link</a>. " +
                        "This is an entity: & plus a <. Here is an &. <!-- is a comment -->";
    const string gold = " this is some text here is a link and " +
                        "another link . " +
                        "This is an entity: & plus a <. Here is an &. ";

    var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
    var seen = new StringBuilder();
    var position = 0;

    for (int c = filter.Read(); c != -1; c = filter.Read())
    {
        char actual = (char)c;
        seen.Append(actual);

        // Compare as we go so a failure reports the first mismatching position.
        char expected = gold[position];
        Assert.IsTrue(actual == expected,
            "\"" + actual + "\"" + " at position: " + position + " does not equal: \"" + expected
            + "\". Buffer so far: " + seen + "<EOB>");
        position++;
    }

    Assert.AreEqual(gold, seen.ToString());
}
/// <summary>
/// Easy-use constructor that takes a <see cref="System.IO.TextReader"/>; the reader is
/// wrapped with <c>CharReader.Get</c> before being handed to the base class.
/// </summary>
/// <param name="normMap">Normalization map stored on this filter.</param>
/// <param name="in_Renamed">Reader supplying the characters to filter.</param>
public MappingCharFilter(NormalizeCharMap normMap, System.IO.TextReader in_Renamed)
    : base(CharReader.Get(in_Renamed))
{
    this.normMap = normMap;
}
/// <summary>
/// Wraps <paramref name="reader"/> in an <c>HTMLStripCharFilter</c> and delegates to the
/// base implementation, so the base token stream reads the filtered characters.
/// </summary>
/// <param name="fieldName">Field name, passed through to the base implementation.</param>
/// <param name="reader">Source text reader.</param>
/// <returns>The token stream returned by the base implementation.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    return base.TokenStream(fieldName, new HTMLStripCharFilter(CharReader.Get(reader)));
}
/// <summary>
/// Processes the current line of <paramref name="document"/>: scans it character by
/// character to update the comment / string / bracket-block tracking state, then either
/// rewrites the line's leading whitespace or, for a read-only document, records the
/// indentation the line already has.
/// </summary>
/// <param name="document">Accessor for the line being processed (text, line number, read-only flag).</param>
/// <param name="settings">Indentation settings (indent string, whether empty lines are left alone).</param>
public void Step(IDocumentAccessor document, IndentationSettings settings)
{
    var line = document.Text;
    if (settings.LeaveEmptyLines && line.Length == 0)
    {
        return;
    }

    line = line.TrimStart();
    var indent = new StringBuilder();

    // Whitespace-only line: give it the indentation of the current block and stop.
    if (line.Length == 0)
    {
        if (this._blockComment)
        {
            return;
        }

        indent.Append(this._block.InnerIndent);
        indent.Append(settings.IndentString.Repeat(this._block.OneLineBlock));
        if (this._block.Continuation)
        {
            indent.Append(settings.IndentString);
        }

        if (document.Text != indent.ToString())
        {
            document.Text = indent.ToString();
        }

        return;
    }

    if (document.TrimEnd())
    {
        line = document.Text.TrimStart();
    }

    // Snapshot the state at the start of the line; the indentation is computed from it below.
    var oldBlock = this._block;
    var startInComment = this._blockComment;

    this._lineComment = false;
    this._escape = false;
    this._lastNonCommentChar = '\n';

    var reader = new CharReader(line);
    var cha = ' ';
    var prev = '\0';
    var next = '\n';
    var indented = false;

    while (reader.IsRemainChar)
    {
        cha = reader.Get();
        prev = reader.Backward;
        next = reader.Ahead;

        if (this._lineComment)
        {
            break;
        }

        // The character following a backslash inside a string is skipped entirely.
        if (this._escape)
        {
            this._escape = false;
            continue;
        }

        switch (cha)
        {
            case '/':
                if (this._blockComment && prev == '*')
                {
                    this._blockComment = false;
                }

                if (!this._inString)
                {
                    if (!this._blockComment && next == '/')
                    {
                        this._lineComment = true;
                    }

                    if (!this._lineComment && next == '*')
                    {
                        this._blockComment = true;
                    }
                }

                break;

            case '"':
                // Escaped characters never reach this switch (they are skipped above), so a
                // quote seen outside a comment always opens or closes a string. The previous
                // code assigned `!_escape` (always true here) only when already inside a
                // string, which made this case a no-op.
                if (!(this._lineComment || this._blockComment))
                {
                    this._inString = !this._inString;
                }

                break;

            case '\\':
                if (this._inString)
                {
                    this._escape = true;
                }

                break;

            default:
                break;
        }

        // Inside a comment or string: flush any pending word and ignore the character.
        if (this._lineComment || this._blockComment || this._inString)
        {
            if (this._wordBuilder.Length > 0)
            {
                this._block.LastLiteral = this._wordBuilder.ToString();
            }

            this._wordBuilder.Length = 0;
            continue;
        }

        if (char.IsLetterOrDigit(cha))
        {
            this._wordBuilder.Append(cha);
        }
        else
        {
            if (this._wordBuilder.Length > 0)
            {
                this._block.LastLiteral = this._wordBuilder.ToString();
            }

            this._wordBuilder.Length = 0;
        }

        switch (cha)
        {
            case '(':
            case '{':
            case '[':
                // Opening bracket: save the current block and start a nested one.
                this._block.ResetOneLineBlock();
                this._blocks.Push(this._block);
                this._block.StartLine = document.LineNumber;
                if (!indented)
                {
                    // Only the first opening bracket on a line increases the indentation.
                    this._block.Indent(settings);
                    indented = true;
                }

                this._block.Bracket = cha;
                break;

            case ')':
            case '}':
            case ']':
                // Closing bracket: unwind to the block opened by the matching bracket, then
                // restore the block that enclosed it.
                var openBracket = StringChecker.GetOpenBracket(cha);
                while (this._block.Bracket != openBracket)
                {
                    if (this._blocks.Count == 0)
                    {
                        break;
                    }

                    this._block = this._blocks.Pop();
                }

                if (this._blocks.Count == 0)
                {
                    break;
                }

                this._block = this._blocks.Pop();
                this._block.Continuation = false;
                this._block.ResetOneLineBlock();
                break;
        }

        if (!char.IsWhiteSpace(cha))
        {
            this._lastNonCommentChar = cha;
        }
    }

    if (this._wordBuilder.Length > 0)
    {
        this._block.LastLiteral = this._wordBuilder.ToString();
    }

    this._wordBuilder.Length = 0;

    // Leave the line untouched when it continues a block comment without a leading '*',
    // or when it is a "//<tab>" or bare "//" comment line.
    if ((startInComment && line[0] != '*')
        || document.Text.StartsWith("//\t", StringComparison.Ordinal)
        || (document.Text == "//"))
    {
        return;
    }

    if ("]})".Contains(line[0]))
    {
        // A line that starts with a closing bracket aligns with the block's outer indentation.
        indent.Append(oldBlock.OuterIndent);
        oldBlock.ResetOneLineBlock();
        oldBlock.Continuation = false;
    }
    else
    {
        indent.Append(oldBlock.InnerIndent);
    }

    if (document.IsReadOnly)
    {
        // The line cannot be changed: under these conditions, record the whitespace it
        // already has as the current block's inner indentation instead.
        if (!oldBlock.Continuation
            && oldBlock.OneLineBlock == 0
            && oldBlock.StartLine == this._block.StartLine
            && this._block.StartLine < document.LineNumber
            && this._lastNonCommentChar != ':')
        {
            indent.Length = 0;
            line = document.Text;
            for (int i = 0; i < line.Length; ++i)
            {
                if (!char.IsWhiteSpace(line[i]))
                {
                    break;
                }

                indent.Append(line[i]);
            }

            // Drop the extra alignment space that is appended for lines inside a block comment.
            if (startInComment && indent.Length > 0 && indent[indent.Length - 1] == ' ')
            {
                indent.Length -= 1;
            }

            this._block.InnerIndent = indent.ToString();
        }

        return;
    }

    if (startInComment)
    {
        indent.Append(' ');
    }

    // Rewrite the line only when its current leading whitespace differs from the computed indent.
    if (indent.Length != (document.Text.Length - line.Length)
        || !document.Text.StartsWith(indent.ToString(), StringComparison.Ordinal)
        || char.IsWhiteSpace(document.Text[indent.Length]))
    {
        document.Text = indent.ToString() + line;
    }
}