コード例 #1
0
		/// <summary>
		/// Tokenizes text containing an anchor tag through an <c>HTMLStripCharFilter</c> and checks,
		/// for each of the four tokens, that the token offset passed through <c>CorrectOffset</c>
		/// matches the token's index in the original markup and that <c>LengthInSource</c> has the
		/// expected value.
		/// </summary>
		public void IncrementsOffsetCorrectlyWithAnotherReader2()
		{
			const string input = @"test1 <a href=""foo"">testlink</a> test2 test3";

			CharFilter filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));
			Tokenizer t = new Tokenizer(filter);

			// NextToken requires an out argument; the token text itself is never asserted on.
			string token;

			// "test1" sits at index 0 of the input.
			t.NextToken(out token);
			Assert.Equal(0, filter.CorrectOffset(t.Offset));
			Assert.Equal(5, t.LengthInSource);

			// "testlink" sits at index 20 of the input, right after the opening <a ...> tag.
			t.NextToken(out token);
			Assert.Equal(20, filter.CorrectOffset(t.Offset));
			Assert.Equal(8, t.LengthInSource);

			// "test2" sits at index 33 of the input, after the closing </a> tag and a space.
			t.NextToken(out token);
			Assert.Equal(33, filter.CorrectOffset(t.Offset));
			Assert.Equal(5, t.LengthInSource);

			// "test3" sits at index 39 of the input.
			t.NextToken(out token);
			Assert.Equal(39, filter.CorrectOffset(t.Offset));
			Assert.Equal(5, t.LengthInSource);
		}
コード例 #2
0
		/// <summary>
		/// Reads the single entity <c>&amp;Gamma;</c> through an <c>HTMLStripCharFilter</c> and
		/// asserts that the collected output equals the one-character string U+0393.
		/// </summary>
		public void TestGamma()
		{
			const string test = "&Gamma;";
			const string gold = "\u0393";

			// Tag-name set handed to the filter's two-argument constructor.
			var tagNames = new HashSet<String>();
			tagNames.Add("reserved");
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);

			// Drain the filter one character at a time until it reports end of input (-1).
			var output = new StringBuilder();
			for (int c = filter.Read(); c != -1; c = filter.Read())
			{
				output.Append((char)c);
			}

			string result = output.ToString();
			Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold + "<EOS>");
		}
コード例 #3
0
		/// <summary>
		/// Runs the contents of <c>htmlStripReaderTest.html</c> through an <c>HTMLStripCharFilter</c>
		/// and checks the filtered text: it must not contain the literal <c>&amp;lt;</c>, must not
		/// contain "forrest"/"Forrest", and after trimming must start with "Welcome to Solr" and end
		/// with "Foundation.".
		/// </summary>
		public void TestHTML()
		{
			var builder = new StringBuilder();

			// Dispose the file-backed reader once the filter has been drained so the test does not
			// leave the test file open.
			using (var fileReader = new StreamReader(GetTestFile("htmlStripReaderTest.html")))
			{
				var reader = new HTMLStripCharFilter(CharReader.Get(fileReader));
				int ch;
				while ((ch = reader.Read()) != -1)
				{
					builder.Append((char)ch);
				}
			}

			var str = builder.ToString();
			var trimmed = str.Trim();

			// The literal entity text "&lt;" must not survive filtering.
			Assert.IsTrue(str.IndexOf("&lt;") == -1, "Entity not properly escaped");
			Assert.IsTrue(str.IndexOf("forrest") == -1 && str.IndexOf("Forrest") == -1, "Forrest should have been stripped out");
			Assert.IsTrue(trimmed.StartsWith("Welcome to Solr"), "File should start with 'Welcome to Solr' after trimming");

			// The message previously said "start with" although the check is EndsWith.
			Assert.IsTrue(trimmed.EndsWith("Foundation."), "File should end with 'Foundation.' after trimming");
		}
コード例 #4
0
		/// <summary>
		/// Feeds an HTML fragment containing tags, entities, a bare ampersand and a comment through an
		/// <c>HTMLStripCharFilter</c> and compares the output with the expected text, first character
		/// by character (so a failure reports the exact position) and then as a whole string.
		/// </summary>
		public void Test()
		{
			const string html = "<div class=\"foo\">this is some text</div> here is a <a href=\"#bar\">link</a> and " +
								"another <a href=\"http://lucene.apache.org/\">link</a>. " +
								"This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is a comment -->";
			const string gold = " this is some text  here is a  link  and " +
								"another  link . " +
								"This is an entity: & plus a <.  Here is an &.  ";
			var reader = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
			var builder = new StringBuilder();
			int ch;
			var position = 0;
			while ((ch = reader.Read()) != -1)
			{
				var theChar = (char)ch;
				builder.Append(theChar);

				// Fail with a readable message, instead of an IndexOutOfRangeException, when the
				// filter produces more characters than the expected text contains.
				Assert.IsTrue(position < gold.Length, "Unexpected extra character \"" + theChar + "\" at position: " + position
						   + ". Buffer so far: " + builder + "<EOB>");

				Assert.IsTrue(theChar == gold[position], "\"" + theChar + "\"" + " at position: " + position + " does not equal: \"" + gold[position]
						   + "\". Buffer so far: " + builder + "<EOB>");
				position++;
			}

			// Also catches output that is shorter than the expected text.
			Assert.AreEqual(gold, builder.ToString());
		}
コード例 #5
0
		/// <summary>
		/// Reads a string of named, decimal and hexadecimal character entities through an
		/// <c>HTMLStripCharFilter</c> and asserts that the collected output equals the expected
		/// decoded text.
		/// </summary>
		public void TestEntities()
		{
			const string test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma; bar &#x393;";
			const string gold = "  <foo> \u00DCbermensch = \u0393 bar \u0393";

			// Tag-name set handed to the filter's two-argument constructor.
			var tagNames = new HashSet<String>();
			tagNames.Add("reserved");
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);

			// Drain the filter one character at a time until it reports end of input (-1).
			var output = new StringBuilder();
			for (int c = filter.Read(); c != -1; c = filter.Read())
			{
				output.Append((char)c);
			}

			string result = output.ToString();
			Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold + "<EOS>");
		}
コード例 #6
0
		/// <summary>
		/// Feeds an HTML fragment with Hebrew text (in element content, in an attribute value and in
		/// a comment) through an <c>HTMLStripCharFilter</c> and compares the output with the expected
		/// text, first character by character and then as a whole string. Finally runs
		/// <c>doTestOffsets</c> on a Hebrew string containing 'X' markers.
		/// </summary>
		public void TestHebrewScenarios()
		{
			const string html = "<div class=\"foo\">בדיקה ראשונה</div> וכאן נוסיף גם <a href=\"#bar\">לינק</a> ועכשיו " +
					"גם <a alt=\"לינק מסובך עם תיאור\" href=\"http://lucene.apache.org/\">לינק מסובך יותר</a>. " +
					" <!-- הערה אחת ויחידה -->";
			const string gold = " בדיקה ראשונה  וכאן נוסיף גם  לינק  ועכשיו " +
					"גם  לינק מסובך יותר .   ";
			var reader = new HTMLStripCharFilter(CharReader.Get(new StringReader(html)));
			var builder = new StringBuilder();
			int ch;
			var position = 0;
			while ((ch = reader.Read()) != -1)
			{
				var theChar = (char)ch;
				builder.Append(theChar);

				// Fail with a readable message, instead of an IndexOutOfRangeException, when the
				// filter produces more characters than the expected text contains.
				Assert.IsTrue(position < gold.Length, "Unexpected extra character \"" + theChar + "\" at position: " + position + ". Buffer so far: " + builder + "<EOB>");

				Assert.IsTrue(theChar == gold[position], "\"" + theChar + "\"" + " at position: " + position + " does not equal: \"" + gold[position] + "\". Buffer so far: " + builder + "<EOB>");
				position++;
			}

			// Also catches output that is shorter than the expected text.
			Assert.AreEqual(gold, builder.ToString());

			doTestOffsets("שלום X מה X שלומך חבר");
		}
コード例 #7
0
		/// <summary>
		/// Reads <paramref name="input"/> through an <c>HTMLStripCharFilter</c> and, for every 'X'
		/// character read, asserts that <c>CorrectOffset</c> of its position in the filtered stream
		/// equals the index of the next not-yet-matched 'X' in <paramref name="input"/>.
		/// </summary>
		/// <param name="input">Text to filter; 'X' characters serve as the checked markers.</param>
		public void doTestOffsets(String input)
		{
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(input)));

			// Index in the original string of the most recently matched 'X' (-1 before the first).
			int lastMarker = -1;
			int c;

			// readerPos counts the characters read from the filter so far.
			for (int readerPos = 0; (c = filter.Read()) != -1; readerPos++)
			{
				// The corrected offset is computed for every character, marker or not.
				var corrected = filter.CorrectOffset(readerPos);

				if (c != 'X')
				{
					continue;
				}

				lastMarker = input.IndexOf('X', lastMarker + 1);
				assertEquals(lastMarker, corrected);
			}
		}
コード例 #8
0
		/// <summary>
		/// Reads a comment opened and closed with three dashes, followed by one space, through an
		/// <c>HTMLStripCharFilter</c> and asserts that the output equals two spaces.
		/// </summary>
		public void TestComment()
		{
			const string test = "<!--- three dashes, still a valid comment ---> ";
			const string gold = "  ";
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));

			// Drain the filter one character at a time until it reports end of input (-1).
			var output = new StringBuilder();
			for (int c = filter.Read(); c != -1; c = filter.Read())
			{
				output.Append((char)c);
			}

			string result = output.ToString();
			assertTrue(result + " is not equal to " + gold + "<EOS>", result.Equals(gold));
		}
コード例 #9
0
		/// <summary>
		/// Reads <paramref name="test"/> through an <c>HTMLStripCharFilter</c> and asserts that the
		/// collected output is equal to the input itself.
		/// </summary>
		/// <param name="test">Text to filter; also the expected output.</param>
		/// <param name="assertMsg">Message passed to the equality assertion.</param>
		private static void processBuffer(String test, String assertMsg)
		{
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));

			// Drain the filter one character at a time until it reports end of input (-1).
			var output = new StringBuilder();
			for (int c = filter.Read(); c != -1; c = filter.Read())
			{
				output.Append((char)c);
			}

			Assert.AreEqual(test, output.ToString(), assertMsg);
		}
コード例 #10
0
		/// <summary>
		/// Reads a fragment of malformed markup through an <c>HTMLStripCharFilter</c> and asserts
		/// that the collected output equals the expected text.
		/// </summary>
		public void TestMalformedHTML()
		{
			const string test = "a <a hr<ef=aa<a>> </close</a>";
			const string gold = "a <a hr<ef=aa > </close ";
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)));

			// Drain the filter one character at a time until it reports end of input (-1).
			var output = new StringBuilder();
			for (int c = filter.Read(); c != -1; c = filter.Read())
			{
				output.Append((char)c);
			}

			string result = output.ToString();
			assertTrue(result + " is not equal to " + gold + "<EOS>", result.Equals(gold));
		}
コード例 #11
0
		/// <summary>
		/// Reads markup containing <c>reserved</c> and <c>other</c> tags through an
		/// <c>HTMLStripCharFilter</c> constructed with a set containing "reserved", then asserts that
		/// "reserved" is found at indices 9, 38 and 54 of the output and that "other" is absent.
		/// </summary>
		public void TestReserved()
		{
			const string test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";

			// Tag-name set handed to the filter's two-argument constructor.
			var tagNames = new HashSet<String>();
			tagNames.Add("reserved");
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);

			// Drain the filter one character at a time until it reports end of input (-1).
			var output = new StringBuilder();
			for (int c = filter.Read(); c != -1; c = filter.Read())
			{
				output.Append((char)c);
			}
			string result = output.ToString();

			// Each lookup is done right before its own assertion so that an earlier failure is
			// reported before a later lookup runs.
			int firstAt = result.IndexOf("reserved");
			assertTrue("Escaped tag not preserved: " + firstAt, firstAt == 9);

			int secondAt = result.IndexOf("reserved", 15);
			assertTrue("Escaped tag not preserved: " + secondAt, secondAt == 38);

			int thirdAt = result.IndexOf("reserved", 41);
			assertTrue("Escaped tag not preserved: " + thirdAt, thirdAt == 54);

			assertTrue("Other tag should be removed", result.IndexOf("other") == -1);
		}
コード例 #12
0
		/// <summary>
		/// Reads a string of named and decimal character entities through an
		/// <c>HTMLStripCharFilter</c> and asserts that the collected output equals the expected
		/// decoded text.
		/// </summary>
		public void TestMoreEntities()
		{
			const string test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
			const string gold = "  <junk/>   ! @ and ’";

			// Tag-name set handed to the filter's two-argument constructor.
			var tagNames = new HashSet<String>();
			tagNames.Add("reserved");
			var filter = new HTMLStripCharFilter(CharReader.Get(new StringReader(test)), tagNames);

			// Drain the filter one character at a time until it reports end of input (-1).
			var output = new StringBuilder();
			for (int c = filter.Read(); c != -1; c = filter.Read())
			{
				output.Append((char)c);
			}

			string result = output.ToString();
			Assert.IsTrue(result.Equals(gold), result + " is not equal to " + gold);
		}
コード例 #13
0
		/// <summary>
		/// Wraps <paramref name="reader"/> in an <c>HTMLStripCharFilter</c> and returns the base
		/// class's token stream built over that filter.
		/// </summary>
		/// <param name="fieldName">Field name, forwarded unchanged to the base implementation.</param>
		/// <param name="reader">Source text reader to be wrapped by the filter.</param>
		/// <returns>The token stream produced by the base <c>TokenStream</c> call.</returns>
		public override Lucene.Net.Analysis.TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
		{
			return base.TokenStream(fieldName, new HTMLStripCharFilter(CharReader.Get(reader)));
		}
コード例 #14
0
		/// <summary>
		/// Wraps <paramref name="reader"/> in an <c>HTMLStripCharFilter</c> and returns the token
		/// stream that the base class's <c>TokenStream</c> method builds over that filter.
		/// </summary>
		/// <param name="fieldName">Field name, forwarded unchanged to the base implementation.</param>
		/// <param name="reader">Source text reader to be wrapped by the filter.</param>
		/// <returns>The token stream produced by the base <c>TokenStream</c> call.</returns>
		public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
		{
			var charStream = CharReader.Get(reader);
			var stripped = new HTMLStripCharFilter(charStream);

			// NOTE(review): this delegates to base.TokenStream, not base.ReusableTokenStream.
			// Confirm whether bypassing the base class's reusable path is intentional.
			return base.TokenStream(fieldName, stripped);
		}