Exemplo n.º 1
0
        /// <summary>
        /// Tokenizes <paramref name="input"/> with analyzer <paramref name="a"/> and asserts
        /// that the produced terms match <paramref name="output"/> exactly, in order,
        /// with no extra trailing tokens.
        /// </summary>
        /// <param name="a">Analyzer under test.</param>
        /// <param name="input">Text to tokenize.</param>
        /// <param name="output">Expected term texts, in token order.</param>
        public virtual void  AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output)
        {
            TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));

            for (int i = 0; i < output.Length; i++)
            {
                Token t = ts.Next();
                Assert.IsNotNull(t);
                // NUnit convention is AreEqual(expected, actual); the original had the
                // arguments swapped, which produces misleading failure messages.
                Assert.AreEqual(output[i], t.TermText());
            }
            // The stream must be exhausted — nothing beyond the expected terms.
            Assert.IsNull(ts.Next());
            ts.Close();
        }
        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
        /// <param name="r">a source of text to be tokenized
        /// </param>
        /// <param name="termFreqMap">a Map of terms and their frequencies
        /// </param>
        /// <param name="fieldName">Used by analyzer for any special per-field analysis
        /// </param>
        private void AddTermFrequencies(StreamReader r, IDictionary termFreqMap, String fieldName)
        {
            Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream(fieldName, r);
            int parsed = 0;

            Lucene.Net.Analysis.Token tok;
            while ((tok = stream.Next()) != null)
            {
                parsed++;
                // Stop once the configured token budget has been exceeded.
                if (parsed > maxNumTokensParsed)
                {
                    break;
                }

                String term = tok.TermText();
                if (IsNoiseWord(term))
                {
                    continue;
                }

                // Bump the running count, creating the counter on first sight of the term.
                Int existing = (Int)termFreqMap[term];
                if (existing != null)
                {
                    existing.x++;
                }
                else
                {
                    termFreqMap[term] = new Int();
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Verifies that PerFieldAnalyzerWrapper routes the default field to the wrapped
        /// WhitespaceAnalyzer (no lowercasing) and the "special" field to its registered
        /// SimpleAnalyzer (which lowercases).
        /// </summary>
        public virtual void  TestPerField()
        {
            System.String text = "Qwerty";

            PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
            wrapper.AddAnalyzer("special", new SimpleAnalyzer());

            // Unregistered field name falls through to the whitespace analyzer.
            Token first = wrapper.TokenStream("Field", new System.IO.StringReader(text)).Next();
            Assert.AreEqual("Qwerty", first.TermText(), "WhitespaceAnalyzer does not lowercase");

            // Registered per-field analyzer is used for "special".
            Token second = wrapper.TokenStream("special", new System.IO.StringReader(text)).Next();
            Assert.AreEqual("qwerty", second.TermText(), "SimpleAnalyzer lowercases");
        }
Exemplo n.º 4
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with MyAnalyzer and returns the total number
        /// of characters across all tokens, not counting single-character tokens that
        /// consist solely of MyFilter.Separator.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <returns>Summed token lengths, excluding lone separator characters.</returns>
        private int GetTokenizerLength(string text)
        {
            MyAnalyzer ma = new MyAnalyzer(AnalyzerFactory.stopWords);

            Lucene.Net.Analysis.TokenStream stream = ma.TokenStream("", new System.IO.StringReader(text));

            int total = 0;
            for (Lucene.Net.Analysis.Token tok = stream.Next(); tok != null; tok = stream.Next())
            {
                int len = tok.TermLength();
                if (len != 1)
                {
                    total += len;
                }
                else if (tok.TermBuffer()[0] != MyFilter.Separator)
                {
                    // A lone separator character contributes nothing to the length.
                    total++;
                }
            }
            stream.Close();

            return total;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with MyAnalyzer and concatenates all token
        /// characters, dropping single-character tokens equal to MyFilter.Separator.
        /// </summary>
        /// <param name="text">Text to tokenize.</param>
        /// <returns>Token characters joined together, separators removed.</returns>
        private string GetTokenizerText(string text)
        {
            MyAnalyzer ma = new MyAnalyzer(AnalyzerFactory.stopWords);

            Lucene.Net.Analysis.TokenStream stream = ma.TokenStream("", new System.IO.StringReader(text));

            StringBuilder sb = new StringBuilder();
            for (Lucene.Net.Analysis.Token tok = stream.Next(); tok != null; tok = stream.Next())
            {
                int    len  = tok.TermLength();
                char[] buff = tok.TermBuffer();
                if (len != 1)
                {
                    sb.Append(buff, 0, len);
                }
                else if (buff[0] != MyFilter.Separator)
                {
                    // Keep single characters unless they are the separator marker.
                    sb.Append(buff, 0, 1);
                }
            }
            stream.Close();

            return sb.ToString();
        }
Exemplo n.º 6
0
        /// <summary>
        /// Verifies that a StopAnalyzer built from a custom stop-word list never emits
        /// any of the configured stop words.
        /// </summary>
        public virtual void  TestStopList()
        {
            System.Collections.Hashtable stopWordsSet = new System.Collections.Hashtable();
            stopWordsSet.Add("good", "good");
            stopWordsSet.Add("test", "test");
            stopWordsSet.Add("analyzer", "analyzer");

            // Answer to the old {{Aroush}} question: copy the Hashtable keys straight
            // into the array StopAnalyzer expects. Hashtable key order is undefined,
            // which is fine — the analyzer treats the words as a set.
            System.String[] arrStopWordsSet = new System.String[stopWordsSet.Count];
            stopWordsSet.Keys.CopyTo(arrStopWordsSet, 0);

            StopAnalyzer newStop = new StopAnalyzer(arrStopWordsSet);

            System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer");
            TokenStream            stream = newStop.TokenStream("test", reader);

            Assert.IsNotNull(stream);
            Token token = null;

            try
            {
                while ((token = stream.Next()) != null)
                {
                    System.String text = token.TermText();
                    Assert.IsFalse(stopWordsSet.Contains(text), "stop word '" + text + "' leaked through the analyzer");
                }
            }
            catch (System.IO.IOException e)
            {
                // The original Assert.IsTrue(false) discarded the exception entirely;
                // fail with the cause so the test report is actionable.
                Assert.Fail("unexpected IOException while consuming the token stream: " + e.Message);
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Tokenization test: runs <paramref name="keyword"/> through the analyzer and
        /// returns the resulting terms joined by single spaces (the result carries a
        /// leading space whenever at least one token is produced).
        /// </summary>
        /// <param name="keyword">Text to tokenize; also passed as the field name.</param>
        /// <returns>Space-separated token texts.</returns>
        public string Token(string keyword)
        {
            // StringBuilder instead of repeated string concatenation in the loop.
            System.Text.StringBuilder ret = new System.Text.StringBuilder();

            System.IO.StringReader reader = new System.IO.StringReader(keyword);
            // NOTE(review): the keyword doubles as the field name — presumably the
            // analyzer ignores the field here; confirm this is intentional.
            Lucene.Net.Analysis.TokenStream ts    = analyzer.TokenStream(keyword, reader);
            Lucene.Net.Analysis.Token       token = ts.Next();
            while (token != null)
            {
                ret.Append(" ").Append(token.TermText());
                token = ts.Next();
            }
            // Bug fix: the original called ts.CloneAttributes(), which only returns a
            // discarded copy and leaves the stream open; Close() actually releases it.
            ts.Close();
            reader.Close();
            // NOTE(review): closing the analyzer makes it unusable for later calls —
            // kept to preserve the original behavior, but verify callers expect this.
            analyzer.Close();
            return(ret.ToString());
        }
        /// <summary>
        /// Drains <paramref name="tokenStream"/>, rendering each token via GetTokenView,
        /// and reports how many tokens were consumed.
        /// </summary>
        /// <param name="tokenStream">Stream to read to exhaustion.</param>
        /// <param name="numberOfTokens">Receives the total token count.</param>
        /// <returns>Concatenation of the per-token views.</returns>
        public virtual string GetView(TokenStream tokenStream, out int numberOfTokens)
        {
            StringBuilder view = new StringBuilder();
            int count = 0;

            for (Token t = tokenStream.Next(); t != null; t = tokenStream.Next())
            {
                count++;
                view.Append(GetTokenView(t));
            }

            numberOfTokens = count;
            return view.ToString();
        }
Exemplo n.º 9
0
        /// <summary>
        /// Verifies that the default stop analyzer removes every token tracked in the
        /// inValidTokens collection.
        /// </summary>
        public virtual void  TestDefaults()
        {
            Assert.IsNotNull(stop);
            System.IO.StringReader reader = new System.IO.StringReader("This is a test of the english stop analyzer");
            TokenStream            stream = stop.TokenStream("test", reader);

            Assert.IsNotNull(stream);
            Token token = null;

            try
            {
                while ((token = stream.Next()) != null)
                {
                    Assert.IsFalse(inValidTokens.Contains(token.TermText()), "stop word leaked through the analyzer");
                }
            }
            catch (System.IO.IOException e)
            {
                // The original Assert.IsTrue(false) discarded the exception entirely;
                // fail with the cause so the test report is actionable.
                Assert.Fail("unexpected IOException while consuming the token stream: " + e.Message);
            }
        }
        /// <summary>
        /// Drains <paramref name="stream"/> via the deprecated Token-returning Next()
        /// API, asserting that only the "tokenstream" term carries the proper-noun
        /// payload annotation and that terms appear in the order given by results.
        /// </summary>
        private static void  ConsumeStreamVeryOldAPI(TokenStream stream)
        {
            stream.Reset();

            int index = 0;
            Token token;

            while ((token = stream.Next()) != null)
            {
                System.String term    = token.Term();
                Payload       payload = token.GetPayload();

                bool properNoun = payload != null
                    && payload.GetData().Length == 1
                    && payload.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;

                if (properNoun)
                {
                    Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
                }
                else
                {
                    Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
                }
                Assert.AreEqual(results[index], term);
                index++;
            }
        }
Exemplo n.º 11
0
		/// <summary>
		/// Drains <paramref name="stream"/> via the deprecated Token-returning Next()
		/// API, asserting that only the "tokenstream" term carries the proper-noun
		/// payload annotation and that terms appear in the order given by results.
		/// </summary>
		private static void  ConsumeStreamVeryOldAPI(TokenStream stream)
		{
			stream.Reset();
			
			int index = 0;
			Token token;
			while ((token = stream.Next()) != null)
			{
				System.String term = token.Term();
				Payload payload = token.GetPayload();
				bool properNoun = payload != null
					&& payload.GetData().Length == 1
					&& payload.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;
				if (properNoun)
				{
					Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
				}
				else
				{
					Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
				}
				Assert.AreEqual(results[index], term);
				index++;
			}
		}
		/// <summary>
		/// Consumes the stream and asserts that its terms exactly match the expected
		/// tokens array — same values, same order, same count.
		/// </summary>
		private void  CheckTokens(TokenStream stream)
		{
			int seen = 0;
			for (Token token = stream.Next(); token != null; token = stream.Next())
			{
				Assert.IsTrue(seen < tokens.Length);
				Assert.AreEqual(tokens[seen], token.TermText());
				seen++;
			}
			
			// Exactly as many tokens as expected — no more, no fewer.
			Assert.AreEqual(tokens.Length, seen);
		}
Exemplo n.º 13
0
		/// <summary>
		/// Verifies that each successive token's single-byte payload equals its
		/// 1-based position in the stream, reusing one Token instance via Next(t).
		/// </summary>
		internal virtual void  VerifyPayload(TokenStream ts)
		{
			Token reusable = new Token();
			byte expected = 1;
			while (true)
			{
				reusable.Clear();
				reusable = ts.Next(reusable);
				if (reusable == null)
				{
					break;
				}
				// Payload of token N must be the single byte N.
				Assert.AreEqual(expected, reusable.GetPayload().ToByteArray()[0]);
				expected++;
			}
		}
Exemplo n.º 14
0
        /// <summary>
        /// Drains <paramref name="tokenStream"/>, accumulating per-term frequencies into
        /// termDictionary, persists them into a [frequencies] table of the Access
        /// database DB.mdb next to the executable, and returns a "term [count]" summary
        /// ordered by term. termDictionary is cleared before returning.
        /// </summary>
        /// <param name="tokenStream">Stream to read to exhaustion.</param>
        /// <param name="numberOfTokens">Receives the total token count.</param>
        /// <returns>Space-padded "term [frequency]" pairs, sorted by term.</returns>
        public override string GetView(TokenStream tokenStream, out int numberOfTokens)
        {
            StringBuilder sb = new StringBuilder();

            numberOfTokens = 0;

            string dbPath = Application.ExecutablePath.Substring(0, Application.ExecutablePath.LastIndexOf('\\')) + "\\DB.mdb";

            // using-blocks guarantee the connection and command are disposed even when a
            // command throws (the original leaked both on any exception).
            using (OleDbConnection myConnection = new OleDbConnection("Provider=Microsoft.Jet.OLEDB.4.0; Data Source=" + dbPath))
            {
                myConnection.Open();

                using (OleDbCommand myCommand = new OleDbCommand())
                {
                    myCommand.Connection = myConnection;
                    // NOTE(review): this throws if [frequencies] already exists — kept
                    // from the original; confirm a fresh DB is expected per call.
                    myCommand.CommandText = "CREATE TABLE [frequencies](" +
                        "[Word] VARCHAR( 50 ) NOT NULL ," +
                        "[Count] INT NOT NULL )";
                    myCommand.ExecuteNonQuery();

                    for (Token token = tokenStream.Next(); token != null; token = tokenStream.Next())
                    {
                        numberOfTokens++;

                        string term = token.TermText();
                        if (termDictionary.Keys.Contains(term))
                            termDictionary[term] = termDictionary[term] + 1;
                        else
                            termDictionary.Add(term, 1);
                    }

                    // Parameterized insert: the original concatenated terms straight into
                    // the SQL text, which breaks on quote characters and allows injection.
                    myCommand.CommandText = "insert into frequencies ([Word],[Count]) values (?,?)";
                    OleDbParameter wordParam  = myCommand.Parameters.Add("@Word", OleDbType.VarChar, 50);
                    OleDbParameter countParam = myCommand.Parameters.Add("@Count", OleDbType.Integer);

                    foreach (var item in termDictionary.OrderBy(x => x.Key))
                    {
                        sb.Append(item.Key + " [" + item.Value + "]   ");
                        wordParam.Value  = item.Key;
                        countParam.Value = item.Value;
                        myCommand.ExecuteNonQuery();
                    }
                }
            }

            termDictionary.Clear();

            return sb.ToString();
        }