/// <summary>
/// Asserts that analyzing <paramref name="input"/> with <paramref name="a"/> produces
/// exactly the tokens in <paramref name="output"/>, in order, with no trailing tokens.
/// </summary>
/// <param name="a">Analyzer under test.</param>
/// <param name="input">Text to analyze.</param>
/// <param name="output">Expected token terms, in order.</param>
public virtual void AssertAnalyzesTo(Analyzer a, System.String input, System.String[] output)
{
    TokenStream ts = a.TokenStream("dummy", new System.IO.StringReader(input));
    for (int i = 0; i < output.Length; i++)
    {
        Token t = ts.Next();
        Assert.IsNotNull(t);
        // FIX: NUnit's AreEqual takes (expected, actual) — the original passed them
        // swapped, which produced misleading "expected X but was Y" failure messages.
        Assert.AreEqual(output[i], t.TermText());
    }
    // Stream must be exhausted once all expected tokens are consumed.
    Assert.IsNull(ts.Next());
    ts.Close();
}
/// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
/// <param name="r">a source of text to be tokenized</param>
/// <param name="termFreqMap">a Map of terms and their frequencies</param>
/// <param name="fieldName">Used by analyzer for any special per-field analysis</param>
private void AddTermFrequencies(StreamReader r, IDictionary termFreqMap, String fieldName)
{
    Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(fieldName, r);
    Lucene.Net.Analysis.Token token;
    int tokenCount = 0;
    while ((token = ts.Next()) != null)
    {
        // for every token
        String word = token.TermText();
        tokenCount++;
        // Cap the amount of text examined per document; stop once the limit is exceeded.
        if (tokenCount > maxNumTokensParsed)
        {
            break;
        }
        // Skip words filtered out as noise (stop words, too short/long, etc. — see IsNoiseWord).
        if (IsNoiseWord(word))
        {
            continue;
        }

        // increment frequency
        // NOTE(review): a freshly inserted Int is assumed to represent a count of 1 —
        // confirm against the Int class, otherwise first occurrences are under-counted.
        Int cnt = (Int)termFreqMap[word];
        if (cnt == null)
        {
            termFreqMap[word] = new Int();
        }
        else
        {
            cnt.x++;
        }
    }
}
/// <summary>
/// Checks that PerFieldAnalyzerWrapper routes the "special" field to its registered
/// SimpleAnalyzer (lowercasing) while other fields use the default WhitespaceAnalyzer.
/// </summary>
public virtual void TestPerField()
{
    System.String text = "Qwerty";
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer());
    analyzer.AddAnalyzer("special", new SimpleAnalyzer());

    // Unregistered field falls through to the default analyzer, which keeps casing.
    TokenStream ts = analyzer.TokenStream("Field", new System.IO.StringReader(text));
    Token tok = ts.Next();
    Assert.AreEqual("Qwerty", tok.TermText(), "WhitespaceAnalyzer does not lowercase");

    // Registered field goes through SimpleAnalyzer, which lowercases.
    ts = analyzer.TokenStream("special", new System.IO.StringReader(text));
    tok = ts.Next();
    Assert.AreEqual("qwerty", tok.TermText(), "SimpleAnalyzer lowercases");
}
/// <summary>
/// Tokenizes <paramref name="text"/> and returns the total character length of all
/// tokens, where single-character tokens equal to MyFilter.Separator contribute nothing.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <returns>Sum of token lengths, excluding separator tokens.</returns>
private int GetTokenizerLength(string text)
{
    MyAnalyzer analyzer = new MyAnalyzer(AnalyzerFactory.stopWords);
    Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(text));
    int total = 0;
    for (Lucene.Net.Analysis.Token tok = stream.Next(); tok != null; tok = stream.Next())
    {
        int len = tok.TermLength();
        if (len != 1)
        {
            total += len;
        }
        else if (tok.TermBuffer()[0] != MyFilter.Separator)
        {
            // Single-char token that is not the separator still counts as one character.
            total++;
        }
    }
    stream.Close();
    return total;
}
/// <summary>
/// Tokenizes <paramref name="text"/> and concatenates the token characters, dropping
/// single-character tokens equal to MyFilter.Separator.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <returns>Concatenation of all non-separator token characters.</returns>
private string GetTokenizerText(string text)
{
    MyAnalyzer analyzer = new MyAnalyzer(AnalyzerFactory.stopWords);
    Lucene.Net.Analysis.TokenStream stream = analyzer.TokenStream("", new System.IO.StringReader(text));
    StringBuilder sb = new StringBuilder();
    for (Lucene.Net.Analysis.Token tok = stream.Next(); tok != null; tok = stream.Next())
    {
        int len = tok.TermLength();
        char[] buffer = tok.TermBuffer();
        if (len != 1)
        {
            sb.Append(buffer, 0, len);
        }
        else if (buffer[0] != MyFilter.Separator)
        {
            // Keep single-character tokens unless they are the separator marker.
            sb.Append(buffer, 0, 1);
        }
    }
    stream.Close();
    return sb.ToString();
}
/// <summary>
/// Verifies that a StopAnalyzer built from a custom stop list removes exactly those
/// words from the token stream.
/// </summary>
public virtual void TestStopList()
{
    System.Collections.Hashtable stopWordsSet = new System.Collections.Hashtable();
    stopWordsSet.Add("good", "good");
    stopWordsSet.Add("test", "test");
    stopWordsSet.Add("analyzer", "analyzer");

    // Build the array straight from the hashtable keys instead of re-typing the literals —
    // this answers the old "{{Aroush: how can we copy 'stopWordsSet' to 'String[]'?}}" TODO.
    System.String[] arrStopWordsSet = new System.String[stopWordsSet.Count];
    stopWordsSet.Keys.CopyTo(arrStopWordsSet, 0);

    StopAnalyzer newStop = new StopAnalyzer(arrStopWordsSet);
    System.IO.StringReader reader = new System.IO.StringReader("This is a good test of the english stop analyzer");
    TokenStream stream = newStop.TokenStream("test", reader);
    Assert.IsNotNull(stream);

    Token token = null;
    try
    {
        while ((token = stream.Next()) != null)
        {
            System.String text = token.TermText();
            // No stop word may survive analysis.
            Assert.IsFalse(stopWordsSet.Contains(text));
        }
    }
    catch (System.IO.IOException)
    {
        // FIX: the original swallowed the exception into Assert.IsTrue(false) with an
        // unused local; fail explicitly with a message instead.
        Assert.Fail("unexpected IOException while consuming the token stream");
    }
    finally
    {
        // FIX: the original never closed the stream.
        stream.Close();
    }
}
/// <summary>
/// Tokenization test helper: runs <paramref name="keyword"/> through the analyzer and
/// returns all resulting terms joined by spaces (with a leading space, as before).
/// </summary>
/// <param name="keyword">Text to tokenize.</param>
/// <returns>Space-separated token terms, empty string if no tokens.</returns>
public string Token(string keyword)
{
    // FIX: build the result with StringBuilder instead of repeated string concatenation.
    StringBuilder ret = new StringBuilder();
    System.IO.StringReader reader = new System.IO.StringReader(keyword);
    // NOTE(review): the keyword itself is passed as the field name — looks accidental;
    // confirm whether a fixed field name was intended here.
    Lucene.Net.Analysis.TokenStream ts = analyzer.TokenStream(keyword, reader);
    Lucene.Net.Analysis.Token token = ts.Next();
    while (token != null)
    {
        ret.Append(" ").Append(token.TermText());
        token = ts.Next();
    }
    // FIX: the original called ts.CloneAttributes() here, which clones attribute state
    // but never releases the stream; Close() is what was intended.
    ts.Close();
    reader.Close();
    // NOTE(review): closing the analyzer makes it unusable afterwards; kept for
    // behavioral compatibility — confirm callers never invoke Token() twice.
    analyzer.Close();
    return ret.ToString();
}
/// <summary>
/// Renders every token in the stream via GetTokenView and concatenates the results,
/// reporting the number of tokens consumed through <paramref name="numberOfTokens"/>.
/// </summary>
/// <param name="tokenStream">Source of tokens; consumed to exhaustion.</param>
/// <param name="numberOfTokens">Receives the total count of tokens read.</param>
/// <returns>Concatenated token views.</returns>
public virtual string GetView(TokenStream tokenStream, out int numberOfTokens)
{
    StringBuilder view = new StringBuilder();
    int count = 0;
    for (Token tok = tokenStream.Next(); tok != null; tok = tokenStream.Next())
    {
        count++;
        view.Append(GetTokenView(tok));
    }
    numberOfTokens = count;
    return view.ToString();
}
/// <summary>
/// Verifies that the default StopAnalyzer removes the standard English stop words
/// (none of the tokens it emits may appear in inValidTokens).
/// </summary>
public virtual void TestDefaults()
{
    Assert.IsNotNull(stop);
    System.IO.StringReader reader = new System.IO.StringReader("This is a test of the english stop analyzer");
    TokenStream stream = stop.TokenStream("test", reader);
    Assert.IsNotNull(stream);

    Token token = null;
    try
    {
        while ((token = stream.Next()) != null)
        {
            // No known stop word may survive analysis.
            Assert.IsFalse(inValidTokens.Contains(token.TermText()));
        }
    }
    catch (System.IO.IOException)
    {
        // FIX: the original swallowed the exception into Assert.IsTrue(false) with an
        // unused local; fail explicitly with a message instead.
        Assert.Fail("unexpected IOException while consuming the token stream");
    }
    finally
    {
        // FIX: the original never closed the stream.
        stream.Close();
    }
}
/// <summary>
/// Consumes the stream through the legacy Token-returning API, asserting that only the
/// term "tokenstream" carries the proper-noun payload annotation and that the terms
/// match the expected <c>results</c> sequence in order.
/// </summary>
private static void ConsumeStreamVeryOldAPI(TokenStream stream)
{
    stream.Reset();
    int index = 0;
    for (Token tok = stream.Next(); tok != null; tok = stream.Next())
    {
        System.String term = tok.Term();
        Payload payload = tok.GetPayload();
        // A one-byte payload equal to PROPER_NOUN_ANNOTATION marks a proper noun.
        bool isProperNoun =
            payload != null &&
            payload.GetData().Length == 1 &&
            payload.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION;
        if (isProperNoun)
        {
            Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
        }
        else
        {
            Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
        }
        Assert.AreEqual(results[index], term);
        index++;
    }
}
/// <summary>
/// Consumes the stream through the legacy Token-returning API, asserting that only the
/// term "tokenstream" carries the proper-noun payload annotation and that each term
/// matches results[i] in order.
/// NOTE(review): this method appears to be a byte-for-byte duplicate of another
/// ConsumeStreamVeryOldAPI in this source — consider consolidating if both live in the
/// same class.
/// </summary>
private static void ConsumeStreamVeryOldAPI(TokenStream stream)
{
    stream.Reset();
    Token token;
    int i = 0;
    while ((token = stream.Next()) != null)
    {
        System.String term = token.Term();
        Payload p = token.GetPayload();
        // A one-byte payload equal to PROPER_NOUN_ANNOTATION marks a proper noun.
        if (p != null && p.GetData().Length == 1 && p.GetData()[0] == PartOfSpeechAnnotatingFilter.PROPER_NOUN_ANNOTATION)
        {
            Assert.IsTrue("tokenstream".Equals(term), "only TokenStream is a proper noun");
        }
        else
        {
            Assert.IsFalse("tokenstream".Equals(term), "all other tokens (if this test fails, the special POSToken subclass is not correctly passed through the chain)");
        }
        // Terms must appear in exactly the expected order.
        Assert.AreEqual(results[i], term);
        i++;
    }
}
/// <summary>
/// Asserts that the stream yields exactly the terms stored in <c>tokens</c>, in order —
/// no fewer and no more.
/// </summary>
private void CheckTokens(TokenStream stream)
{
    int seen = 0;
    for (Token tok = stream.Next(); tok != null; tok = stream.Next())
    {
        // Guard against the stream producing more tokens than expected.
        Assert.IsTrue(seen < tokens.Length);
        Assert.AreEqual(tokens[seen], tok.TermText());
        seen++;
    }
    // The stream must have produced every expected token.
    Assert.AreEqual(tokens.Length, seen);
}
/// <summary>
/// Walks the stream using the reusable-Token API and checks that each token's
/// single-byte payload equals its 1-based position in the stream. The counter is a
/// byte, so (as in the original) it wraps around past 255.
/// </summary>
internal virtual void VerifyPayload(TokenStream ts)
{
    Token reusable = new Token();
    byte expected = 1;
    while (true)
    {
        reusable.Clear();
        reusable = ts.Next(reusable);
        if (reusable == null)
        {
            break;
        }
        Assert.AreEqual(expected, reusable.GetPayload().ToByteArray()[0]);
        expected++;
    }
}
/// <summary>
/// Builds a term-frequency view of the token stream, persists the frequencies into a
/// fresh [frequencies] table of the Access database next to the executable, and returns
/// "word [count] " pairs sorted alphabetically by word.
/// </summary>
/// <param name="tokenStream">Source of tokens; consumed to exhaustion.</param>
/// <param name="numberOfTokens">Receives the total number of tokens read.</param>
/// <returns>Space-separated "word [count]" pairs.</returns>
public override string GetView(TokenStream tokenStream, out int numberOfTokens)
{
    StringBuilder sb = new StringBuilder();
    numberOfTokens = 0;

    // Phase 1: count term frequencies.
    for (Token token = tokenStream.Next(); token != null; token = tokenStream.Next())
    {
        numberOfTokens++;
        string term = token.TermText();
        int count;
        // FIX: TryGetValue replaces the original Keys.Contains + indexer double lookup.
        if (termDictionary.TryGetValue(term, out count))
            termDictionary[term] = count + 1;
        else
            termDictionary.Add(term, 1);
    }

    // Phase 2: persist to the database and render the view.
    string dbPath = Application.ExecutablePath.Substring(0, Application.ExecutablePath.LastIndexOf('\\')) + "\\DB.mdb";
    // FIX: using-blocks guarantee the connection/command are disposed even on exception;
    // the original leaked both if anything threw mid-way.
    using (OleDbConnection connection = new OleDbConnection("Provider=Microsoft.Jet.OLEDB.4.0; Data Source=" + dbPath))
    using (OleDbCommand command = new OleDbCommand())
    {
        connection.Open();
        command.Connection = connection;
        // NOTE(review): CREATE TABLE throws if [frequencies] already exists (e.g. on a
        // second call) — confirm whether a DROP TABLE should precede this.
        command.CommandText = "CREATE TABLE [frequencies](" + "[Word] VARCHAR( 50 ) NOT NULL ," + "[Count] INT NOT NULL )";
        command.ExecuteNonQuery();

        foreach (var item in termDictionary.OrderBy(x => x.Key))
        {
            sb.Append(item.Key + " [" + item.Value + "] ");
            // FIX: parameterized insert — the original concatenated raw token text into
            // the SQL string, which breaks (or injects) on any term containing a quote.
            command.CommandText = "insert into frequencies ([Word],[Count]) values (?,?)";
            command.Parameters.Clear();
            command.Parameters.AddWithValue("@Word", item.Key);
            command.Parameters.AddWithValue("@Count", item.Value);
            command.ExecuteNonQuery();
        }
    }

    termDictionary.Clear();
    return sb.ToString();
}