// Verifies that TreebankTokenizer splits sentence-ending punctuation ('.' and '?')
// into their own tokens while preserving original character offsets.
public void ReplaceConventionsIncludeMultipleSymbol()
{
    var factory = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    factory.GetTokenizer <TreebankTokenizer>();

    var result = factory.Tokenize("I jump. And you?");

    // Expected token texts and their start offsets, in order.
    var expectedTexts = new[] { "I", "jump", ".", "And", "you", "?" };
    var expectedStarts = new[] { 0, 2, 6, 8, 12, 15 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(result[i].Text == expectedTexts[i]);
        Assert.IsTrue(result[i].Start == expectedStarts[i]);
    }
}
// Tags a regex-tokenized sentence with a trigram tagger trained on the supplied
// tagged corpus and checks the first four part-of-speech tags.
public void TriGramInCoNLL2000()
{
    // tokenization
    var tokenizerFactory = new TokenizerFactory <RegexTokenizer>(
        new TokenizationOptions { Pattern = RegexTokenizer.WORD_PUNC },
        SupportedLanguage.English);
    var words = tokenizerFactory.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag: trigram model, unknown words default to "NN".
    var taggerFactory = new TaggerFactory <NGramTagger>(
        new TagOptions { NGram = 3, Tag = "NN", Corpus = GetTaggedCorpus() },
        SupportedLanguage.English);
    taggerFactory.Tag(new Sentence { Words = words });

    var expectedTags = new[] { "NNP", "IN", "DT", "NNP" };
    for (int i = 0; i < expectedTags.Length; i++)
    {
        Assert.IsTrue(words[i].Pos == expectedTags[i]);
    }
}
// Parses synonym rules of the form "a,b,c => x,y" (sides separated by mappingSep,
// alternatives separated by synSep) and loads them into the given SynonymMap.
// When a rule has no target side: 'expansion' maps every alternative to all of
// them; otherwise everything is reduced to the first alternative.
// NOTE: converted-from-Java code — List/ArrayList/iterator are Java-style
// collection APIs, not System.Collections.Generic.
public static void parseRules(List /*<String>*/ rules, SynonymMap map, string mappingSep, string synSep, bool expansion, TokenizerFactory tokFactory)
{
    int count = 0; // number of source token lists processed (only a counter here)
    for (var iter = rules.iterator(); iter.hasNext();)
    {
        // To use regexes, we need an expression that specifies an odd number of chars.
        // This can't really be done with string.split(), and since we need to
        // do unescaping at some point anyway, we wouldn't be saving any effort
        // by using regexes.
        string rule = (string)iter.next();
        List /*<String>*/ mapping = StrUtils.splitSmart(rule, mappingSep, false);

        List /*<List<String>>*/ source;
        List /*<List<String>>*/ target;

        if (mapping.size() > 2)
        {
            // More than one mapping separator in a rule is malformed.
            throw new System.ApplicationException("Invalid Synonym Rule:" + rule);
        }
        else if (mapping.size() == 2)
        {
            // Explicit "source => target" rule.
            source = getSynList((string)mapping.get(0), synSep, tokFactory);
            target = getSynList((string)mapping.get(1), synSep, tokFactory);
        }
        else
        {
            source = getSynList((string)mapping.get(0), synSep, tokFactory);
            if (expansion)
            {
                // expand to all arguments
                target = source;
            }
            else
            {
                // reduce to first argument
                target = new ArrayList/*<List<String>>*/ (1);
                target.add(source.get(0));
            }
        }

        bool includeOrig = false; // original tokens are never kept alongside mapped ones
        for (var fromIter = source.iterator(); fromIter.hasNext();)
        {
            List /*<String>*/ fromToks = (List)fromIter.next();
            count++;
            for (var toIter = target.iterator(); toIter.hasNext();)
            {
                List /*<String>*/ toToks = (List)toIter.next();
                map.add(fromToks, SynonymMap.makeTokens(toToks), includeOrig, true );
            }
        }
    }
}
// Loads the configured tokenizer factory (if any), builds an analyzer around it,
// then parses the synonyms file with the configured parser format ("solr" when
// unset, or "wordnet") into this.map.
public void Inform(IResourceLoader loader)
{
    // No tokenizerFactory configured -> the analyzer helper uses its default tokenizer.
    TokenizerFactory factory = tokenizerFactory == null ? null : LoadTokenizerFactory(loader, tokenizerFactory);

    Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, factory);

    try
    {
        string formatClass = format;
        if (format == null || format.Equals("solr", StringComparison.Ordinal))
        {
            formatClass = typeof(SolrSynonymParser).AssemblyQualifiedName;
        }
        else if (format.Equals("wordnet", StringComparison.Ordinal))
        {
            formatClass = typeof(WordnetSynonymParser).AssemblyQualifiedName;
        }
        // TODO: expose dedup as a parameter?
        map = LoadSynonyms(loader, formatClass, true, analyzer);
    }
    catch (Exception e)
    {
        // Wrap any parse failure so callers see a single exception type.
        throw new IOException("Error parsing synonyms file:", e);
    }
}
// Checks that CountFeatureExtractor assigns the expected count vector value to
// each word that matches a feature, cell by cell against the reference Vectors().
public void TestVectorizer()
{
    var factory = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    factory.GetTokenizer <TreebankTokenizer>();

    var extractor = new CountFeatureExtractor();
    extractor.Sentences = factory.Tokenize(Corpus());
    extractor.Vectorize(new List <string>());

    var expectedVectors = Vectors();
    for (int row = 0; row < extractor.Sentences.Count; row++)
    {
        var sentence = extractor.Sentences[row];
        for (int col = 0; col < extractor.Features.Count; col++)
        {
            var feature = extractor.Features[col];
            var match = sentence.Words.Find(w => w.Lemma == feature);
            // Words missing from the sentence simply have no cell to compare.
            if (match != null)
            {
                Assert.IsTrue(match.Vector == expectedVectors[row][col]);
            }
        }
    }
}
// Smoke test: reads the cooking.stackexchange fastText corpus, tokenizes it,
// carries labels over, and runs the one-hot encoder over every sentence.
public void OneHotTest()
{
    var reader = new FasttextDataReader();
    var rawSentences = reader.Read(new ReaderOptions
    {
        DataDir = Path.Combine(Configuration.GetValue <String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
        FileName = "cooking.stackexchange.txt"
    });

    var tokenizerFactory = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizerFactory.GetTokenizer <TreebankTokenizer>();

    // Tokenize the raw text, then copy each label onto its tokenized sentence.
    var tokenized = tokenizerFactory.Tokenize(rawSentences.Select(x => x.Text).ToList());
    for (int i = 0; i < tokenized.Count; i++)
    {
        tokenized[i].Label = rawSentences[i].Label;
    }

    var encoder = new OneHotEncoder();
    encoder.Sentences = tokenized.ToList();
    encoder.EncodeAll();
}
// Tags a regex-tokenized sentence with a trigram tagger loaded from the
// configured corpus directory and checks the first four part-of-speech tags.
public void TriGramInCoNLL2000()
{
    // tokenization
    var tokenizerFactory = new TokenizerFactory(
        new TokenizationOptions { Pattern = RegexTokenizer.WORD_PUNC },
        SupportedLanguage.English);
    tokenizerFactory.GetTokenizer <RegexTokenizer>();
    var words = tokenizerFactory.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

    // test tag: trigram model, unknown words default to "NN".
    var taggerFactory = new TaggerFactory(
        new TagOptions
        {
            CorpusDir = Configuration.GetValue <String>("CherubNLP:dataDir"),
            NGram = 3,
            Tag = "NN"
        },
        SupportedLanguage.English);
    taggerFactory.GetTagger <NGramTagger>();
    taggerFactory.Tag(new Sentence { Words = words });

    var expectedTags = new[] { "NNP", "IN", "DT", "NNP" };
    for (int i = 0; i < expectedTags.Length; i++)
    {
        Assert.IsTrue(words[i].Pos == expectedTags[i]);
    }
}
// Tokenizes 'source' with the given factory and returns the non-empty token
// texts, in order, as a Java-style List of strings. Uses the deprecated
// Token.next() enumeration API, hence the 612 warning suppression.
private static List /*<String>*/ splitByTokenizer(string source, TokenizerFactory tokFactory)
{
    StringReader reader = new StringReader(source);
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List /*<String>*/ tokList = new ArrayList/*<String>*/ ();
    try
    {
#pragma warning disable 612
        for (Token token = ts.next(); token != null; token = ts.next())
        {
#pragma warning restore 612
            string text = new string(token.termBuffer(), 0, token.termLength());
            if (text.Length > 0)
            {
                // Zero-length tokens are skipped.
                tokList.add(text);
            }
        }
    }
    catch (IOException e)
    {
        throw new System.ApplicationException("Unexpected exception.", e);
    }
    finally
    {
        // Always release the reader, even when tokenization failed.
        reader.close();
    }
    return (tokList);
}
// Verifies that TreebankTokenizer expands the contraction "cannot" into
// "can" + "not" while keeping correct character offsets.
public void ReplaceConventions()
{
    var factory = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    factory.GetTokenizer <TreebankTokenizer>();

    var result = factory.Tokenize("I cannot jump.");

    var expectedTexts = new[] { "I", "can", "not", "jump", "." };
    var expectedStarts = new[] { 0, 2, 5, 9, 13 };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(result[i].Text == expectedTexts[i]);
        Assert.IsTrue(result[i].Start == expectedStarts[i]);
    }
}
// Wraps the three analysis factories; the tokenizer is mandatory, the token
// filter and char filter factories may be null.
internal FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter)
{
    Debug.Assert(tokenizer != null);

    this.tokenizer = tokenizer;
    this.charFilter = charFilter;
    this.tokenfilter = tokenfilter;
}
// Verifies whitespace-pattern tokenization: punctuation stays attached to words
// and offsets point at the first character of each whitespace-separated chunk.
public void TokenizeInWhiteSpace()
{
    var factory = new TokenizerFactory(
        new TokenizationOptions { Pattern = RegexTokenizer.WHITE_SPACE },
        SupportedLanguage.English);
    factory.GetTokenizer <RegexTokenizer>();

    var result = factory.Tokenize("Chop into pieces, isn't it?");

    var expectedStarts = new[] { 0, 5, 10, 18, 24 };
    var expectedTexts = new[] { "Chop", "into", "pieces,", "isn't", "it?" };
    for (int i = 0; i < expectedTexts.Length; i++)
    {
        Assert.IsTrue(result[i].Start == expectedStarts[i]);
        Assert.IsTrue(result[i].Text == expectedTexts[i]);
    }
}
// Captures the server configuration; no sockets are opened here.
public AsyncTcpServer(int port, int poolSize, TokenizerFactory <T> tokFactory, ServerProtocolFactory <T> protFactory)
{
    _port = port;
    _poolSize = poolSize;
    _tokenizerFactory = tokFactory;
    _protocolFactory = protFactory;
}
// Loads the configured tokenizer factory (if any), builds an analyzer that
// lower-cases the stream when ignoreCase is set, then parses the synonyms file
// ("solr" when format is unset, or "wordnet") into this.map.
// NOTE(review): the try block has no catch/finally visible in this chunk — the
// method appears to continue beyond the visible source.
public void Inform(IResourceLoader loader)
{
    TokenizerFactory factory = tokenizerFactory is null ? null : LoadTokenizerFactory(loader, tokenizerFactory);

    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
#pragma warning disable 612, 618
        // Fall back to whitespace tokenization when no factory is configured.
        Tokenizer tokenizer = factory is null ? new WhitespaceTokenizer(LuceneVersion.LUCENE_CURRENT, reader) : factory.Create(reader);
        // Lower-case only when matching should be case-insensitive.
        TokenStream stream = ignoreCase ? (TokenStream) new LowerCaseFilter(LuceneVersion.LUCENE_CURRENT, tokenizer) : tokenizer;
#pragma warning restore 612, 618
        return (new TokenStreamComponents(tokenizer, stream));
    });
    try
    {
        string formatClass = format;
        if (format is null || format.Equals("solr", StringComparison.Ordinal))
        {
            formatClass = typeof(SolrSynonymParser).AssemblyQualifiedName;
        }
        else if (format.Equals("wordnet", StringComparison.Ordinal))
        {
            formatClass = typeof(WordnetSynonymParser).AssemblyQualifiedName;
        }
        // TODO: expose dedup as a parameter?
        map = LoadSynonyms(loader, formatClass, true, analyzer);
    }
// Demonstrates two ways to drive the parser: on a pre-tokenized word list, and
// on raw text via an explicit PTB tokenizer; then prints typed dependencies.
public static void DemoAPI(LexicalizedParser lp)
{
    // Option 1: parse a list of correctly tokenized words.
    var firstSentence = new[] { "This", "is", "an", "easy", "sentence", "." };
    java.util.List firstWords = Sentence.toCoreLabelList(firstSentence);
    Tree parseTree = lp.apply(firstWords);
    parseTree.pennPrint();

    // Option 2: load and use an explicit tokenizer on raw text.
    const string SecondSentence = "This is another sentence.";
    TokenizerFactory ptbFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    var secondReader = new StringReader(SecondSentence);
    java.util.List secondWords = ptbFactory.getTokenizer(secondReader).tokenize();
    parseTree = lp.apply(secondWords);

    // Derive CC-processed typed dependencies from the second parse.
    var languagePack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parseTree);
    java.util.List dependencies = structure.typedDependenciesCCprocessed();
    Console.WriteLine("\n{0}\n", dependencies);

    // Print the tree in Penn format with collapsed typed dependencies.
    var treePrinter = new TreePrint("penn,typedDependenciesCollapsed");
    treePrinter.printTree(parseTree);
}
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void doTestTokenizer(String tokenizer) throws java.io.IOException
// Looks up the named tokenizer factory, tries to instantiate it, and if that
// succeeds, sanity-checks its MultiTermAware implementation and runs it over
// random data to make sure it does not throw.
private void doTestTokenizer(string tokenizer)
{
    //JAVA TO C# CONVERTER TODO TASK: Java wildcard generics are not converted to .NET:
    //ORIGINAL LINE: Class<? extends org.apache.lucene.analysis.util.TokenizerFactory> factoryClazz = org.apache.lucene.analysis.util.TokenizerFactory.lookupClass(tokenizer);
    Type <?> factoryClazz = TokenizerFactory.lookupClass(tokenizer);
    // initialize returns null when the factory cannot be constructed here.
    TokenizerFactory factory = (TokenizerFactory)initialize(factoryClazz);
    if (factory != null)
    {
        // we managed to fully create an instance. check a few more things:

        // if it implements MultiTermAware, sanity check its impl
        if (factory is MultiTermAwareComponent)
        {
            AbstractAnalysisFactory mtc = ((MultiTermAwareComponent)factory).MultiTermComponent;
            assertNotNull(mtc);
            // its not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
            assertFalse(mtc is CharFilterFactory);
        }

        // beast it just a little, it shouldnt throw exceptions:
        // (it should have thrown them in initialize)
        checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
    }
}
// Runs the region parser over the given Java source file and returns its regions.
private List <CodeRegion> Act(string filePath)
{
    var javaTokenizer = TokenizerFactory.GetFor(DemoLanguage.Java);
    var regionParser = new RegionParser(filePath, javaTokenizer);
    return regionParser.GetRegions().ToList();
}
// Parses the file's regions with a tokenizer for the given language and wraps
// them in a new DemoCodeBuilder.
public static DemoCodeBuilder Initialize(string filePath, DemoLanguage language)
{
    var languageTokenizer = TokenizerFactory.GetFor(language);
    var regionParser = new RegionParser(filePath, languageTokenizer);
    var parsedRegions = regionParser.GetRegions().ToList();
    return new DemoCodeBuilder(filePath, parsedRegions);
}
// Segments the phrase with the mapped tokenizer, then converts the resulting
// word list with the mapped converter.
private string Convert(string phrase, TokenizerMapConverter tokenizerMapConverter)
{
    var segmenter = TokenizerFactory.Create(tokenizerMapConverter.Tokenizer);
    var segmented = segmenter.Segment(phrase);

    var converter = WordsConverterFactory.Create(tokenizerMapConverter.Converter);
    return converter.Convert(segmented);
}
// Wires up the analysis chain: whitespace tokenizer -> lowercase -> synonyms ->
// common grams (+ query variant) -> word delimiter -> KStem stemming -> front
// edge n-grams. Each filter factory is initialized from a java.util.HashMap of
// args and, where it reads config files, informed with the resource loader.
public AnalyzerFactory(ResourceLoader resourceLoader)
{
    _tokenizerFactory = new WhitespaceTokenizerFactory();

    _lowerCaseFactory = new LowerCaseFilterFactory();

    _synonymFactory = new SynonymFilterFactory();
    {
        var args = new java.util.HashMap();
        args.put("ignoreCase", "true");
        // "expand" false: synonyms are reduced, not expanded to all alternatives.
        args.put("expand", "false");
        args.put("synonyms", "synonyms.txt");
        _synonymFactory.init(args);
        ((ResourceLoaderAware)_synonymFactory).inform(resourceLoader);
    }

    _commonGramsFactory = new CommonGramsFilterFactory();
    {
        var args = new java.util.HashMap();
        args.put("ignoreCase", "true");
        _commonGramsFactory.init(args);
        ((ResourceLoaderAware)_commonGramsFactory).inform(resourceLoader);
    }

    _commonGramsQueryFactory = new CommonGramsQueryFilterFactory();
    {
        var args = new java.util.HashMap();
        args.put("ignoreCase", "true");
        _commonGramsQueryFactory.init(args);
        ((ResourceLoaderAware)_commonGramsQueryFactory).inform(resourceLoader);
    }

    _wordDelimiterFactory = new WordDelimiterFilterFactory();
    {
        var args = new java.util.HashMap();
        args.put("catenateWords", "1");
        args.put("catenateNumbers", "1");
        args.put("protected", "protwords.txt");
        _wordDelimiterFactory.init(args);
        ((ResourceLoaderAware)_wordDelimiterFactory).inform(resourceLoader);
    }

    _stemmerFactory = new KStemFilterFactory();
    {
        var args = new java.util.HashMap();
        args.put("protected", "protwords.txt");
        _stemmerFactory.init(args);
        ((ResourceLoaderAware)_stemmerFactory).inform(resourceLoader);
    }

    _edgeNGramFactory = new EdgeNGramTokenFilterFactory();
    {
        var args = new java.util.HashMap();
        args.put("side", "FRONT");
        // NOTE(review): minGramSize is put as a boxed int while every other arg
        // is a string — confirm the factory accepts non-string values here.
        args.put("minGramSize", 2);
        _edgeNGramFactory.init(args);
        ((ResourceLoaderAware)_edgeNGramFactory).inform(resourceLoader);
    }
}
// Tokenizes the input and checks the produced token kinds against the expected list.
public void TestList(string ch, AsmToken[] tk)
{
    var tokenizer = TokenizerFactory.Create();

    var kinds = tokenizer.Tokenize(ch)
        .Select(token => token.Kind)
        .ToList();

    Assert.Equal(tk, kinds);
}
// Stores the full analysis chain; the tokenizer factory is mandatory, the char
// filter and token filter lists are taken as-is.
public AnalyzerFactory(IList <CharFilterFactory> charFilterFactories, TokenizerFactory tokenizerFactory, IList <TokenFilterFactory> tokenFilterFactories)
{
    this.charFilterFactories = charFilterFactories;

    Debug.Assert(null != tokenizerFactory);
    this.tokenizerFactory = tokenizerFactory;

    this.tokenFilterFactories = tokenFilterFactories;
}
// Wraps the three analysis factories; the tokenizer must be non-null (checked
// only when asserts are enabled), the other two factories may be null.
internal FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(tokenizer != null);
    }

    this.tokenizer = tokenizer;
    this.charFilter = charFilter;
    this.tokenfilter = tokenfilter;
}
// Configures a word/punctuation regex tokenizer that keeps "'s" as one token.
public BotSharpTokenizer()
{
    var options = new TokenizationOptions
    {
        Pattern = RegexTokenizer.WORD_PUNC,
        SpecialWords = new List <string> { "'s" }
    };
    _tokenizer = new TokenizerFactory <RegexTokenizer>(options, SupportedLanguage.English);
}
// Trains a tokenizer model from command-line arguments: validates or builds the
// ML training parameters, trains via TokenizerME over sampleStream, and writes
// the resulting model file.
public override void run(string format, string[] args)
{
    base.run(format, args);

    mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);
    if (mlParams != null)
    {
        // A parameters file was supplied — it must be valid and non-sequence.
        if (!TrainUtil.isValid(mlParams.Settings))
        {
            throw new TerminateToolException(1, "Training parameters file '" + @params.Params + "' is invalid!");
        }

        if (TrainUtil.isSequenceTraining(mlParams.Settings))
        {
            throw new TerminateToolException(1, "Sequence training is not supported!");
        }
    }

    if (mlParams == null)
    {
        // No parameters file: build defaults from the iterations/cutoff options.
        mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
    }

    File modelOutFile = @params.Model;
    // Fail early if the model output path is not writable.
    CmdLineUtil.checkOutputFile("tokenizer model", modelOutFile);

    TokenizerModel model;
    try
    {
        Dictionary dict = loadDict(@params.AbbDict);
        TokenizerFactory tokFactory = TokenizerFactory.create(@params.Factory, @params.Lang, dict, @params.AlphaNumOpt.Value, null);
        model = opennlp.tools.tokenize.TokenizerME.train(sampleStream, tokFactory, mlParams);
    }
    catch (IOException e)
    {
        throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
    }
    finally
    {
        // Best-effort close of the training stream.
        try
        {
            sampleStream.close();
        }
        catch (IOException)
        {
            // sorry that this can fail
        }
    }

    CmdLineUtil.writeModel("tokenizer", modelOutFile, model);
}
// Stores the full analysis chain; the tokenizer factory must be non-null
// (checked only when asserts are enabled), the filter lists are taken as-is.
public AnalyzerFactory(IList <CharFilterFactory> charFilterFactories, TokenizerFactory tokenizerFactory, IList <TokenFilterFactory> tokenFilterFactories)
{
    this.charFilterFactories = charFilterFactories;

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(null != tokenizerFactory);
    }
    this.tokenizerFactory = tokenizerFactory;

    this.tokenFilterFactories = tokenFilterFactories;
}
// End-to-end classification smoke test on the cooking.stackexchange corpus:
// tokenize, shuffle, 70/30 split, train, then require >50% held-out accuracy.
public void CookingTest()
{
    var dataDir = Path.Combine(Configuration.GetValue <String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange");

    var reader = new FasttextDataReader();
    var sentences = reader.Read(new ReaderOptions
    {
        DataDir = dataDir,
        FileName = "cooking.stackexchange.txt"
    });

    var tokenizerFactory = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizerFactory.GetTokenizer <TreebankTokenizer>();

    // Tokenize the raw text, then carry each label over to its tokenized sentence.
    var tokenized = tokenizerFactory.Tokenize(sentences.Select(x => x.Text).ToList());
    for (int i = 0; i < tokenized.Count; i++)
    {
        tokenized[i].Label = sentences[i].Label;
    }
    sentences = tokenized.ToList();
    sentences.Shuffle();

    var options = new ClassifyOptions
    {
        ModelFilePath = Path.Combine(dataDir, "nb.model"),
        TrainingCorpusDir = dataDir,
        Dimension = 100
    };
    var classifier = new ClassifierFactory <SentenceFeatureExtractor>(options, SupportedLanguage.English);

    var dataset = sentences.Split(0.7M);
    classifier.Train(dataset.Item1);

    // Score the held-out 30%.
    int correct = 0;
    int total = 0;
    dataset.Item2.ForEach(sample =>
    {
        var predictions = classifier.Classify(sample);
        if (sample.Label == predictions[0].Item1)
        {
            correct++;
        }
        total++;
    });

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
// Trains a naive-Bayes classifier on a 2000-sample subset of the Kaggle "spooky
// author" corpus and requires >50% accuracy on the held-out 30% split.
public void SpookyAuthorIdentification()
{
    var reader = new KaggleTextDataReader();
    var sentences = reader.Read(new ReaderOptions { FileName = "train.csv" });

    var tokenizerFactory = new TokenizerFactory(new TokenizationOptions { }, SupportedLanguage.English);
    tokenizerFactory.GetTokenizer <TreebankTokenizer>();

    // Tokenize the raw text, then carry id and label over to each tokenized sentence.
    var tokenized = tokenizerFactory.Tokenize(sentences.Select(x => x.Text).ToList());
    for (int i = 0; i < tokenized.Count; i++)
    {
        tokenized[i].Id = sentences[i].Id;
        tokenized[i].Label = sentences[i].Label;
    }
    sentences = tokenized.ToList();
    sentences.Shuffle();

    // Keep the test fast: only the first 2000 samples, split 70/30.
    var dataset = sentences.Take(2000).ToList().Split(0.7M);

    var options = new ClassifyOptions
    {
        ModelDir = AppContext.BaseDirectory,
        ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
        Dimension = 300
    };
    var classifier = new ClassifierFactory <SentenceFeatureExtractor>(options, SupportedLanguage.English);
    classifier.GetClassifer("NaiveBayesClassifier");
    classifier.Train(dataset.Item1);

    // Score the held-out set.
    int correct = 0;
    int total = 0;
    dataset.Item2.ForEach(sample =>
    {
        var predictions = classifier.Classify(sample);
        if (sample.Label == predictions[0].Item1)
        {
            correct++;
        }
        total++;
    });

    var accuracy = (float)correct / total;
    Assert.IsTrue(accuracy > 0.5);
}
// Loads the optional custom tokenizer factory and the synonym rules, then parses
// them into a fresh SlowSynonymMap using "=>" as the mapping separator and ","
// as the synonym separator.
public void Inform(IResourceLoader loader)
{
    TokenizerFactory tokenizerFactory = null;
    if (tf != null)
    {
        tokenizerFactory = LoadTokenizerFactory(loader, tf);
    }

    IEnumerable <string> rules = LoadRules(synonyms, loader);

    synMap = new SlowSynonymMap(ignoreCase);
    ParseRules(rules, synMap, "=>", ",", expand, tokenizerFactory);
}
// Loads the English PCFG parser model and prepares the PTB tokenizer factory
// and grammatical-structure factory used by the service.
public NlpService()
{
    const string parserFileOrUrl = "englishPCFG.ser.gz";

    _lp = LexicalizedParser.loadModel(parserFileOrUrl);
    if (_lp == null)
    {
        throw new InvalidOperationException("couldn't load " + parserFileOrUrl);
    }

    _tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    _tlp = new PennTreebankLanguagePack();
    _structureFactory = _tlp.grammaticalStructureFactory();
}
// Lazily builds the tokenizer from configuration: the regex pattern comes from
// "options:pattern" and the tokenizer implementation name from "tokenizer".
private void Init()
{
    if (_tokenizer != null)
    {
        return; // already initialized
    }

    var options = new TokenizationOptions
    {
        Pattern = Configuration.GetValue <String>("options:pattern")
    };
    _tokenizer = new TokenizerFactory(options, SupportedLanguage.English);

    string tokenizerName = Configuration.GetValue <String>($"tokenizer");
    _tokenizer.GetTokenizer(tokenizerName);
}