public void ReplaceConventionsIncludeMultipleSymbol()
        {
            // TreebankTokenizer should split "I jump. And you?" into six tokens
            // with the expected text and character start offsets.
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <TreebankTokenizer>();

            var tokens = tokenizer.Tokenize("I jump. And you?");

            var expected = new[]
            {
                (Text: "I", Start: 0),
                (Text: "jump", Start: 2),
                (Text: ".", Start: 6),
                (Text: "And", Start: 8),
                (Text: "you", Start: 12),
                (Text: "?", Start: 15)
            };

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == expected[i].Text);
                Assert.IsTrue(tokens[i].Start == expected[i].Start);
            }
        }
Ejemplo n.º 2
0
        public void TriGramInCoNLL2000()
        {
            // Tokenize the sample phrase on words and punctuation.
            var tokenizer = new TokenizerFactory <RegexTokenizer>(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // Tag the tokens with a tri-gram tagger backed by the tagged corpus.
            var tagger = new TaggerFactory <NGramTagger>(new TagOptions
            {
                NGram  = 3,
                Tag    = "NN",
                Corpus = GetTaggedCorpus()
            }, SupportedLanguage.English);

            tagger.Tag(new Sentence {
                Words = tokens
            });

            // The first four tokens should carry these part-of-speech tags.
            var expectedTags = new[] { "NNP", "IN", "DT", "NNP" };
            for (int i = 0; i < expectedTags.Length; i++)
            {
                Assert.IsTrue(tokens[i].Pos == expectedTags[i]);
            }
        }
Ejemplo n.º 3
0
        public static void parseRules(List /*<String>*/ rules, SynonymMap map, string mappingSep,
                                      string synSep, bool expansion, TokenizerFactory tokFactory)
        {
            // Parses raw synonym rules (e.g. "a,b => c" or "a,b,c") into `map`.
            //   rules      - raw rule strings (Java-style untyped List)
            //   map        - destination synonym map
            //   mappingSep - separator between source and target sides ("=>")
            //   synSep     - separator between synonyms on one side (",")
            //   expansion  - for rules with no explicit target: true maps each
            //                source term to all source terms, false reduces to
            //                the first term only
            //   tokFactory - optional tokenizer used to split each synonym
            int count = 0;

            for (var iter = rules.iterator(); iter.hasNext();)
            {
                // To use regexes, we need an expression that specifies an odd number of chars.
                // This can't really be done with string.split(), and since we need to
                // do unescaping at some point anyway, we wouldn't be saving any effort
                // by using regexes.

                string            rule    = (string)iter.next();
                List /*<String>*/ mapping = StrUtils.splitSmart(rule, mappingSep, false);

                List /*<List<String>>*/ source;
                List /*<List<String>>*/ target;

                // A rule has either one side (synonym ring) or two (explicit mapping).
                if (mapping.size() > 2)
                {
                    throw new System.ApplicationException("Invalid Synonym Rule:" + rule);
                }
                else if (mapping.size() == 2)
                {
                    source = getSynList((string)mapping.get(0), synSep, tokFactory);
                    target = getSynList((string)mapping.get(1), synSep, tokFactory);
                }
                else
                {
                    source = getSynList((string)mapping.get(0), synSep, tokFactory);
                    if (expansion)
                    {
                        // expand to all arguments
                        target = source;
                    }
                    else
                    {
                        // reduce to first argument
                        target = new ArrayList/*<List<String>>*/ (1);
                        target.add(source.get(0));
                    }
                }

                // Register every (from, to) pair in the map.
                bool includeOrig = false;
                for (var fromIter = source.iterator(); fromIter.hasNext();)
                {
                    List /*<String>*/ fromToks = (List)fromIter.next();
                    count++; // NOTE(review): count is incremented but never read in this excerpt
                    for (var toIter = target.iterator(); toIter.hasNext();)
                    {
                        List /*<String>*/ toToks = (List)toIter.next();
                        map.add(fromToks,
                                SynonymMap.makeTokens(toToks),
                                includeOrig,
                                true
                                );
                    }
                }
            }
        }
Ejemplo n.º 4
0
        public void Inform(IResourceLoader loader)
        {
            // Resolve the optional tokenizer factory used while parsing synonyms.
            TokenizerFactory factory = null;
            if (tokenizerFactory != null)
            {
                factory = LoadTokenizerFactory(loader, tokenizerFactory);
            }

            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this, factory);

            try
            {
                // Map the short format name to an assembly-qualified parser type;
                // "solr" (or no format) and "wordnet" are recognized, anything
                // else is taken as a type name verbatim.
                string formatClass;
                if (format == null || format.Equals("solr", StringComparison.Ordinal))
                {
                    formatClass = typeof(SolrSynonymParser).AssemblyQualifiedName;
                }
                else if (format.Equals("wordnet", StringComparison.Ordinal))
                {
                    formatClass = typeof(WordnetSynonymParser).AssemblyQualifiedName;
                }
                else
                {
                    formatClass = format;
                }
                // TODO: expose dedup as a parameter?
                map = LoadSynonyms(loader, formatClass, true, analyzer);
            }
            catch (Exception e)
            {
                throw new IOException("Error parsing synonyms file:", e);
            }
        }
Ejemplo n.º 5
0
        public void TestVectorizer()
        {
            // Tokenize the corpus with the Treebank tokenizer.
            var factory = new TokenizerFactory(new TokenizationOptions {
            }, SupportedLanguage.English);

            factory.GetTokenizer <TreebankTokenizer>();

            var extractor = new CountFeatureExtractor();

            extractor.Sentences = factory.Tokenize(Corpus());
            extractor.Vectorize(new List <string>());

            var expectedVectors = Vectors();

            // Every feature word present in a sentence must carry the expected count.
            for (int s = 0; s < extractor.Sentences.Count; s++)
            {
                var sentence = extractor.Sentences[s];

                for (int f = 0; f < extractor.Features.Count; f++)
                {
                    var feature = extractor.Features[f];
                    var word    = sentence.Words.Find(w => w.Lemma == feature);

                    if (word != null)
                    {
                        Assert.IsTrue(word.Vector == expectedVectors[s][f]);
                    }
                }
            }
        }
Ejemplo n.º 6
0
        public void OneHotTest()
        {
            // Load the cooking.stackexchange fastText-formatted corpus.
            var reader    = new FasttextDataReader();
            var sentences = reader.Read(new ReaderOptions
            {
                DataDir  = Path.Combine(Configuration.GetValue <String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange"),
                FileName = "cooking.stackexchange.txt"
            });

            // Re-tokenize the raw text with the Treebank tokenizer.
            var tokenizer = new TokenizerFactory(new TokenizationOptions {
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <TreebankTokenizer>();

            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            // Carry the labels over from the raw sentences.
            for (int idx = 0; idx < tokenized.Count; idx++)
            {
                tokenized[idx].Label = sentences[idx].Label;
            }
            sentences = tokenized.ToList();

            // One-hot encode all sentences.
            var encoder = new OneHotEncoder();

            encoder.Sentences = sentences;
            encoder.EncodeAll();
        }
Ejemplo n.º 7
0
        public void TriGramInCoNLL2000()
        {
            // Tokenize the sample phrase on words and punctuation.
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WORD_PUNC
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chancellor of the Exchequer Nigel Lawson's restated commitment");

            // Tag with a tri-gram tagger trained from the configured corpus directory.
            var tagger = new TaggerFactory(new TagOptions
            {
                CorpusDir = Configuration.GetValue <String>("CherubNLP:dataDir"),
                NGram     = 3,
                Tag       = "NN"
            }, SupportedLanguage.English);

            tagger.GetTagger <NGramTagger>();

            tagger.Tag(new Sentence {
                Words = tokens
            });

            // The first four tokens should carry these part-of-speech tags.
            var expectedTags = new[] { "NNP", "IN", "DT", "NNP" };
            for (int i = 0; i < expectedTags.Length; i++)
            {
                Assert.IsTrue(tokens[i].Pos == expectedTags[i]);
            }
        }
Ejemplo n.º 8
0
        private static List /*<String>*/ splitByTokenizer(string source, TokenizerFactory tokFactory)
        {
            // Runs the given tokenizer factory over `source` and collects the
            // produced token texts into a (Java-style) list, skipping empty tokens.
            StringReader      reader  = new StringReader(source);
            TokenStream       ts      = loadTokenizer(tokFactory, reader);
            List /*<String>*/ tokList = new ArrayList/*<String>*/ ();

            try {
#pragma warning disable 612
                // Token-at-a-time iteration via the deprecated next() API
                // (hence the pragma) — this is converted Java code.
                for (Token token = ts.next(); token != null; token = ts.next())
                {
#pragma warning restore 612
                    string text = new string(token.termBuffer(), 0, token.termLength());
                    if (text.Length > 0)
                    {
                        tokList.add(text);
                    }
                }
            } catch (IOException e) {
                throw new System.ApplicationException("Unexpected exception.", e);
            }
            finally{
                // Only the reader is closed; NOTE(review): confirm the token
                // stream `ts` does not need closing in this Lucene version.
                reader.close();
            }
            return(tokList);
        }
Ejemplo n.º 9
0
        public void ReplaceConventions()
        {
            // The Treebank rules split "cannot" into "can" + "not".
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <TreebankTokenizer>();

            var tokens = tokenizer.Tokenize("I cannot jump.");

            var expected = new[]
            {
                (Text: "I", Start: 0),
                (Text: "can", Start: 2),
                (Text: "not", Start: 5),
                (Text: "jump", Start: 9),
                (Text: ".", Start: 13)
            };

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.IsTrue(tokens[i].Text == expected[i].Text);
                Assert.IsTrue(tokens[i].Start == expected[i].Start);
            }
        }
Ejemplo n.º 10
0
 internal FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter)
 {
     // A tokenizer is required; the filter factories may be null.
     Debug.Assert(tokenizer != null);

     this.charFilter  = charFilter;
     this.tokenfilter = tokenfilter;
     this.tokenizer   = tokenizer;
 }
Ejemplo n.º 11
0
        public void TokenizeInWhiteSpace()
        {
            // Whitespace tokenization keeps punctuation attached to the words.
            var tokenizer = new TokenizerFactory(new TokenizationOptions
            {
                Pattern = RegexTokenizer.WHITE_SPACE
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <RegexTokenizer>();

            var tokens = tokenizer.Tokenize("Chop into pieces, isn't it?");

            var expected = new[]
            {
                (Start: 0, Text: "Chop"),
                (Start: 5, Text: "into"),
                (Start: 10, Text: "pieces,"),
                (Start: 18, Text: "isn't"),
                (Start: 24, Text: "it?")
            };

            for (int i = 0; i < expected.Length; i++)
            {
                Assert.IsTrue(tokens[i].Start == expected[i].Start);
                Assert.IsTrue(tokens[i].Text == expected[i].Text);
            }
        }
Ejemplo n.º 12
0
 public AsyncTcpServer(int port, int poolSize, TokenizerFactory <T> tokFactory, ServerProtocolFactory <T> protFactory)
 {
     // Stores the listen port, the connection-pool size, and the factories
     // used to create per-connection tokenizers and protocol handlers.
     _port             = port;
     _poolSize         = poolSize;
     _tokenizerFactory = tokFactory;
     _protocolFactory  = protFactory;
 }
Ejemplo n.º 13
0
        public void Inform(IResourceLoader loader)
        {
            // Resolves the optional tokenizer factory used to analyze synonym
            // entries, then loads the synonym map from the configured resource.
            TokenizerFactory factory = tokenizerFactory is null ? null : LoadTokenizerFactory(loader, tokenizerFactory);

            // Analyzer applied to each synonym entry: the configured tokenizer
            // (whitespace by default), optionally lower-cased when ignoreCase is set.
            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
            {
#pragma warning disable 612, 618
                Tokenizer tokenizer = factory is null ? new WhitespaceTokenizer(LuceneVersion.LUCENE_CURRENT, reader) : factory.Create(reader);
                TokenStream stream  = ignoreCase ? (TokenStream) new LowerCaseFilter(LuceneVersion.LUCENE_CURRENT, tokenizer) : tokenizer;
#pragma warning restore 612, 618
                return(new TokenStreamComponents(tokenizer, stream));
            });

            try
            {
                // Map the short format name to the parser type; "solr" is the default.
                string formatClass = format;
                if (format is null || format.Equals("solr", StringComparison.Ordinal))
                {
                    formatClass = typeof(SolrSynonymParser).AssemblyQualifiedName;
                }
                else if (format.Equals("wordnet", StringComparison.Ordinal))
                {
                    formatClass = typeof(WordnetSynonymParser).AssemblyQualifiedName;
                }
                // TODO: expose dedup as a parameter?
                map = LoadSynonyms(loader, formatClass, true, analyzer);
            }
            // NOTE(review): this excerpt is truncated here — the catch block and
            // method close are missing from the visible source.
Ejemplo n.º 14
0
        public static void DemoAPI(LexicalizedParser lp)
        {
            // Demonstrates two ways of driving the Stanford parser:
            // (1) parsing a pre-tokenized word list, and
            // (2) tokenizing raw text with an explicit PTB tokenizer,
            // then printing the parse tree and its typed dependencies.

            // This option shows parsing a list of correctly tokenized words
            var sent = new[] { "This", "is", "an", "easy", "sentence", "." };

            java.util.List rawWords = Sentence.toCoreLabelList(sent);
            Tree           parse    = lp.apply(rawWords);

            parse.pennPrint();

            // This option shows loading and using an explicit tokenizer
            const string     Sent2            = "This is another sentence.";
            TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(Sent2);

            java.util.List rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            parse = lp.apply(rawWords2);

            // Derive the grammatical structure for dependency extraction.
            var tlp = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
            GrammaticalStructure        gs  = gsf.newGrammaticalStructure(parse);

            // CC-processed typed dependencies of the second parse.
            java.util.List tdl = gs.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", tdl);

            // Print the tree in Penn format with collapsed typed dependencies.
            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(parse);
        }
Ejemplo n.º 15
0
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: private void doTestTokenizer(String tokenizer) throws java.io.IOException
        private void doTestTokenizer(string tokenizer)
        {
            // Looks up the factory class registered under the given SPI name,
            // instantiates it, and sanity-checks the resulting tokenizer factory.
            //
            // Fix: the converter emitted `Type <?>` for the Java wildcard
            // `Class<? extends TokenizerFactory>`, which is not valid C#.
            // The non-generic System.Type is the correct .NET equivalent.
            Type             factoryClazz = TokenizerFactory.lookupClass(tokenizer);
            TokenizerFactory factory      = (TokenizerFactory)initialize(factoryClazz);

            if (factory != null)
            {
                // we managed to fully create an instance. check a few more things:

                // if it implements MultiTermAware, sanity check its impl
                if (factory is MultiTermAwareComponent)
                {
                    AbstractAnalysisFactory mtc = ((MultiTermAwareComponent)factory).MultiTermComponent;
                    assertNotNull(mtc);
                    // its not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it
                    assertFalse(mtc is CharFilterFactory);
                }

                // beast it just a little, it shouldnt throw exceptions:
                // (it should have thrown them in initialize)
                checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false);
            }
        }
Ejemplo n.º 16
0
        private List <CodeRegion> Act(string filePath)
        {
            // Parse the file with a Java tokenizer and materialize its regions.
            var javaTokenizer = TokenizerFactory.GetFor(DemoLanguage.Java);
            var regionParser  = new RegionParser(filePath, javaTokenizer);

            return regionParser.GetRegions().ToList();
        }
Ejemplo n.º 17
0
        public static DemoCodeBuilder Initialize(string filePath, DemoLanguage language)
        {
            // Tokenize and parse the file for its regions, then wrap them
            // in a demo-code builder.
            var regionParser = new RegionParser(filePath, TokenizerFactory.GetFor(language));

            return new DemoCodeBuilder(filePath, regionParser.GetRegions().ToList());
        }
Ejemplo n.º 18
0
        private string Convert(string phrase, TokenizerMapConverter tokenizerMapConverter)
        {
            // Segment the phrase with the mapped tokenizer, then run the
            // segments through the mapped words converter.
            var segments  = TokenizerFactory.Create(tokenizerMapConverter.Tokenizer).Segment(phrase);
            var converter = WordsConverterFactory.Create(tokenizerMapConverter.Converter);

            return converter.Convert(segments);
        }
Ejemplo n.º 19
0
        public AnalyzerFactory(ResourceLoader resourceLoader)
        {
            // Builds the analysis-chain factories used by this analyzer:
            // whitespace tokenization, lower-casing, synonym expansion,
            // common grams (index and query variants), word delimiting,
            // KStem stemming, and front edge n-grams. Each ResourceLoaderAware
            // factory is initialized with java.util.HashMap args and informed
            // of the resource loader so it can read its data files.
            _tokenizerFactory = new WhitespaceTokenizerFactory();
            _lowerCaseFactory = new LowerCaseFilterFactory();

            // Synonyms: case-insensitive, no expansion, rules from synonyms.txt.
            _synonymFactory = new SynonymFilterFactory();
            {
                var args = new java.util.HashMap();
                args.put("ignoreCase", "true");
                args.put("expand", "false");
                args.put("synonyms", "synonyms.txt");
                _synonymFactory.init(args);
                ((ResourceLoaderAware)_synonymFactory).inform(resourceLoader);
            }

            // Common grams for indexing.
            _commonGramsFactory = new CommonGramsFilterFactory();
            {
                var args = new java.util.HashMap();
                args.put("ignoreCase", "true");
                _commonGramsFactory.init(args);
                ((ResourceLoaderAware)_commonGramsFactory).inform(resourceLoader);
            }

            // Common grams for query analysis.
            _commonGramsQueryFactory = new CommonGramsQueryFilterFactory();
            {
                var args = new java.util.HashMap();
                args.put("ignoreCase", "true");
                _commonGramsQueryFactory.init(args);
                ((ResourceLoaderAware)_commonGramsQueryFactory).inform(resourceLoader);
            }

            // Word delimiting: concatenate split words/numbers; protwords.txt
            // lists terms that must not be modified.
            _wordDelimiterFactory = new WordDelimiterFilterFactory();
            {
                var args = new java.util.HashMap();
                args.put("catenateWords", "1");
                args.put("catenateNumbers", "1");
                args.put("protected", "protwords.txt");
                _wordDelimiterFactory.init(args);
                ((ResourceLoaderAware)_wordDelimiterFactory).inform(resourceLoader);
            }

            // KStem stemming, also honoring the protected-words list.
            _stemmerFactory = new KStemFilterFactory();
            {
                var args = new java.util.HashMap();
                args.put("protected", "protwords.txt");
                _stemmerFactory.init(args);
                ((ResourceLoaderAware)_stemmerFactory).inform(resourceLoader);
            }

            // Front edge n-grams, minimum gram size 2.
            _edgeNGramFactory = new EdgeNGramTokenFilterFactory();
            {
                var args = new java.util.HashMap();
                args.put("side", "FRONT");
                args.put("minGramSize", 2);
                _edgeNGramFactory.init(args);
                ((ResourceLoaderAware)_edgeNGramFactory).inform(resourceLoader);
            }
        }
Ejemplo n.º 20
0
        public void TestList(string ch, AsmToken[] tk)
        {
            // Tokenize the input and compare the produced token kinds
            // against the expected sequence.
            var tokenizer = TokenizerFactory.Create();

            var actualKinds = tokenizer
                              .Tokenize(ch)
                              .Select(token => token.Kind)
                              .ToList();

            Assert.Equal(tk, actualKinds);
        }
Ejemplo n.º 21
0
 public AnalyzerFactory(IList <CharFilterFactory> charFilterFactories,
                        TokenizerFactory tokenizerFactory,
                        IList <TokenFilterFactory> tokenFilterFactories)
 {
     // The tokenizer factory is the only mandatory component of the chain.
     Debug.Assert(null != tokenizerFactory);

     this.charFilterFactories  = charFilterFactories;
     this.tokenizerFactory     = tokenizerFactory;
     this.tokenFilterFactories = tokenFilterFactories;
 }
Ejemplo n.º 22
0
 internal FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter)
 {
     // A tokenizer is required; the filter factories may be null.
     if (Debugging.AssertsEnabled)
     {
         Debugging.Assert(tokenizer != null);
     }

     this.charFilter  = charFilter;
     this.tokenfilter = tokenfilter;
     this.tokenizer   = tokenizer;
 }
Ejemplo n.º 23
0
 public BotSharpTokenizer()
 {
     // Word/punctuation regex tokenization, keeping "'s" as a single token.
     var options = new TokenizationOptions
     {
         Pattern      = RegexTokenizer.WORD_PUNC,
         SpecialWords = new List <string> {
             "'s"
         }
     };

     _tokenizer = new TokenizerFactory <RegexTokenizer>(options, SupportedLanguage.English);
 }
Ejemplo n.º 24
0
        public override void run(string format, string[] args)
        {
            // Command-line tokenizer training: validates or derives the training
            // parameters, trains a TokenizerME model from the sample stream, and
            // writes the resulting model file.
            base.run(format, args);

            mlParams = CmdLineUtil.loadTrainingParameters(@params.Params, false);

            if (mlParams != null)
            {
                if (!TrainUtil.isValid(mlParams.Settings))
                {
                    throw new TerminateToolException(1, "Training parameters file '" + @params.Params + "' is invalid!");
                }

                // Tokenizer training only supports event (non-sequence) training.
                if (TrainUtil.isSequenceTraining(mlParams.Settings))
                {
                    throw new TerminateToolException(1, "Sequence training is not supported!");
                }
            }

            // No parameters file supplied: fall back to iterations/cutoff flags.
            if (mlParams == null)
            {
                mlParams = ModelUtil.createTrainingParameters(@params.Iterations.Value, @params.Cutoff.Value);
            }

            File modelOutFile = @params.Model;

            // Fail early if the model output path is not writable.
            CmdLineUtil.checkOutputFile("tokenizer model", modelOutFile);

            TokenizerModel model;

            try
            {
                // Optional abbreviation dictionary for the tokenizer factory.
                Dictionary dict = loadDict(@params.AbbDict);

                TokenizerFactory tokFactory = TokenizerFactory.create(@params.Factory, @params.Lang, dict, @params.AlphaNumOpt.Value, null);
                model = opennlp.tools.tokenize.TokenizerME.train(sampleStream, tokFactory, mlParams);
            }
            catch (IOException e)
            {
                throw new TerminateToolException(-1, "IO error while reading training data or indexing data: " + e.Message, e);
            }
            finally
            {
                try
                {
                    sampleStream.close();
                }
                catch (IOException)
                {
                    // sorry that this can fail
                }
            }

            CmdLineUtil.writeModel("tokenizer", modelOutFile, model);
        }
Ejemplo n.º 25
0
 public AnalyzerFactory(IList <CharFilterFactory> charFilterFactories,
                        TokenizerFactory tokenizerFactory,
                        IList <TokenFilterFactory> tokenFilterFactories)
 {
     // The tokenizer factory is the only mandatory component of the chain.
     if (Debugging.AssertsEnabled)
     {
         Debugging.Assert(null != tokenizerFactory);
     }

     this.charFilterFactories  = charFilterFactories;
     this.tokenizerFactory     = tokenizerFactory;
     this.tokenFilterFactories = tokenFilterFactories;
 }
Ejemplo n.º 26
0
        public void CookingTest()
        {
            // All corpus and model files live under this directory.
            var corpusDir = Path.Combine(Configuration.GetValue <String>("MachineLearning:dataDir"), "Text Classification", "cooking.stackexchange");

            var reader    = new FasttextDataReader();
            var sentences = reader.Read(new ReaderOptions
            {
                DataDir  = corpusDir,
                FileName = "cooking.stackexchange.txt"
            });

            // Re-tokenize the raw text with the Treebank tokenizer.
            var tokenizer = new TokenizerFactory(new TokenizationOptions {
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <TreebankTokenizer>();

            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            // Carry the labels over from the raw sentences.
            for (int idx = 0; idx < tokenized.Count; idx++)
            {
                tokenized[idx].Label = sentences[idx].Label;
            }
            sentences = tokenized.ToList();

            sentences.Shuffle();

            var options = new ClassifyOptions
            {
                ModelFilePath     = Path.Combine(corpusDir, "nb.model"),
                TrainingCorpusDir = corpusDir,
                Dimension         = 100
            };
            var classifier = new ClassifierFactory <SentenceFeatureExtractor>(options, SupportedLanguage.English);

            // Train on 70% and measure accuracy on the remaining 30%.
            var dataset = sentences.Split(0.7M);

            classifier.Train(dataset.Item1);

            int correct = 0;
            int total   = 0;

            dataset.Item2.ForEach(sample =>
            {
                var predictions = classifier.Classify(sample);
                if (sample.Label == predictions[0].Item1)
                {
                    correct++;
                }
                total++;
            });

            Assert.IsTrue((float)correct / total > 0.5);
        }
Ejemplo n.º 27
0
        public void SpookyAuthorIdentification()
        {
            // Load the Kaggle "Spooky Author" training data.
            var reader    = new KaggleTextDataReader();
            var sentences = reader.Read(new ReaderOptions {
                FileName = "train.csv"
            });

            // Re-tokenize with the Treebank tokenizer, keeping ids and labels.
            var tokenizer = new TokenizerFactory(new TokenizationOptions {
            }, SupportedLanguage.English);

            tokenizer.GetTokenizer <TreebankTokenizer>();

            var tokenized = tokenizer.Tokenize(sentences.Select(x => x.Text).ToList());

            for (int idx = 0; idx < tokenized.Count; idx++)
            {
                tokenized[idx].Id    = sentences[idx].Id;
                tokenized[idx].Label = sentences[idx].Label;
            }
            sentences = tokenized.ToList();

            // Train Naive Bayes on 70% of a 2000-sentence shuffled sample.
            sentences.Shuffle();
            var dataset = sentences.Take(2000).ToList().Split(0.7M);

            var options = new ClassifyOptions
            {
                ModelDir      = AppContext.BaseDirectory,
                ModelFilePath = Path.Combine(AppContext.BaseDirectory, "nb.model"),
                Dimension     = 300
            };
            var classifier = new ClassifierFactory <SentenceFeatureExtractor>(options, SupportedLanguage.English);

            classifier.GetClassifer("NaiveBayesClassifier");
            classifier.Train(dataset.Item1);

            // Measure accuracy on the held-out 30%.
            int correct = 0;
            int total   = 0;

            dataset.Item2.ForEach(sample =>
            {
                var predictions = classifier.Classify(sample);
                if (sample.Label == predictions[0].Item1)
                {
                    correct++;
                }
                total++;
            });

            Assert.IsTrue((float)correct / total > 0.5);
        }
Ejemplo n.º 28
0
        public void Inform(IResourceLoader loader)
        {
            // Optional tokenizer used while parsing the synonym rules.
            TokenizerFactory tokFactory = tf == null ? null : LoadTokenizerFactory(loader, tf);

            // Load the raw rules and parse them into a fresh synonym map.
            IEnumerable <string> wlist = LoadRules(synonyms, loader);

            synMap = new SlowSynonymMap(ignoreCase);
            ParseRules(wlist, synMap, "=>", ",", expand, tokFactory);
        }
Ejemplo n.º 29
0
        public NlpService()
        {
            // Load the PCFG parser model; fail fast if it cannot be loaded.
            const string parserFileOrUrl = "englishPCFG.ser.gz";

            _lp = LexicalizedParser.loadModel(parserFileOrUrl);
            if (_lp == null)
            {
                throw new InvalidOperationException("couldn't load " + parserFileOrUrl);
            }

            // Penn Treebank tokenizer and grammatical-structure factory.
            _tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            _tlp = new PennTreebankLanguagePack();
            _structureFactory = _tlp.grammaticalStructureFactory();
        }
Ejemplo n.º 30
0
        private void Init()
        {
            // Lazily builds the tokenizer from configuration on first use.
            if (_tokenizer == null)
            {
                _tokenizer = new TokenizerFactory(new TokenizationOptions
                {
                    Pattern = Configuration.GetValue <String>("options:pattern")
                }, SupportedLanguage.English);

                // Fix: was $"tokenizer" — an interpolated string with no holes;
                // a plain literal is equivalent and clearer.
                string tokenizerName = Configuration.GetValue <String>("tokenizer");

                _tokenizer.GetTokenizer(tokenizerName);
            }
        }