예제 #1
0
        /// <summary>
        /// Loads the tokenizer model from the path held in <c>TokenizerModel</c> and
        /// builds the ME tokenizer; on I/O failure the tokenizer field is left null.
        /// </summary>
        private void InitializeTokenizer()
        {
            InputStream modelStream = null;

            try
            {
                // NOTE(review): "TokenizerModel" is a path member that shadows the model type name.
                modelStream = new FileInputStream(TokenizerModel);
                tokenizer = new TokenizerME(new TokenizerModel(modelStream));
            }
            catch (IOException)
            {
                // Model could not be read; callers must cope with a null tokenizer.
                tokenizer = null;
            }
            finally
            {
                if (modelStream != null)
                {
                    try
                    {
                        modelStream.close();
                    }
                    catch (IOException)
                    {
                        // best-effort close; nothing useful to do here
                    }
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Loads the token model from <c>TokenModelPath</c> and builds a TokenizerME.
        /// </summary>
        /// <returns>A ready-to-use maximum-entropy tokenizer.</returns>
        private static TokenizerME PrepareTokenizer()
        {
            var tokenInputStream = new FileInputStream(TokenModelPath);  //load the token model into a stream
            try
            {
                var tokenModel = new TokenizerModel(tokenInputStream);   //load the token model
                return new TokenizerME(tokenModel);                      //create the tokenizer
            }
            finally
            {
                // FIX: the original leaked the file stream; TokenizerModel reads it
                // fully in its constructor, so it is safe to close here.
                tokenInputStream.close();
            }
        }
예제 #3
0
        /// <summary>
        /// Tokenizes every line of the lab5 result file and writes the tokens to
        /// lab6result.txt, starting a new output line after each "." token.
        /// </summary>
        static void Main(string[] args)
        {
            String[] files = Directory.GetFiles(@"..\..\..\..\Dataset", "lab5result.txt");

            // FIX: load the tokenizer model once instead of once per input line,
            // and close the model stream (the original leaked one stream per line).
            InputStream modelIn = new FileInputStream(@"..\..\..\..\en-token.bin");
            TokenizerModel model = new TokenizerModel(modelIn);
            modelIn.close();
            TokenizerME enTokenizer = new TokenizerME(model);

            // FIX: dispose the writer even if an exception occurs.
            using (StreamWriter sw = new StreamWriter(@"..\..\..\..\lab6result.txt", false))
            {
                foreach (string file in files)
                {
                    using (StreamReader sr = new StreamReader(file))
                    {
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            string[] tokens = enTokenizer.tokenize(line);

                            for (int i = 0; i < tokens.Length; i++)
                            {
                                sw.Write(tokens[i] + " ");
                                // Break the output line after each sentence-final period token.
                                if (tokens[i].Equals("."))
                                {
                                    sw.Write("\r\n");
                                }
                            }
                        }
                    }
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Tokenizes every line of every .html file in the dataset folder and writes
        /// the tokens to ReadByTokens.txt, breaking the line after each "." token.
        /// </summary>
        static void Main(string[] args)
        {
            String[] file = Directory.GetFiles(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\lab6\Dataset", "*.html");

            // FIX: load the tokenizer model once instead of once per input line,
            // and close the model stream (the original leaked one stream per line).
            InputStream modelIn = new FileInputStream(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\en-token.bin");
            TokenizerModel model = new TokenizerModel(modelIn);
            modelIn.close();
            TokenizerME enTokenizer = new TokenizerME(model);

            // FIX: dispose the writer even if an exception occurs.
            using (StreamWriter sw = new StreamWriter(@"ReadByTokens.txt"))
            {
                foreach (string filename in file)
                {
                    using (StreamReader sr = new StreamReader(filename))
                    {
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            string[] tokens = enTokenizer.tokenize(line);

                            for (int i = 0; i < tokens.Length; i++)
                            {
                                sw.Write(tokens[i] + " ");

                                // Break the output line after each sentence-final period token.
                                if (tokens[i].Equals("."))
                                {
                                    sw.Write("\n");
                                }
                            }
                        }
                    }
                }
            }
        }
예제 #5
0
        /// <summary>
        /// Tokenizes every line of every .html file in the dataset folder and writes
        /// the tokens to TokensReader.txt, breaking the line after each "." token.
        /// </summary>
        static void Main(string[] args)
        {
            String[] files = Directory.GetFiles(@"C:\10311173\lab-5-opennlp-tokenization-Changyiyu\Dataset", "*.html");

            // FIX: load the tokenizer model once instead of once per input line,
            // and close the model stream (the original leaked one stream per line).
            InputStream modelIn = new FileInputStream(@"C:\10311173\lab-6-opennlp-ju-zi-qie-fen-Changyiyu\en-token.bin");
            TokenizerModel model = new TokenizerModel(modelIn);
            modelIn.close();
            TokenizerME enTokenizer = new TokenizerME(model);

            // FIX: dispose the writer even if an exception occurs.
            using (StreamWriter sw = new StreamWriter(@"TokensReader.txt"))
            {
                foreach (string thefile in files)
                {
                    using (StreamReader sr = new StreamReader(thefile))
                    {
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            string[] tokens = enTokenizer.tokenize(line);

                            for (int i = 0; i < tokens.Length; i++)
                            {
                                sw.Write(tokens[i] + " ");
                                // Break the output line after each sentence-final period token.
                                if (tokens[i].Equals("."))
                                {
                                    sw.Write("\r\n");
                                }
                            }
                        }
                    }
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Tokenizes a review string using the OpenNLP token model.
        /// Periods are stripped before tokenizing, so no "." tokens are returned.
        /// </summary>
        /// <param name="review">Raw review text.</param>
        /// <returns>The tokens of the review (periods removed).</returns>
        string[] Tokenizer(string review)
        {
            InputStream modelIn = new FileInputStream(modelPath + "en-token.zip");
            try
            {
                TokenizerModel model = new TokenizerModel(modelIn);
                TokenizerME tokenizer = new TokenizerME(model);

                return tokenizer.tokenize(review.Replace(".", ""));
            }
            finally
            {
                // FIX: close the model stream (the original leaked one handle per call).
                modelIn.close();
            }
        }
예제 #7
0
        /// <summary>
        /// Tokenizes a paragraph and returns the character spans of each token.
        /// </summary>
        /// <param name="paragraph">Text to tokenize.</param>
        /// <returns>Start/end offsets of every token in the paragraph.</returns>
        public Span[] GetTokens(string paragraph)
        {
            var bin = GetFileStream("en-token.bin");
            try
            {
                TokenizerModel model = new TokenizerModel(bin);
                TokenizerME tokenizer = new TokenizerME(model);

                return tokenizer.tokenizePos(paragraph);
            }
            finally
            {
                // FIX: close the stream even when model loading or tokenizing throws
                // (the original only closed it on the success path).
                bin.close();
            }
        }
예제 #8
0
 /// <summary>
 /// Returns a tokenizer op backed by the named model, or a default op when no
 /// model name is given. Throws if the name is not in the model cache.
 /// </summary>
 public static NLPTokenizerOp GetTokenizer(string modelName)
 {
     if (modelName == null)
     {
         return new NLPTokenizerOp();
     }

     TokenizerModel model = tokenizerModels[modelName];
     return new NLPTokenizerOp(model);
 }
예제 #9
0
 /// <summary>
 /// Returns the cached tokenizer model for <paramref name="modelName"/>, loading
 /// it from the resource loader and caching it on first use.
 /// </summary>
 public static TokenizerModel GetTokenizerModel(string modelName, IResourceLoader loader)
 {
     // Fast path: already loaded and non-null.
     if (tokenizerModels.TryGetValue(modelName, out TokenizerModel model) && model != null)
     {
         return model;
     }

     using (Stream resource = loader.OpenResource(modelName))
     {
         model = new TokenizerModel(new ikvm.io.InputStreamWrapper(resource));
     }
     tokenizerModels[modelName] = model;
     return model;
 }
예제 #10
0
        /// <summary>
        /// Static initializer: resolves the tokenizer model under
        /// ~/Files/TextAnalytics and loads it into the shared ModelIn/Model statics.
        /// </summary>
        static Tokenizer()
        {
            string modelFile = HttpContext.Current.Server.MapPath("~/Files/TextAnalytics/en-token.bin");

            // Fail fast with a clear message if the deployed model is missing.
            if (!File.Exists(modelFile))
            {
                throw new FileNotFoundException("Unable to find tokenizer model file at " + modelFile);
            }

            ModelIn = new java.io.FileInputStream(modelFile);
            Model = new TokenizerModel(ModelIn);
        }
예제 #11
0
파일: OpenNLP.cs 프로젝트: baio/d-mill
        /// <summary>
        /// Demo: loads the English token model and tokenizes the whole
        /// uspe-sentenced.txt file. The result is computed but not used.
        /// </summary>
        public void Tokenize()
        {
            var modelStream = new java.io.FileInputStream("../../Models/en-token.bin");

            TokenizerModel model;
            try
            {
                model = new TokenizerModel(modelStream);
            }
            finally
            {
                // FIX: the original never closed the model stream; the model is fully
                // read in the constructor, so closing here is safe.
                modelStream.close();
            }

            var tokenizer = new TokenizerME(model);

            var txt = File.ReadAllText(@"c:\dev\d-mill\uspe\Data\uspe-sentenced.txt");

            var tokens = tokenizer.tokenize(txt);
        }
예제 #12
0
        /// <summary>
        /// Static initializer: reads the model path from the "ModelTokenizer"
        /// App.Config setting, resolves it against the base directory, and loads the
        /// shared ModelIn/Model statics.
        /// </summary>
        static Tokenizer()
        {
            string modelFile = ConfigurationManager.AppSettings["ModelTokenizer"] ?? string.Empty;

            if (string.IsNullOrWhiteSpace(modelFile))
            {
                throw new Exception("ModelTokenizer setting not defined in App.Config");
            }

            modelFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, modelFile);

            if (!File.Exists(modelFile))
            {
                throw new FileNotFoundException("Unable to find tokenizer model file at " + modelFile);
            }

            ModelIn = new java.io.FileInputStream(modelFile);
            Model = new TokenizerModel(ModelIn);
        }
예제 #13
0
        /// <summary>
        /// CLI entry point: expects exactly one argument (the tokenizer model file);
        /// otherwise prints the help text.
        /// </summary>
        public override void run(string[] args)
        {
            // Guard clause: wrong arity just prints usage and returns.
            if (args.Length != 1)
            {
                Console.WriteLine(Help);
                return;
            }

            TokenizerModel model = (new TokenizerModelLoader()).load(new File(args[0]));
            CommandLineTokenizer tokenizer = new CommandLineTokenizer(new opennlp.tools.tokenize.TokenizerME(model));
            tokenizer.process();
        }
예제 #14
0
        /// <summary>
        /// Strips HTML markup from every line of every .html dataset file, tokenizes
        /// the remaining text, and writes the tokens to Html.txt (new line after each
        /// "." token).
        /// </summary>
        static void Main(string[] args)
        {
            String[] file = Directory.GetFiles(@"..\..\..\..\Dataset\", "*.html");

            // FIX: build the HTML-stripping regexes once instead of per input line.
            Regex openTags  = new Regex(@"(<P>|<BR />|<DIV>)");
            Regex closeTags = new Regex(@"(</P>|<p/>|</DIV>)");
            string regFind       = @"<a.*?>" + @"(?'text'.*?)" + @"</a>";
            string regReplace    = @"${text}";
            string regFindimg    = @"<img.*?title=""" + @"(?'text2'.*?)" + @""" />";
            string regReplaceimg = @"${text2}";

            // FIX: load the tokenizer model once instead of once per input line,
            // and close the model stream (the original leaked one stream per line).
            InputStream modelIn = new FileInputStream(@"..\..\..\..\en-token.bin");
            TokenizerModel model = new TokenizerModel(modelIn);
            modelIn.close();
            TokenizerME enTokenizer = new TokenizerME(model);

            // FIX: dispose the writer even if an exception occurs.
            using (StreamWriter sw = new StreamWriter(@"..\..\..\Html.txt"))
            {
                foreach (String files in file)
                {
                    using (StreamReader sr = new StreamReader(files))
                    {
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            // Strip opening/closing block tags, entities, and unwrap anchors.
                            line = openTags.Replace(line, "");
                            line = closeTags.Replace(line, "");
                            line = Regex.Replace(line, @"&nbsp;", "");
                            line = Regex.Replace(line, regFind, regReplace);

                            // Unwrap image titles just before tokenizing.
                            String[] tokens = enTokenizer.tokenize(Regex.Replace(line, regFindimg, regReplaceimg));
                            for (int i = 0; i < tokens.Length; i++)
                            {
                                sw.Write(tokens[i] + " ");
                                if (tokens[i].Equals("."))
                                {
                                    sw.Write("\n");
                                }
                            }
                        }
                    }
                }
            }
        }
예제 #15
0
        /// <summary>
        /// Static initializer: reads the tokenizer model path from the
        /// "ModelTokenizer" App.Config setting, resolves it against the application
        /// base directory, and loads the shared ModelIn/Model statics.
        /// </summary>
        static Tokenizer()
        {
            string modelFile = ConfigurationManager.AppSettings["ModelTokenizer"] ?? string.Empty;

            // Fail fast when the setting is absent or blank.
            if (string.IsNullOrWhiteSpace(modelFile))
            {
                throw new Exception("ModelTokenizer setting not defined in App.Config");
            }

            modelFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, modelFile);

            // Fail fast when the resolved file does not exist on disk.
            if (!File.Exists(modelFile))
            {
                throw new FileNotFoundException("Unable to find tokenizer model file at " + modelFile);
            }

            ModelIn = new java.io.FileInputStream(modelFile);
            Model = new TokenizerModel(ModelIn);
        }
        /// <summary>
        /// Runs the tokenizer evaluation over the sample stream, optionally logging
        /// misclassified samples, and prints the resulting F-measure.
        /// </summary>
        public override void run(string format, string[] args)
        {
            base.run(format, args);

            TokenizerModel model = (new TokenizerModelLoader()).load(@params.Model);

            // Attach an error listener only when the user asked to see misclassified samples.
            TokenizerEvaluationMonitor misclassifiedListener =
                @params.Misclassified.Value ? new TokenEvaluationErrorListener() : null;

            var evaluator = new TokenizerEvaluator(new opennlp.tools.tokenize.TokenizerME(model), misclassifiedListener);

            Console.Write("Evaluating ... ");

            try
            {
                evaluator.evaluate(sampleStream);
            }
            catch (IOException e)
            {
                Console.Error.WriteLine("failed");
                throw new TerminateToolException(-1, "IO error while reading test data: " + e.Message, e);
            }
            finally
            {
                try
                {
                    sampleStream.close();
                }
                catch (IOException)
                {
                    // sorry that this can fail
                }
            }

            Console.WriteLine("done");
            Console.WriteLine();
            Console.WriteLine(evaluator.FMeasure);
        }
예제 #17
0
        /// <summary>
        /// Verifies that a custom (dummy) tokenizer factory round-trips through model
        /// serialization with its dictionary, context generator, pattern and language
        /// settings intact.
        /// </summary>
        public void TestDummyFactory()
        {
            const string lang    = "es";
            const string pattern = "^[0-9A-Za-z]+$";

            var abbreviations = LoadAbbDictionary();
            var model         = Train(new DummyTokenizerFactory(lang, abbreviations, true, pattern));

            // The freshly trained model must expose the dummy factory and its parts.
            Assert.IsInstanceOf(typeof(DummyTokenizerFactory), model.Factory);

            var factory = model.Factory;

            Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyDictionary), factory.AbbreviationDictionary);
            Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyContextGenerator), factory.ContextGenerator);

            Assert.AreEqual(pattern, factory.AlphaNumericPattern);
            Assert.AreEqual(lang, factory.LanguageCode);
            Assert.AreEqual(lang, model.Language);
            Assert.AreEqual(true, factory.UseAlphaNumericOptimization);

            // Serialize, reload, and re-check every property on the deserialized model.
            using (var data = new MemoryStream()) {
                model.Serialize(new UnclosableStream(data));

                data.Seek(0, SeekOrigin.Begin);

                var reloaded = new TokenizerModel(data);

                Assert.IsInstanceOf(typeof(DummyTokenizerFactory), reloaded.Factory);

                factory = reloaded.Factory;

                Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyDictionary), factory.AbbreviationDictionary);
                Assert.IsInstanceOf(typeof(DummyTokenizerFactory.DummyContextGenerator), factory.ContextGenerator);

                Assert.AreEqual(pattern, factory.AlphaNumericPattern);
                Assert.AreEqual(lang, factory.LanguageCode);
                Assert.AreEqual(lang, reloaded.Language);
                Assert.AreEqual(true, factory.UseAlphaNumericOptimization);
            }
        }
예제 #18
0
        /// <summary>
        /// Loads all OpenNLP models (sentence detector, tokenizer, POS tagger,
        /// chunker, parser) and the stop-word list.
        /// FIX: every model stream and the stop-word reader are now closed; the
        /// original leaked all six file handles. Each model constructor reads its
        /// stream fully, so closing right after construction is safe.
        /// </summary>
        public NLP()
        {
            //loading sentence detector model
            java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-sent.bin");
            SentenceModel sentenceModel = new SentenceModel(modelInpStream);
            modelInpStream.close();

            sentenceDetector = new SentenceDetectorME(sentenceModel);

            //loading tokenizer model
            modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
            TokenizerModel tokenizerModel = new TokenizerModel(modelInpStream);
            modelInpStream.close();

            tokenizer = new TokenizerME(tokenizerModel);

            //loading POS tagger model
            modelInpStream = new java.io.FileInputStream("Resources\\en-pos-maxent.bin");
            POSModel posModel = new POSModel(modelInpStream);
            modelInpStream.close();

            tagger = new POSTaggerME(posModel);

            //loading chunker model
            modelInpStream = new java.io.FileInputStream("Resources\\en-chunker.bin");
            ChunkerModel chunkerModel = new ChunkerModel(modelInpStream);
            modelInpStream.close();

            chunker = new ChunkerME(chunkerModel);

            //loading parser model
            modelInpStream = new java.io.FileInputStream("Resources\\en-parser-chunking.bin");
            ParserModel parserModel = new ParserModel(modelInpStream);
            modelInpStream.close();

            parser = ParserFactory.create(parserModel);

            //loading stop words list (both the stemmed and the raw form are stored)
            using (StreamReader sr = new StreamReader("Resources\\english.stop.txt"))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    stopwords.Add(Stemming(line));
                    stopwords.Add(line);
                }
            }
        }
예제 #19
0
        /// <summary>
        /// Strips HTML from every line of every .html dataset file, tokenizes the
        /// text, and writes the tokens to ReadByTokens.txt (new line after each "."
        /// token).
        /// </summary>
        static void Main(string[] args)
        {
            String[] file = Directory.GetFiles(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\Dataset", "*.html");

            // FIX: load the tokenizer model once instead of once per input line,
            // and close the model stream (the original leaked one stream per line).
            InputStream modelIn = new FileInputStream(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\en-token.bin");
            TokenizerModel model = new TokenizerModel(modelIn);
            modelIn.close();
            TokenizerME enTokenizer = new TokenizerME(model);

            // FIX: dispose the writer even if an exception occurs.
            using (StreamWriter sw = new StreamWriter(@"ReadByTokens.txt"))
            {
                foreach (string filename in file)
                {
                    using (StreamReader sr = new StreamReader(filename))
                    {
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            // Strip tags and entities; unwrap image titles and anchor text.
                            line = Regex.Replace(line, "<P[^>]*>", ""); //[^>] matches anything that is not >
                            line = Regex.Replace(line, @"&nbsp;", "");
                            line = Regex.Replace(line, "<DIV[^>]*>", "");
                            line = Regex.Replace(line, "<BR[^>]*>", "");
                            line = Regex.Replace(line, "<img[^>]*title=\"(?'titleName'.*?)\"[^>]*>", "${titleName}");
                            line = Regex.Replace(line, "<[^>]*href.*>(?'Name'.*?)<[^>]*>", "${Name}");
                            line = Regex.Replace(line, "<[^>]*>", "");

                            string[] tokens = enTokenizer.tokenize(line);
                            for (int i = 0; i < tokens.Length; i++)
                            {
                                sw.Write(tokens[i] + " ");
                                if (tokens[i].Equals("."))
                                {
                                    sw.Write("\n");
                                }
                            }
                        }
                    }
                }
            }
        }
        /// <summary>
        /// For each input file: detects sentences on every line, tokenizes the joined
        /// sentences, and writes the space-separated tokens (one input line per
        /// output line) to a same-named file in the answer folder.
        /// </summary>
        static void Main(string[] args)
        {
            java.io.InputStream modelIn  = new java.io.FileInputStream(string.Format("en-sent.bin"));
            java.io.InputStream modelIn2 = new java.io.FileInputStream(string.Format("en-token.bin"));
            TokenizerModel model = new TokenizerModel(modelIn2);
            TokenizerME mE = new TokenizerME(model);
            SentenceModel sM = new SentenceModel(modelIn);
            SentenceDetector detector = new SentenceDetectorME(sM);
            // FIX: close both model streams (the original leaked them); the models
            // are fully read by their constructors.
            modelIn.close();
            modelIn2.close();
            string folderName = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\file";

            foreach (string fname in System.IO.Directory.GetFiles(folderName))
            {
                // FIX: Path.GetFileName replaces the fragile fname.Split('\\')[6],
                // which breaks whenever the folder depth changes.
                string outPath = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\answer\" + System.IO.Path.GetFileName(fname);

                // FIX: dispose reader and writer deterministically (the original
                // never disposed the reader and leaked the writer on exception).
                using (StreamWriter sw = new StreamWriter(outPath))
                using (StreamReader file2 = new StreamReader(fname))
                {
                    string line;
                    while ((line = file2.ReadLine()) != null)
                    {
                        string[] sents = detector.sentDetect(line);
                        if (sents.Length == 0)
                        {
                            continue;
                        }

                        // Re-join the detected sentences before tokenizing.
                        string str = string.Concat(sents);

                        foreach (var token in mE.tokenize(str))
                        {
                            sw.Write(token + " ");
                        }
                        sw.WriteLine();
                    }
                }
            }
        }
예제 #21
0
        /// <summary>
        /// Verifies the default TokenizerFactory round-trips through model
        /// serialization: dictionary, context generator, pattern, language and the
        /// alphanumeric-optimization flag all survive.
        /// </summary>
        public void TestDefault()
        {
            const string lang = "es";

            var abbreviations = LoadAbbDictionary();
            var model         = Train(new TokenizerFactory(lang, abbreviations, false, null));
            var factory       = model.Factory;

            Assert.IsInstanceOf(typeof(Dict), factory.AbbreviationDictionary);
            Assert.IsInstanceOf(typeof(DefaultTokenContextGenerator), factory.ContextGenerator);

            Assert.AreEqual(Factory.DefaultAlphanumeric, factory.AlphaNumericPattern);
            Assert.AreEqual(lang, factory.LanguageCode);
            Assert.AreEqual(lang, model.Language);

            Assert.AreEqual(false, factory.UseAlphaNumericOptimization);

            // Serialize, reload, and re-check everything on the deserialized model.
            using (var data = new MemoryStream()) {
                model.Serialize(new UnclosableStream(data));

                data.Seek(0, SeekOrigin.Begin);

                var reloaded = new TokenizerModel(data);

                factory = reloaded.Factory;

                Assert.IsInstanceOf(typeof(Dict), factory.AbbreviationDictionary);
                Assert.IsInstanceOf(typeof(DefaultTokenContextGenerator), factory.ContextGenerator);

                Assert.AreEqual(Factory.DefaultAlphanumeric, factory.AlphaNumericPattern);
                Assert.AreEqual(lang, factory.LanguageCode);
                Assert.AreEqual(lang, reloaded.Language);

                Assert.AreEqual(false, factory.UseAlphaNumericOptimization);
            }
        }
예제 #22
0
        /// <summary>
        /// Builds the MUC coreference sample stream: reads MUC documents, enhances
        /// them with full parses and name-finder annotations, then inserts mentions.
        /// All model paths come from the parsed command-line parameters.
        /// </summary>
        public override ObjectStream <CorefSample> create(string[] args)
        {
            Parameters @params = ArgumentParser.parse(args, typeof(Parameters));

            // Full syntactic parser used to enhance the raw samples.
            ParserModel parserModel = (new ParserModelLoader()).load(@params.ParserModel);
            Parser      parser      = ParserFactory.create(parserModel);

            TokenizerModel tokenizerModel = (new TokenizerModelLoader()).load(@params.TokenizerModel);
            Tokenizer      tokenizer      = new TokenizerME(tokenizerModel);

            // MUC corpus documents are read as UTF-8 strings from the data directory.
            ObjectStream <string> mucDocStream = new FileToStringSampleStream(new DirectorySampleStream(@params.Data, new FileFilterAnonymousInnerClassHelper(this), false), Charset.forName("UTF-8"));

            ObjectStream <RawCorefSample> rawSamples = new MucCorefSampleStream(tokenizer, mucDocStream);

            ObjectStream <RawCorefSample> parsedSamples = new FullParseCorefEnhancerStream(parser, rawSamples);


            // How to load all these nameFinder models ?!
            // Lets make a param per model, not that nice, but ok!

            // Maps an entity tag ("person", "organization") to its model file.
            IDictionary <string, Jfile> modelFileTagMap = new Dictionary <string, Jfile>();

            modelFileTagMap["person"]       = @params.PersonModel;
            modelFileTagMap["organization"] = @params.OrganizationModel;

            // One name finder per entity type; tags stays index-aligned with nameFinders.
            IList <TokenNameFinder> nameFinders = new List <TokenNameFinder>();
            IList <string>          tags        = new List <string>();

            foreach (KeyValuePair <string, Jfile> entry in modelFileTagMap)
            {
                nameFinders.Add(new NameFinderME((new TokenNameFinderModelLoader()).load(entry.Value)));
                tags.Add(entry.Key);
            }

            return(new MucMentionInserterStream(new NameFinderCorefEnhancerStream(nameFinders.ToArray(), tags.ToArray(), parsedSamples)));
        }
예제 #23
0
        /// <summary>
        /// Builds a maximum-entropy tokenizer from an already-open model stream.
        /// The caller retains ownership of the stream.
        /// </summary>
        public Tokenizer(FileStream modelStream)
        {
            this.tokenizer = new TokenizerME(new TokenizerModel(modelStream));
        }
예제 #24
0
 /// <summary>
 /// Loads the token model from <c>TokenModelPath</c> and builds the tokenizer.
 /// </summary>
 /// <returns>A ready-to-use maximum-entropy tokenizer.</returns>
 private static TokenizerME PrepareTokenizer()
 {
     var tokenInputStream = new FileInputStream(TokenModelPath); //load the token model into a stream
     try
     {
         var tokenModel = new TokenizerModel(tokenInputStream); //load the token model
         return new TokenizerME(tokenModel); //create the tokenizer
     }
     finally
     {
         // FIX: close the model stream (TokenizerModel reads it fully in its
         // constructor; the original leaked the handle).
         tokenInputStream.close();
     }
 }
 /// <summary>
 /// Builds a tokenizer for the current language; Spanish falls back to the
 /// Portuguese model until a Spanish one is trained.
 /// </summary>
 private TokenizerME PrepareTokenizer()
 {
     //TODO[danielcamargo]: we need to find/train the model in spanish
     string modelFile = string.Format(@"Models\{0}-token.bin", _language == "es" ? "pt" : _language);

     var tokenInputStream = new FileInputStream(modelFile);
     var tokenModel = new TokenizerModel(tokenInputStream);
     tokenInputStream.close();

     return new TokenizerME(tokenModel);
 }
예제 #26
0
        /// <summary>
        /// Tokenizes the sentence, prints any homonyms found for each word, and looks
        /// up each word's definition via the Wordnik API. Any failure is reported on
        /// the console rather than thrown.
        /// </summary>
        public void giveDefinitionAndHomonym(string currentSentence)
        {
            try
            {
                java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
                TokenizerModel model = new TokenizerModel(modelIn);
                // FIX: close the model stream (the original leaked it on every call);
                // the model is fully read by the constructor.
                modelIn.close();

                Tokenizer tokenizer = new TokenizerME(model);

                string[] words = tokenizer.tokenize(currentSentence);

                Homonyms homonyms = new Homonyms();

                for (int i = 0; i < words.Length; i++)
                {
                    System.Console.WriteLine();
                    Homonym homonym = homonyms.findWordInList(words[i]);

                    if (homonym.homonyms == null)
                    {
                        System.Console.WriteLine("No homonyms found for: " + words[i]);
                    }
                    else
                    {
                        List<string> selectedHomonyms = homonym.selectedHomonyms();

                        System.Console.WriteLine("Homonyms are: " + words[i]);
                        foreach (string selectedWord in selectedHomonyms)
                        {
                            System.Console.Write(selectedWord + ",");
                        }
                    }

                    System.Console.WriteLine();
                    System.Console.WriteLine("Definition for: " + words[i]);
                    using (WebClient client = new WebClient())
                    {
                        string line = client.DownloadString("http://api.wordnik.com/v4/word.json/" + words[i] + "/definitions?limit=200&includeRelated=true&useCanonical=false&includeTags=false&api_key=a2a73e7b926c924fad7001ca3111acd55af2ffabf50eb4ae5");
                        if (!line.Equals("[]"))
                        {
                            // Crude JSON scrape: print the text of the first definition entry.
                            string[] lines1 = System.Text.RegularExpressions.Regex.Split(line, "\"text\":\"");
                            string[] lines2 = System.Text.RegularExpressions.Regex.Split(lines1[1], "\",\"sequence\"[\\W\\w]+");
                            System.Console.WriteLine(lines2[0]);
                        }
                        else
                        {
                            System.Console.WriteLine("Definition cannot be found, word is mispelled or doesn't exist within our current data");
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best-effort: report and continue (missing model file, network error,
                // unexpected API payload, etc.).
                System.Console.WriteLine(e.Message);
            }
        }
예제 #27
0
        /// <summary>
        /// For each word in the sentence that has homonyms, builds a candidate
        /// sentence with a randomly chosen homonym substituted for every occurrence
        /// of that word, then lets choosePossiblePunSentence pick among the
        /// candidates. Returns the input unchanged when anything fails.
        /// </summary>
        public string grabPossiblePunSentences(string currentSentence)
        {
            try
            {
                java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
                TokenizerModel model = new TokenizerModel(modelIn);
                // FIX: close the model stream (the original leaked it on every call).
                modelIn.close();

                Tokenizer tokenizer = new TokenizerME(model);

                string[] words = tokenizer.tokenize(currentSentence);
                List<string> possibleSentences = new List<string>();

                Homonyms homonyms = new Homonyms();

                // FIX: one Random for the whole pass. The original constructed a new
                // clock-seeded Random per substitution, which can repeat the same
                // "random" index for words processed in the same clock tick.
                Random random = new Random();

                for (int i = 0; i < words.Length; i++)
                {
                    System.Console.WriteLine();
                    Homonym homonym = homonyms.findWordInList(words[i]);

                    // Only words with known homonyms yield a candidate sentence.
                    if (homonym.homonyms != null)
                    {
                        string possibleSentence = "";
                        for (int r = 0; r < words.Length; r++)
                        {
                            if (words[i].Equals(words[r]))
                            {
                                int randomNumber = random.Next(homonym.homonyms.Length);
                                possibleSentence += " " + homonym.homonyms[randomNumber];
                            }
                            else
                            {
                                possibleSentence += " " + words[r];
                            }
                        }
                        possibleSentences.Add(possibleSentence);
                    }
                }
                currentSentence = choosePossiblePunSentence(currentSentence, possibleSentences);
            }
            catch (Exception)
            {
                // Best-effort: on any failure (missing model file, bad data) fall back
                // to returning the original sentence unmodified.
            }

            return currentSentence;
        }
예제 #28
0
        /// <summary>
        /// Lazily POS-tags, tokenizes and chunks each sentence, yielding one list of
        /// ChunkItem (token, tag, chunk) per input sentence. The models are loaded
        /// from embedded resources once and captured by the deferred query.
        /// </summary>
        public static IEnumerable<IEnumerable<ChunkItem>> GetChunks(IEnumerable<string> Sentences)
        {
            var posTagger = new POSTaggerME(new POSModel(new java.io.ByteArrayInputStream(Resource.en_pos_maxent)));
            var tokenizer = new TokenizerME(new TokenizerModel(new java.io.ByteArrayInputStream(Resource.en_token)));
            var chunker   = new ChunkerME(new ChunkerModel(new java.io.ByteArrayInputStream(Resource.en_chunker)));

            // Deferred: each sentence is processed only when the result is enumerated.
            return Sentences.Select(sentence =>
            {
                var tokens = tokenizer.tokenize(sentence);
                var tags   = posTagger.tag(tokens);
                var chunks = chunker.chunk(tokens, tags);

                var items = new List<ChunkItem>();

                for (var i = 0; i < chunks.Length; i++)
                {
                    items.Add(new ChunkItem { token = tokens[i], tag = tags[i], chunk = chunks[i] });
                }

                return items;
            });
        }
예제 #29
0
        /// <summary>
        /// Runs the full NLP pipeline over a single query sentence: constituency
        /// parsing, date named-entity recognition, matched-word / stop-word
        /// stripping, then a second parse + NER pass over the reduced sentence.
        /// Results accumulate in the instance-level collections
        /// (matchedWords, nounPhrases, nouns, verbPhrases, dates).
        /// </summary>
        /// <param name="line">The raw user query to analyze.</param>
        /// <returns>matchedWords: detected phrase mapped to its list of semantic
        /// roles (e.g. START_PERIOD / END_PERIOD).</returns>
        public Dictionary <string, List <string> > Main(string line)
        {
            //debug sentence
            // line = "Show me the sales of Kean Cola .25ltr Bottle in Nicosia from January 2017 to October 2017 as a line chart.";
            matchedWords?.Clear();
            nounPhrases?.Clear();
            nouns?.Clear();
            adjectivePhrases?.Clear();
            verbPhrases?.Clear();

            InputStream modelIn  = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-parser-chunking.bin");
            InputStream modelIn1 = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-ner-date.bin");
            InputStream modelIn2 = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-token.bin");

            ParserModel model     = new ParserModel(modelIn);
            var         myParser  = ParserFactory.create(model);
            var         topParses = ParserTool.parseLine(line, myParser, 1);

            foreach (var p in topParses)
            {
                GetSentenceParts(p);
            }

            try
            {
                TokenizerModel       model1 = new TokenizerModel(modelIn2);
                TokenNameFinderModel model2 = new TokenNameFinderModel(modelIn1);

                Tokenizer tokenizer  = new TokenizerME(model1);
                var       nameFinder = new NameFinderME(model2);

                // First pass: detect date expressions in the raw sentence.
                var tokens    = tokenizer.tokenize(line);
                var nameSpans = nameFinder.find(tokens);
                var array     = Span.spansToStrings(nameSpans, tokens);
                dates = new HashSet <string>(array);

                PrintSets();
                //TODO IF NO OPTION IS FOUND ASK THE USER TO GIVE YOU ONE. IMPLEMENT IT IN THE WEB VERSION SOON

                System.Diagnostics.Debug.WriteLine("\nProcessing Dates");
                AddDatePeriodsToMatchedWords();

                System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");

                var doc = new XmlDocument();
                doc.Load(HttpRuntime.AppDomainAppPath + "\\file2.xml");
                var root = doc.SelectSingleNode("*");

                FindMatchingNodesFromXml(root, nounPhrases);
                RemoveNounsCoveredByMatchedWords();
                FindMatchingNodesFromXml(root, verbPhrases);

                System.Diagnostics.Debug.WriteLine("\nProcessing verb phrases ");
                System.Diagnostics.Debug.WriteLine("\nProcessing nouns ");

                // construct the dictionary object and open it
                var directory = Directory.GetCurrentDirectory() + "\\wordnet\\";
                foreach (var variable in matchedWords)
                {
                    System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
                }

                // Strip every phrase already matched, then stop words and
                // punctuation, so the second parse sees only the unmatched rest.
                foreach (var variable in matchedWords)
                {
                    string a = variable.Key;
                    if (line.Contains(a))
                    {
                        line = line.replace(a, "");
                    }
                }

                foreach (var variable in stopWordsofwordnet)
                {
                    string a = " " + variable.toLowerCase() + " ";
                    if (line.Contains(a))
                    {
                        line = line.replace(a, " ");
                    }
                }
                if (line.contains("."))
                {
                    line = line.replace(".", "");
                }
                if (line.contains("-"))
                {
                    line = line.replace("-", " ");
                }
                System.Diagnostics.Debug.WriteLine("/////////////");
                System.Diagnostics.Debug.WriteLine("SECOND PARSE STRING " + line);
                System.Diagnostics.Debug.WriteLine("/////////////");

                // Second pass: parse the reduced sentence to pick up phrases that
                // were masked by longer matches on the first pass.
                line      = line.Trim();
                topParses = ParserTool.parseLine(line, myParser, 1);
                nounPhrases?.Clear();
                dates?.Clear();
                verbPhrases?.Clear();
                nouns?.Clear();
                foreach (var p in topParses)
                {
                    GetSentenceParts(p);
                }

                FindMatchingNodesFromXml(root, nounPhrases);
                RemoveNounsCoveredByMatchedWords();
                FindMatchingNodesFromXml(root, verbPhrases);
                FindMatchingNodesFromXml(root, nouns);

                // Second pass of date NER over the reduced sentence.
                tokens    = tokenizer.tokenize(line);
                nameSpans = nameFinder.find(tokens);
                array     = Span.spansToStrings(nameSpans, tokens);
                dates     = new HashSet <string>(array);

                PrintSets();

                System.Diagnostics.Debug.WriteLine("\nProcessing Dates");
                AddDatePeriodsToMatchedWords();

                System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");

                FindMatchingNodesFromXml(root, nounPhrases);
                FindMatchingNodesFromXml(root, verbPhrases);
                FindMatchingNodesFromXml(root, nouns);

                foreach (var variable in matchedWords)
                {
                    System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
                }

                doc = null;
                GC.Collect();
                GC.WaitForPendingFinalizers();
                //MATCHING WITH WORD NET
                System.Diagnostics.Debug.WriteLine(directory);
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }
            finally
            {
                // Best-effort close of all three model streams. NOTE(review): the
                // original code only closed modelIn1/modelIn2 and leaked modelIn
                // (the parser-chunking model); it is closed here as well.
                CloseQuietly(modelIn);
                CloseQuietly(modelIn1);
                CloseQuietly(modelIn2);
            }

            return(matchedWords);
        }

        // Closes a model stream, swallowing IOException: a failed close of a
        // read-only model file is not actionable at this point.
        private static void CloseQuietly(InputStream stream)
        {
            if (stream != null)
            {
                try
                {
                    stream.close();
                }
                catch (IOException)
                {
                }
            }
        }

        // Translates the date expressions collected in `dates` into
        // START_PERIOD / END_PERIOD entries in `matchedWords`.
        // (This logic was duplicated verbatim for both pipeline passes.)
        private void AddDatePeriodsToMatchedWords()
        {
            if (dates.Count == 2)
            {
                if (dates.ElementAt(0).contains("from"))
                {
                    var           a       = dates.ElementAt(0).replace("from", "");
                    List <string> newList = new List <string>();
                    newList.Add("START_PERIOD");
                    matchedWords.Add(a, newList);
                    newList = new List <string>();
                    newList.Add("END_PERIOD");
                    //todo fix when the date is the same here
                    matchedWords.Add(dates.ElementAt(1), newList);
                }
                else
                {
                    List <string> newList = new List <string>();
                    newList.Add("START_PERIOD");
                    matchedWords.Add(dates.ElementAt(0), newList);
                    newList = new List <string>();
                    newList.Add("END_PERIOD");
                    //todo fix when the date is the same here
                    matchedWords.Add(dates.ElementAt(1), newList);
                }
            }

            if (dates.Count == 1)
            {
                if (dates.ElementAt(0).contains("from"))
                {
                    // A single "from X to Y" span: split it on " to ".
                    // NOTE(review): assumes the span actually contains " to " —
                    // dts[1] throws otherwise; confirm against the NER model.
                    var a   = dates.ElementAt(0).replace("from", "");
                    var dts = a.Split(new[] { " to " }, StringSplitOptions.None);

                    List <string> newList = new List <string>();
                    newList.Add("START_PERIOD");
                    matchedWords.Add(dts[0], newList);
                    newList = new List <string>();
                    newList.Add("END_PERIOD");
                    //todo fix when the date is the same here
                    matchedWords.Add(dts[1], newList);
                }
                else
                {
                    // One date only: it serves as both start and end of the period.
                    List <string> newList = new List <string>();
                    newList.Add("START_PERIOD");

                    newList.Add("END_PERIOD");
                    //todo fix when the date is the same here
                    matchedWords.Add(dates.ElementAt(0), newList);
                }
            }
        }

        // Removes from `nouns` any noun already covered by a matchedWords key so
        // it is not matched a second time on later XML passes.
        private void RemoveNounsCoveredByMatchedWords()
        {
            foreach (var item in nouns.ToList())
            {
                foreach (var VARIABLE in matchedWords)
                {
                    if (VARIABLE.Key.Contains(item))
                    {
                        nouns.Remove(item);    //Will work!
                    }
                }
            }
        }
예제 #30
0
 /// <summary>
 /// Builds the wrapper around an OpenNLP maximum-entropy tokenizer
 /// (<c>TokenizerME</c>) created from the supplied model.
 /// </summary>
 /// <param name="model">A loaded OpenNLP tokenizer model (e.g. en-token.bin).</param>
 public Tokenizer(TokenizerModel model)
 {
     this.tokenizer = new TokenizerME(model);
 }