Code Example #1
        static void Main(string[] args)
        {
            String[]     files = Directory.GetFiles(@"C:\10311173\lab-5-opennlp-tokenization-Changyiyu\Dataset", "*.html");
            StreamWriter sw    = new StreamWriter(@"TokensReader.txt");

            // Load the tokenizer model once, not once per line.
            InputStream    modelIn     = new FileInputStream(@"C:\10311173\lab-6-opennlp-ju-zi-qie-fen-Changyiyu\en-token.bin");
            TokenizerModel model       = new TokenizerModel(modelIn);
            TokenizerME    enTokenizer = new TokenizerME(model);
            modelIn.close();

            foreach (string thefile in files)
            {
                using (StreamReader sr = new StreamReader(thefile))
                {
                    while (sr.Peek() != -1)
                    {
                        string   line   = sr.ReadLine();
                        string[] tokens = enTokenizer.tokenize(line);

                        for (int i = 0; i < tokens.Length; i++)
                        {
                            sw.Write(tokens[i] + " ");
                            // Start a new output line after each sentence-ending period.
                            if (tokens[i].Equals("."))
                            {
                                sw.Write("\r\n");
                            }
                        }
                    }
                }
            }
            sw.Close();
        }
Code Example #2
        public void TestTokenizer()
        {
            var model     = TokenizerTestUtil.CreateMaxentTokenModel();
            var tokenizer = new TokenizerME(model);

            TestTokenizer(tokenizer);
        }
Code Example #3
        private StringBuilder ReverseIt(string Message)
        {
            StringBuilder reversedString = new StringBuilder();

            //let's do some language processing tasks to identify sentence structure
            SentenceDetectorME sentenceParser = new SentenceDetectorME(LoadNLP.sentenceModel);
            TokenizerME        tokenizer      = new TokenizerME(LoadNLP.tokenModel);

            string[] sentences = sentenceParser.sentDetect(Message);

            //iterate through each sentence
            foreach (string sentence in sentences)
            {
                string[] tokens = tokenizer.tokenize(sentence);

                //reverse the tokens
                for (int i = 0; i < tokens.Length / 2; i++)
                {
                    string storage = tokens[i];
                    tokens[i] = tokens[tokens.Length - i - 1];
                    tokens[tokens.Length - i - 1] = storage;
                }

                //Now that we've reordered the tokens, let's detokenize and convert back to a usable string
                reversedString.Append(DeTokenize(tokens, DetokenizationDictionary.Operation.MOVE_LEFT));
            }

            return(reversedString);
        }
Code Example #4
 public ILexer InitNow()
 {
     Console.WriteLine("Loading...");
     _tokenizer = prepareTokenizer();
     _posTagger = preparePOSTagger();
     return this;
 }
Code Example #5
        static void Main(string[] args)
        {
            String[]     files = Directory.GetFiles(@"..\..\..\..\Dataset", "lab5result.txt");
            StreamWriter sw    = new StreamWriter(@"..\..\..\..\lab6result.txt", false);

            // Load the tokenizer model once, not once per line.
            InputStream    modelIn     = new FileInputStream(@"..\..\..\..\en-token.bin");
            TokenizerModel model       = new TokenizerModel(modelIn);
            TokenizerME    enTokenizer = new TokenizerME(model);
            modelIn.close();

            foreach (string file in files)
            {
                using (StreamReader sr = new StreamReader(file))
                {
                    while (sr.Peek() != -1)
                    {
                        string   line   = sr.ReadLine();
                        string[] tokens = enTokenizer.tokenize(line);

                        for (int i = 0; i < tokens.Length; i++)
                        {
                            sw.Write(tokens[i] + " ");
                            if (tokens[i].Equals("."))
                            {
                                sw.Write("\r\n");
                            }
                        }
                    }
                }
            }
            sw.Close();
        }
Code Example #6
        static void Main(string[] args)
        {
            String[]     files = Directory.GetFiles(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\lab6\Dataset", "*.html");
            StreamWriter sw    = new StreamWriter(@"ReadByTokens.txt");

            // Load the tokenizer model once, not once per line.
            InputStream    modelIn     = new FileInputStream(@"C:\10311171\lab-6-opennlp-ju-zi-qie-fen-HUNGLIWEN\en-token.bin");
            TokenizerModel model       = new TokenizerModel(modelIn);
            TokenizerME    enTokenizer = new TokenizerME(model);
            modelIn.close();

            foreach (string filename in files)
            {
                using (StreamReader sr = new StreamReader(filename))
                {
                    while (sr.Peek() != -1)
                    {
                        string   line   = sr.ReadLine();
                        string[] tokens = enTokenizer.tokenize(line);

                        for (int i = 0; i < tokens.Length; i++)
                        {
                            sw.Write(tokens[i] + " ");

                            if (tokens[i].Equals("."))
                            {
                                sw.Write("\n");
                            }
                        }
                    }
                }
            }
            sw.Close();
        }
Code Example #7
File: Tokenizer.cs Project: MALLOCkol/Dragon
 /// <summary>
 /// Split the input content to individual words
 /// </summary>
 /// <param name="contents">Content to split into words</param>
 /// <returns></returns>
 public static IEnumerable<string> TokenizeNow(string contents)
 {
     //TODO: Make the preprocessing function a function pointer
     var processedContents = PreProcessing(contents);
     var tokenizer = new TokenizerME(Model);
     var tokens = tokenizer.tokenize(processedContents);
     return tokens;
 }
Code Example #8
 public ILexer InitNow()
 {
     Console.WriteLine("Loading...");
     _tokenizer = prepareTokenizer();
     _nameFinder = prepareNameFinder();
     _locationFinder = prepareLocationFinder();
     _timeFinder = prepareTimeFinder();
     return this;
 }
Code Example #9
        string[] Tokenizer(string review)
        {
            InputStream    modelIn   = new FileInputStream(modelPath + "en-token.zip");
            TokenizerModel model     = new TokenizerModel(modelIn);
            TokenizerME    tokenizer = new TokenizerME(model);
            modelIn.close();

            // Strip periods so they are not emitted as separate tokens.
            string[] tokens = tokenizer.tokenize(review.Replace(".", ""));
            return(tokens);
        }
Code Example #10
        /// <summary>
        /// Split the input content to individual words
        /// </summary>
        /// <param name="contents">Content to split into words</param>
        /// <returns></returns>
        public static IEnumerable <string> TokenizeNow(string contents)
        {
            //TODO: Make the preprocessing function a function pointer
            var processedContents = PreProcessing(contents);
            var tokenizer         = new TokenizerME(Model);
            var tokens            = tokenizer.tokenize(processedContents);

            return(tokens);
        }
Code Example #11
        public void TestTokenizerSimpleModel()
        {
            var model     = TokenizerTestUtil.CreateMaxentTokenModel();
            var tokenizer = new TokenizerME(model);
            var tokens    = tokenizer.Tokenize("test,");

            Assert.AreEqual(2, tokens.Length);
            Assert.AreEqual("test", tokens[0]);
            Assert.AreEqual(",", tokens[1]);
        }
Code Example #12
File: TokenizerTestUtil.cs Project: qooba/SharpNL
 public static TokenizerModel CreateMaxentTokenModel()
 {
     using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
         var samples  = new TokenSampleStream(new PlainTextByLineStream(data));
         var mlParams = new TrainingParameters();
         mlParams.Set(Parameters.Iterations, "100");
         mlParams.Set(Parameters.Cutoff, "0");
         return(TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams));
     }
 }
Code Example #13
        private void LoadTokenizer()
        {
            if (!alreadyLoadTagger)
            {
                java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
                TokenizerModel          tokenizerModel = new TokenizerModel(modelInpStream);
                tokenizer = new TokenizerME(tokenizerModel);

                alreadyLoadTagger = true;
            }
        }
Code Example #14
        public override ObjectStream <NameSample> create(string[] args)
        {
            Parameters @params = ArgumentParser.parse(args, typeof(Parameters));

            TokenizerModel tokenizerModel = (new TokenizerModelLoader()).load(@params.TokenizerModel);
            Tokenizer      tokenizer      = new TokenizerME(tokenizerModel);

            ObjectStream <string> mucDocStream = new FileToStringSampleStream(new DirectorySampleStream(@params.Data, new FileFilterAnonymousInnerClassHelper(this), false), Charset.forName("UTF-8"));

            return(new MucNameSampleStream(tokenizer, mucDocStream));
        }
Code Example #15
File: OpenNLP.cs Project: baio/d-mill
        public void Tokenize()
        {
            var modelStream = new java.io.FileInputStream("../../Models/en-token.bin");

            var model = new TokenizerModel(modelStream);

            var tokenizer = new TokenizerME(model);

            var txt = File.ReadAllText(@"c:\dev\d-mill\uspe\Data\uspe-sentenced.txt");

            var tokens = tokenizer.tokenize(txt);
        }
Code Example #16
        public static TokenizerModel TrainModel(string path)
        {
            FileStream        fs     = new FileStream(path, FileMode.Open, FileAccess.Read);
            TokenSampleStream stream = new TokenSampleStream(new PlainTextByLineStream(fs));

            TrainingParameters trainParams = new TrainingParameters();

            trainParams.Set(Parameters.Iterations, "100");
            trainParams.Set(Parameters.Cutoff, "0");

            return(TokenizerME.Train(stream, new TokenizerFactory(TRAINING_LANGUAGE, null, true), trainParams));
        }
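
A model trained this way can be persisted for later runs; a minimal sketch (the training file and output name below are placeholders), reusing the same Serialize call that appears in Code Example #20:

            TokenizerModel model = TrainModel("token.train");
            // Persist the model so later runs can load it with new TokenizerModel(stream)
            // instead of retraining.
            using (var fs = new FileStream("en-token-custom.bin", FileMode.Create))
            {
                model.Serialize(fs);
            }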
Code Example #17
        public Span[] GetTokens(string paragraph)
        {
            var            bin       = GetFileStream("en-token.bin");
            TokenizerModel model     = new TokenizerModel(bin);
            TokenizerME    tokenizer = new TokenizerME(model);

            Span[] tokens = tokenizer.tokenizePos(paragraph);

            bin.close();

            return(tokens);
        }
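
tokenizePos returns character spans rather than token strings, so the text has to be recovered from the original paragraph; a short sketch, assuming the java-interop Span type with its standard getStart/getEnd accessors:

            Span[] spans = GetTokens(paragraph);
            foreach (Span span in spans)
            {
                // Each span holds [start, end) character offsets into the paragraph.
                string token = paragraph.Substring(span.getStart(), span.getEnd() - span.getStart());
                Console.WriteLine(token);
            }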
Code Example #18
File: OpenNLP.cs Project: baio/d-mill
        public void TrainTokenizer()
        {
            var charset = Charset.forName("UTF-8");

            var lineStream = new opennlp.tools.util.PlainTextByLineStream(new java.io.FileInputStream(@"c:\dev\d-mill\uspe\Data\uspe-sentenced-train.txt"), charset);

            // Tokenizer training expects token samples; NameSampleDataStream yields name-finder samples.
            var sampleStream = new opennlp.tools.tokenize.TokenSampleStream(lineStream);

            var model = TokenizerME.train("en", sampleStream, true);

            var tokenizer = new TokenizerME(model);

            var tokens = tokenizer.tokenize("Hi. How are you? This is Mike.");
        }
Code Example #19
        static void Main(string[] args)
        {
            String[]     files = Directory.GetFiles(@"..\..\..\..\Dataset\", "*.html");
            StreamWriter sw    = new StreamWriter(@"..\..\..\Html.txt");

            // Load the tokenizer model once, outside the file and line loops.
            InputStream    modelIn     = new FileInputStream(@"..\..\..\..\en-token.bin");
            TokenizerModel model       = new TokenizerModel(modelIn);
            TokenizerME    enTokenizer = new TokenizerME(model);
            modelIn.close();

            foreach (String file in files)
            {
                using (StreamReader sr = new StreamReader(file))
                {
                    while (sr.Peek() != -1)
                    {
                        string line = sr.ReadLine();

                        // Strip block-level tags and non-breaking spaces.
                        line = Regex.Replace(line, @"(<P>|<BR />|<DIV>)", "");
                        line = Regex.Replace(line, @"(</P>|<p/>|</DIV>)", "");
                        line = Regex.Replace(line, @"&nbsp;", "");
                        // Keep the inner text of anchors and the title text of images.
                        line = Regex.Replace(line, @"<a.*?>(?'text'.*?)</a>", @"${text}");
                        line = Regex.Replace(line, @"<img.*?title=""(?'text2'.*?)"" />", @"${text2}");

                        string[] tokens = enTokenizer.tokenize(line);
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            sw.Write(tokens[i] + " ");
                            if (tokens[i].Equals("."))
                            {
                                sw.Write("\n");
                            }
                        }
                    }
                }
            }
            sw.Close();
        }
Code Example #20
        public void TestCrossCompatibility()
        {
            using (var data = Tests.OpenFile("/opennlp/tools/tokenize/token.train")) {
                var samples  = new TokenSampleStream(new PlainTextByLineStream(data));
                var mlParams = new TrainingParameters();
                mlParams.Set(Parameters.Iterations, "100");
                mlParams.Set(Parameters.Cutoff, "0");
                var model = TokenizerME.Train(samples, new TokenizerFactory("en", null, true), mlParams);

                var sMe = new TokenizerME(model);

                TokenizerMETest.TestTokenizer(sMe);

                var sProbs = sMe.TokenProbabilities;

                // --- java \/

                var sFile = Path.GetTempFileName();

                model.Serialize(new FileStream(sFile, FileMode.Create));

                var jModel = new opennlp.tools.tokenize.TokenizerModel(
                    OpenNLP.CreateInputStream(sFile)
                    );

                var jMe = new opennlp.tools.tokenize.TokenizerME(jModel);

                TestJavaTokenizer(jMe);

                var jProbs = jMe.getTokenProbabilities();

                Assert.AreEqual(jProbs.Length, sProbs.Length);

                for (int i = 0; i < jProbs.Length; i++)
                {
                    // one difference :(
                    // -0.00000000000000011102230246251565
                    //
                    // but still "insignificant" :)
                    Assert.AreEqual(jProbs[i], sProbs[i], 0.0000000001d);
                }
            }
        }
Code Example #21
File: TokenizerTestUtil.cs Project: qooba/SharpNL
        public static TokenizerModel CreateSimpleMaxentTokenModel()
        {
            var samples = new List <TokenSample> {
                new TokenSample("year", new[] { new Span(0, 4) }),
                new TokenSample("year,", new[] { new Span(0, 4), new Span(4, 5) }),
                new TokenSample("it,", new[] { new Span(0, 2), new Span(2, 3) }),
                new TokenSample("it", new[] { new Span(0, 2) }),
                new TokenSample("yes", new[] { new Span(0, 3) }),
                new TokenSample("yes,", new[] { new Span(0, 3), new Span(3, 4) })
            };

            var mlParams = new TrainingParameters();

            mlParams.Set(Parameters.Iterations, "100");
            mlParams.Set(Parameters.Cutoff, "0");

            return(TokenizerME.Train(
                       new CollectionObjectStream <TokenSample>(samples),
                       new TokenizerFactory("en", null, true),
                       mlParams));
        }
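
Worth noting: the Span arguments above are character offsets within each sample, so in "year," the token "year" covers offsets 0-4 and the trailing comma covers 4-5; the trained model therefore learns to split trailing punctuation off a word.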
Code Example #22
        public NLP()
        {
            //loading sentence detector model
            java.io.FileInputStream modelInpStream = new java.io.FileInputStream("Resources\\en-sent.bin");
            SentenceModel           sentenceModel  = new SentenceModel(modelInpStream);

            sentenceDetector = new SentenceDetectorME(sentenceModel);

            //loading tokenizer model
            modelInpStream = new java.io.FileInputStream("Resources\\en-token.bin");
            TokenizerModel tokenizerModel = new TokenizerModel(modelInpStream);

            tokenizer = new TokenizerME(tokenizerModel);

            modelInpStream = new java.io.FileInputStream("Resources\\en-pos-maxent.bin");
            POSModel posModel = new POSModel(modelInpStream);

            tagger = new POSTaggerME(posModel);

            modelInpStream = new java.io.FileInputStream("Resources\\en-chunker.bin");
            ChunkerModel chunkerModel = new ChunkerModel(modelInpStream);

            chunker = new ChunkerME(chunkerModel);

            modelInpStream = new java.io.FileInputStream("Resources\\en-parser-chunking.bin");
            ParserModel parserModel = new ParserModel(modelInpStream);

            parser = ParserFactory.create(parserModel);

            //loading stop words list
            StreamReader sr = new StreamReader("Resources\\english.stop.txt");
            string       line;

            while ((line = sr.ReadLine()) != null)
            {
                stopwords.Add(Stemming(line));
                stopwords.Add(line);
            }
        }
Code Example #23
        static void Main(string[] args)
        {
            String[]     files = Directory.GetFiles(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\Dataset", "*.html");
            StreamWriter sw    = new StreamWriter(@"ReadByTokens.txt");

            // Load the tokenizer model once, outside the loops.
            InputStream    modelIn     = new FileInputStream(@"C:\10311209\lab-6-opennlp-ju-zi-qie-fen-XiuXuanLiu\en-token.bin");
            TokenizerModel model       = new TokenizerModel(modelIn);
            TokenizerME    enTokenizer = new TokenizerME(model);
            modelIn.close();

            foreach (string filename in files)
            {
                using (StreamReader sr = new StreamReader(filename))
                {
                    while (sr.Peek() != -1)
                    {
                        string line = sr.ReadLine();
                        line = Regex.Replace(line, "<P[^>]*>", ""); // [^>] matches any character other than '>'
                        line = Regex.Replace(line, @"&nbsp;", "");
                        line = Regex.Replace(line, "<DIV[^>]*>", "");
                        line = Regex.Replace(line, "<BR[^>]*>", "");
                        line = Regex.Replace(line, "<img[^>]*title=\"(?'titleName'.*?)\"[^>]*>", "${titleName}");
                        line = Regex.Replace(line, "<[^>]*href.*>(?'Name'.*?)<[^>]*>", "${Name}");
                        line = Regex.Replace(line, "<[^>]*>", "");

                        string[] tokens = enTokenizer.tokenize(line);
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            sw.Write(tokens[i] + " ");
                            if (tokens[i].Equals("."))
                            {
                                sw.Write("\n");
                            }
                        }
                    }
                }
            }
            sw.Close();
        }
Code Example #24
        static void Main(string[] args)
        {
            java.io.InputStream modelIn  = new java.io.FileInputStream("en-sent.bin");
            java.io.InputStream modelIn2 = new java.io.FileInputStream("en-token.bin");
            TokenizerModel      model    = new TokenizerModel(modelIn2);
            TokenizerME         mE       = new TokenizerME(model);
            SentenceModel       sM       = new SentenceModel(modelIn);
            SentenceDetector    detector = new SentenceDetectorME(sM);
            string folderName            = @"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\file";

            foreach (string fname in System.IO.Directory.GetFiles(folderName))
            {
                String line = null;
                // Path.GetFileName is more robust than indexing a fixed segment of fname.Split('\\').
                StreamWriter sw = new StreamWriter(@"C:\Users\Administrator\Desktop\lab-6-opennlp-ju-zi-qie-fen-10411174\answer\" + System.IO.Path.GetFileName(fname));
                using (StreamReader file2 = new StreamReader(fname))
                {
                    while ((line = file2.ReadLine()) != null)
                    {
                        string   str   = null;
                        string[] sents = detector.sentDetect(line);
                        if (sents.Length == 0)
                        {
                            continue;
                        }
                        foreach (var s in sents)
                        {
                            str = str + s;
                        }
                        var Tokens = mE.tokenize(str);
                        foreach (var s in Tokens)
                        {
                            sw.Write(s + " ");
                        }
                        sw.WriteLine();
                    }
                }
                sw.Close();
            }
        }
Code Example #25
        public override ObjectStream <CorefSample> create(string[] args)
        {
            Parameters @params = ArgumentParser.parse(args, typeof(Parameters));

            ParserModel parserModel = (new ParserModelLoader()).load(@params.ParserModel);
            Parser      parser      = ParserFactory.create(parserModel);

            TokenizerModel tokenizerModel = (new TokenizerModelLoader()).load(@params.TokenizerModel);
            Tokenizer      tokenizer      = new TokenizerME(tokenizerModel);

            ObjectStream <string> mucDocStream = new FileToStringSampleStream(new DirectorySampleStream(@params.Data, new FileFilterAnonymousInnerClassHelper(this), false), Charset.forName("UTF-8"));

            ObjectStream <RawCorefSample> rawSamples = new MucCorefSampleStream(tokenizer, mucDocStream);

            ObjectStream <RawCorefSample> parsedSamples = new FullParseCorefEnhancerStream(parser, rawSamples);


            // How do we load all these nameFinder models?
            // Let's make a param per model; not that nice, but OK.

            IDictionary <string, Jfile> modelFileTagMap = new Dictionary <string, Jfile>();

            modelFileTagMap["person"]       = @params.PersonModel;
            modelFileTagMap["organization"] = @params.OrganizationModel;

            IList <TokenNameFinder> nameFinders = new List <TokenNameFinder>();
            IList <string>          tags        = new List <string>();

            foreach (KeyValuePair <string, Jfile> entry in modelFileTagMap)
            {
                nameFinders.Add(new NameFinderME((new TokenNameFinderModelLoader()).load(entry.Value)));
                tags.Add(entry.Key);
            }

            return(new MucMentionInserterStream(new NameFinderCorefEnhancerStream(nameFinders.ToArray(), tags.ToArray(), parsedSamples)));
        }
Code Example #26
        public Dictionary <string, List <string> > Main(string line)
        {
            //debug sentence
            // line = "Show me the sales of Kean Cola .25ltr Bottle in Nicosia from January 2017 to October 2017 as a line chart.";
            matchedWords?.Clear();
            nounPhrases?.Clear();
            nouns?.Clear();
            adjectivePhrases?.Clear();
            verbPhrases?.Clear();
            InputStream modelIn = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-parser-chunking.bin");

            InputStream modelIn1  = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-ner-date.bin");
            InputStream modelIn2  = new FileInputStream(HttpRuntime.AppDomainAppPath + "\\Models\\en-token.bin");
            ParserModel model     = new ParserModel(modelIn);
            var         myParser  = ParserFactory.create(model);
            var         topParses = ParserTool.parseLine(line, myParser, 1);

            foreach (var p in topParses)
            {
                GetSentenceParts(p);
            }


            try
            {
                TokenizerModel       model1 = new TokenizerModel(modelIn2);
                TokenNameFinderModel model2 = new TokenNameFinderModel(modelIn1);

                Tokenizer tokenizer  = new TokenizerME(model1);
                var       nameFinder = new NameFinderME(model2);

                var tokens    = tokenizer.tokenize(line);
                var nameSpans = nameFinder.find(tokens);

                var array = Span.spansToStrings(nameSpans, tokens);

                //
                //                foreach (var v in array)
                //                {
                //                    System.Diagnostics.Debug.WriteLine(v);
                //                }

                dates = new HashSet <string>(array);



                PrintSets();
//                System.Diagnostics.Debug.WriteLine("\nProcessing Presentation type");
//
//                if (nouns.Contains("table"))
//                {
//                    matchedWords.Add(new Tuple<string, string>("PRESENTATION_TYPE", "table"));
//                }
//                if (nounPhrases.Contains("bar chart"))
//                {
//                    matchedWords.Add(new Tuple<string, string>("PRESENTATION_TYPE", "bar chart"));
//                }
//                if (nounPhrases.Contains("line chart"))
//                {
//                    matchedWords.Add(new Tuple<string, string>("PRESENTATION_TYPE", "line chart"));
//                }
                //TODO IF NO OPTION IS FOUND ASK THE USER TO GIVE YOU ONE. IMPLEMENT IT IN THE WEB VERSION SOON

                System.Diagnostics.Debug.WriteLine("\nProcessing Dates");

                if (dates.Count == 2)
                {
                    if (dates.ElementAt(0).contains("from"))
                    {
                        var           a       = dates.ElementAt(0).replace("from", "");
                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");
                        matchedWords.Add(a, newList);
                        newList = new List <string>();
                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dates.ElementAt(1), newList);
                    }
                    else
                    {
                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");
                        matchedWords.Add(dates.ElementAt(0), newList);
                        newList = new List <string>();
                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dates.ElementAt(1), newList);
                    }
                }

                if (dates.Count == 1)
                {
                    if (dates.ElementAt(0).contains("from"))
                    {
                        var a   = dates.ElementAt(0).replace("from", "");
                        var dts = a.Split(new[] { " to " }, StringSplitOptions.None);

                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");
                        matchedWords.Add(dts[0], newList);
                        newList = new List <string>();
                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dts[1], newList);
                    }
                    else
                    {
                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");

                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dates.ElementAt(0), newList);
                    }
                }

                System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");

                //                var manager = new Manager();
                //                var serializer = new XmlSerializer(typeof(Manager.language));
                //                var loadStream = new FileStream("file2.xml", FileMode.Open, FileAccess.Read);
                //                var loadedObject = (Manager.language) serializer.Deserialize(loadStream);


                var doc = new XmlDocument();
//                System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
//                System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
//                System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
//                System.Diagnostics.Debug.WriteLine(HttpRuntime.AppDomainAppPath);
                doc.Load(HttpRuntime.AppDomainAppPath + "\\file2.xml");


                var root = doc.SelectSingleNode("*");
                FindMatchingNodesFromXml(root, nounPhrases);


                foreach (var item in nouns.ToList())
                {
                    foreach (var VARIABLE in matchedWords)
                    {
                        if (VARIABLE.Key.Contains(item))
                        {
                            nouns.Remove(item);    //Will work!
                        }
                    }
                }

                FindMatchingNodesFromXml(root, verbPhrases);
                // FindMatchingNodesFromXml(root, nouns);



                System.Diagnostics.Debug.WriteLine("\nProcessing verb phrases ");


                System.Diagnostics.Debug.WriteLine("\nProcessing nouns ");



                // construct the dictionary object and open it
                var directory = Directory.GetCurrentDirectory() + "\\wordnet\\";
                foreach (var variable in matchedWords)
                {
                    System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
                }

                foreach (var variable in matchedWords)
                {
                    string a = variable.Key;
                    if (line.Contains(a))
                    {
                        line = line.replace(a, "");
                    }
                }

                foreach (var variable in stopWordsofwordnet)
                {
                    string a = " " + variable.toLowerCase() + " ";
                    if (line.Contains(a))
                    {
                        line = line.replace(a, " ");
                    }
                }
                if (line.contains("."))
                {
                    line = line.replace(".", "");
                }
                if (line.contains("-"))
                {
                    line = line.replace("-", " ");
                }
                System.Diagnostics.Debug.WriteLine("/////////////");
                System.Diagnostics.Debug.WriteLine("SECOND PARSE STRING " + line);
                System.Diagnostics.Debug.WriteLine("/////////////");
                line      = line.Trim();
                topParses = ParserTool.parseLine(line, myParser, 1);
                nounPhrases?.Clear();
                dates?.Clear();
                verbPhrases?.Clear();
                nouns?.Clear();
                foreach (var p in topParses)
                {
                    //p.show();
                    GetSentenceParts(p);
                }

                FindMatchingNodesFromXml(root, nounPhrases);



                foreach (var item in nouns.ToList())
                {
                    foreach (var VARIABLE in matchedWords)
                    {
                        if (VARIABLE.Key.Contains(item))
                        {
                            nouns.Remove(item);    //Will work!
                        }
                    }
                }
                FindMatchingNodesFromXml(root, verbPhrases);
                FindMatchingNodesFromXml(root, nouns);


                tokens    = tokenizer.tokenize(line);
                nameSpans = nameFinder.find(tokens);

                array = Span.spansToStrings(nameSpans, tokens);
                dates = new HashSet <string>(array);



                PrintSets();

                System.Diagnostics.Debug.WriteLine("\nProcessing Dates");


                if (dates.Count == 2)
                {
                    if (dates.ElementAt(0).contains("from"))
                    {
                        var           a       = dates.ElementAt(0).replace("from", "");
                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");
                        matchedWords.Add(a, newList);
                        newList = new List <string>();
                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dates.ElementAt(1), newList);
                    }
                    else
                    {
                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");
                        matchedWords.Add(dates.ElementAt(0), newList);
                        newList = new List <string>();
                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dates.ElementAt(1), newList);
                    }
                }

                if (dates.Count == 1)
                {
                    if (dates.ElementAt(0).contains("from"))
                    {
                        var a   = dates.ElementAt(0).replace("from", "");
                        var dts = a.Split(new[] { " to " }, StringSplitOptions.None);

                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");
                        matchedWords.Add(dts[0], newList);
                        newList = new List <string>();
                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dts[1], newList);
                    }
                    else
                    {
                        List <string> newList = new List <string>();
                        newList.Add("START_PERIOD");

                        newList.Add("END_PERIOD");
                        //todo fix when the date is the same here
                        matchedWords.Add(dates.ElementAt(0), newList);
                    }
                }

                System.Diagnostics.Debug.WriteLine("\nProcessing noun phrases");

                //                var manager = new Manager();
                //                var serializer = new XmlSerializer(typeof(Manager.language));
                //                var loadStream = new FileStream("file2.xml", FileMode.Open, FileAccess.Read);
                //                var loadedObject = (Manager.language) serializer.Deserialize(loadStream);



                FindMatchingNodesFromXml(root, nounPhrases);
                FindMatchingNodesFromXml(root, verbPhrases);
                FindMatchingNodesFromXml(root, nouns);

                foreach (var variable in matchedWords)
                {
                    System.Diagnostics.Debug.WriteLine(variable.Value + "\t\t" + variable.Key);
                }

                doc = null;
                GC.Collect();
                GC.WaitForPendingFinalizers();
                //MATCHING WITH WORD NET
                System.Diagnostics.Debug.WriteLine(directory);
                //                var wordNet = new WordNetEngine();
                //
                //                wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.adj")), PartOfSpeech.Adjective);
                //                wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.adv")), PartOfSpeech.Adverb);
                //                wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.noun")), PartOfSpeech.Noun);
                //                wordNet.AddDataSource(new StreamReader(Path.Combine(directory, "data.verb")), PartOfSpeech.Verb);
                //
                //                wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.adj")), PartOfSpeech.Adjective);
                //                wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.adv")), PartOfSpeech.Adverb);
                //                wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.noun")), PartOfSpeech.Noun);
                //                wordNet.AddIndexSource(new StreamReader(Path.Combine(directory, "index.verb")), PartOfSpeech.Verb);
                //
                //                System.Diagnostics.Debug.WriteLine("Loading database...");
                //                wordNet.Load();
                //                System.Diagnostics.Debug.WriteLine("Load completed.");
                //                while (true)
                //                {
                //                    System.Diagnostics.Debug.WriteLine("\nType first word");
                //
                //                    var word = System.Diagnostics.Debug.ReadLine();
                //                    var synSetList = wordNet.GetSynSets(word);
                //
                //                    if (synSetList.Count == 0) System.Diagnostics.Debug.WriteLine($"No SynSet found for '{word}'");
                //
                //                    foreach (var synSet in synSetList)
                //                    {
                //                        var words = string.Join(", ", synSet.Words);
                //
                //                        System.Diagnostics.Debug.WriteLine($"\nWords: {words}");
                //                    }
                //                }
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }
            finally
            {
                if (modelIn1 != null)
                {
                    try
                    {
                        modelIn1.close();
                    }
                    catch (IOException e)
                    {
                    }
                }

                if (modelIn2 != null)
                {
                    try
                    {
                        modelIn2.close();
                    }
                    catch (IOException e)
                    {
                    }
                }



                //            truncateLists(ref nounPhrases);
                //            truncateLists(ref nouns);
                //            truncateLists(ref dates);
                //            truncateLists(ref verbPhrases);
            }



            return(matchedWords);
        }
Code Example #27
File: Menu.cs Project: JJColeman/jcoleman_Capstone
        public string grabPossiblePunSentences(string currentSentence)
        {
            try
            {
                java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
                TokenizerModel model = new TokenizerModel(modelIn);

                Tokenizer tokenizer = new TokenizerME(model);

                string[] words = tokenizer.tokenize(currentSentence);
                List<string> possibleSentences = new List<string>();

                Homonyms homonyms = new Homonyms();
                // Create the generator once; re-seeding inside the loop would repeat values.
                Random random = new Random();

                for (int i = 0; i < words.Length; i++)
                {
                    System.Console.WriteLine();
                    Homonym homonym = homonyms.findWordInList(words[i]);

                    if (homonym.homonyms != null)
                    {
                        string possibleSentence = "";
                        for (int r = 0; r < words.Length; r++)
                        {
                            if (words[i].Equals(words[r]))
                            {
                                int randomNumber = random.Next(homonym.homonyms.Length);
                                possibleSentence += " " + homonym.homonyms[randomNumber];
                            }
                            else
                            {
                                possibleSentence += " " + words[r];
                            }
                        }
                        possibleSentences.Add(possibleSentence);
                    }
                }
                currentSentence = choosePossiblePunSentence(currentSentence, possibleSentences);
            }
            catch (Exception e)
            {
                // Report the error and fall back to the original sentence.
                System.Console.WriteLine(e.Message);
            }

            return currentSentence;
        }
Code Example #28
 public NLPTokenizerOp()
 {
     tokenizer = null;
 }
Code Example #29
 public NLPTokenizerOp(TokenizerModel model)
 {
     tokenizer = new TokenizerME(model);
 }
Code Example #30
 public Tokenizer(TokenizerModel model)
 {
     this.tokenizer = new TokenizerME(model);
 }
Code Example #31
        public Tokenizer(FileStream modelStream)
        {
            TokenizerModel model = new TokenizerModel(modelStream);

            this.tokenizer = new TokenizerME(model);
        }
Code Example #32
File: Menu.cs Project: JJColeman/jcoleman_Capstone
        public void giveDefinitionAndHomonym(string currentSentence)
        {
            try
            {
                java.io.InputStream modelIn = new java.io.FileInputStream(@"C:\en-token.bin");
                TokenizerModel model = new TokenizerModel(modelIn);

                Tokenizer tokenizer = new TokenizerME(model);

                string[] words = tokenizer.tokenize(currentSentence);

                Homonyms homonyms = new Homonyms();

                for (int i = 0; i < words.Length; i++)
                {
                    System.Console.WriteLine();
                    Homonym homonym = homonyms.findWordInList(words[i]);

                    if (homonym.homonyms == null)
                    {
                        System.Console.WriteLine("No homonyms found for: " + words[i]);
                    }

                    else
                    {
                        List<string> selectedHomonyms = homonym.selectedHomonyms();

                        System.Console.WriteLine("Homonyms are: " + words[i]);
                        foreach (string selectedWord in selectedHomonyms)
                        {
                            System.Console.Write(selectedWord + ",");
                        }
                    }

                    System.Console.WriteLine();
                    System.Console.WriteLine("Definition for: " + words[i]);
                    using (WebClient client = new WebClient())
                    {
                        string line = client.DownloadString("http://api.wordnik.com/v4/word.json/" + words[i] + "/definitions?limit=200&includeRelated=true&useCanonical=false&includeTags=false&api_key=a2a73e7b926c924fad7001ca3111acd55af2ffabf50eb4ae5");
                        if (!line.Equals("[]"))
                        {
                            string[] lines1 = System.Text.RegularExpressions.Regex.Split(line, "\"text\":\"");
                            string[] lines2 = System.Text.RegularExpressions.Regex.Split(lines1[1], "\",\"sequence\"[\\W\\w]+");
                            System.Console.WriteLine(lines2[0]);
                        }
                        else
                        {
                            System.Console.WriteLine("Definition cannot be found, word is mispelled or doesn't exist within our current data");
                        }
                    }

                }
            }
            catch (Exception e)
            {
                System.Console.WriteLine(e.Message);
            }
        }
Code Example #33
File: TextProcessor.cs Project: baio/d-mill
        public static IEnumerable<IEnumerable<ChunkItem>> GetChunks(IEnumerable<string> Sentences)
        {
            var posModelStream = new java.io.ByteArrayInputStream(Resource.en_pos_maxent);//new java.io.FileInputStream(@"C:\dev\d-mill\TextProcessing\OpenNLP\Models\en-pos-maxent.bin");

            var posModel = new POSModel(posModelStream);

            var pos = new POSTaggerME(posModel);

            var modelStream = new java.io.ByteArrayInputStream(Resource.en_token); //java.io.FileInputStream(@"C:\dev\d-mill\TextProcessing\OpenNLP\Models\en-token.bin");

            var model = new TokenizerModel(modelStream);

            var tokenizer = new TokenizerME(model);

            var chunkerModelStream = new java.io.ByteArrayInputStream(Resource.en_chunker);

            var chunkerModel = new ChunkerModel(chunkerModelStream);

            var chunker = new ChunkerME(chunkerModel);

            return Sentences.Select(p => {

                var tokens = tokenizer.tokenize(p);

                var tags = pos.tag(tokens);

                var chunks = chunker.chunk(tokens, tags);

                var res = new List<ChunkItem>();

                for (var i = 0; i < chunks.Length; i++)
                {
                    res.Add(new ChunkItem { token = tokens[i], tag = tags[i], chunk = chunks[i] });
                }

                return res;
            });
        }
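
A possible call sketch for the tokenize/tag/chunk pipeline above (the sample sentence is illustrative):

            var sentences = new[] { "The quick brown fox jumps over the lazy dog." };
            foreach (var sentence in TextProcessor.GetChunks(sentences))
            {
                foreach (var item in sentence)
                {
                    Console.WriteLine(item.token + "\t" + item.tag + "\t" + item.chunk);
                }
            }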
Code Example #34
 private static TokenizerModel Train(TokenizerFactory factory)
 {
     return(TokenizerME.Train(CreateSampleStream(), factory, TrainingParameters.DefaultParameters()));
 }
Code Example #35
 public Tokenizer()
 {
     this.tokenizer = new TokenizerME(TrainModel(Environment.CurrentDirectory + TRAINING_MODEL_PATH));
 }