예제 #1
0
        /// <summary>
        /// Tokenizes a sentence, passes every token accepted by <c>Program.Applywords</c>
        /// through <c>Correct</c>, and concatenates the results into a single string.
        /// </summary>
        /// <param name="sent">The sentence to process.</param>
        /// <returns>
        /// The concatenated tokens. A single space is appended after each resulting token for
        /// which <c>Program.Applywords</c> returns true; every other token is appended as-is.
        /// </returns>
        public string CorrectSentence(string sent)
        {
            // The input is handled as one sentence, so no sentence detector is needed:
            // only the tokenizer model is loaded.
            var tokenizer = new EnglishMaximumEntropyTokenizer("EnglishTok.nbin");

            // Materialized so that every token is corrected before the spacing pass starts.
            var corrected = tokenizer.Tokenize(sent)
                            .Select(token => Program.Applywords(token) ? Correct(token) : token)
                            .ToList();

            // Spacing is decided on the corrected token, not on the original one.
            return string.Join("", corrected.Select(word => Program.Applywords(word) ? word + " " : word));
        }
예제 #2
0
        /// <summary>
        /// Splits a sentence into tokens with the English maximum-entropy tokenizer, loading the
        /// model from the <c>Models</c> folder located next to the running executable.
        /// </summary>
        /// <param name="sentence">The sentence to tokenize.</param>
        /// <returns>The tokens produced by the tokenizer.</returns>
        private string[] MyTokenizer(string sentence)
        {
            var executablePath = Process.GetCurrentProcess().MainModule.FileName;
            var tokenizerModel = Path.GetDirectoryName(executablePath) + @"\Models\EnglishTok.nbin";

            return new EnglishMaximumEntropyTokenizer(tokenizerModel).Tokenize(sentence);
        }
예제 #3
0
        /// <summary>
        /// Tokenizes a sentence with the shared tokenizer, creating it from
        /// <c>mModelPath + "EnglishTok.nbin"</c> the first time it is needed.
        /// </summary>
        /// <param name="sentence">The sentence to tokenize.</param>
        /// <returns>The tokens produced by the tokenizer.</returns>
        public static string[] TokenizeSentence(string sentence)
        {
            // Reuse the cached tokenizer when there is one; otherwise build and cache it.
            var tokenizer = mTokenizer
                            ?? (mTokenizer = new OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer(mModelPath + "EnglishTok.nbin"));

            return tokenizer.Tokenize(sentence);
        }
예제 #4
0
        /// <summary>
        /// Tokenizes a sentence, building the tokenizer from
        /// <c>_modelPath + "EnglishTok.nbin"</c> on first use and reusing it afterwards.
        /// </summary>
        /// <param name="sentence">The sentence to tokenize.</param>
        /// <returns>The tokens produced by the tokenizer.</returns>
        internal string[] TokenizeSentence(string sentence)
        {
            if (_mTokenizer != null)
            {
                // Tokenizer already loaded by an earlier call
                return _mTokenizer.Tokenize(sentence);
            }

            _mTokenizer = new EnglishMaximumEntropyTokenizer(_modelPath + "EnglishTok.nbin");
            return _mTokenizer.Tokenize(sentence);
        }
예제 #5
0
        /// <summary>
        /// Tokenizes a sentence, loading the tokenizer model from
        /// <c>mModelPath + @"\EnglishTok.nbin"</c> the first time the method is called.
        /// </summary>
        /// <param name="sentence">The sentence to tokenize.</param>
        /// <returns>The tokens produced by the tokenizer.</returns>
        public string[] TokenizeSentence(string sentence)
        {
            var tokenizer = mTokenizer;

            if (tokenizer == null)
            {
                // First call: load the model and keep the tokenizer for later calls
                tokenizer  = new EnglishMaximumEntropyTokenizer(mModelPath + @"\EnglishTok.nbin");
                mTokenizer = tokenizer;
            }

            return tokenizer.Tokenize(sentence);
        }
예제 #6
0
        /// <summary>
        /// Splits a corpus into sentences and lazily yields one <c>Sentence</c> per detected sentence.
        /// </summary>
        /// <param name="corpus">The text to split into sentences.</param>
        /// <returns>
        /// A lazily evaluated sequence; each item is created from the sentence text and its
        /// lower-cased tokens, with the tokens contained in <c>specialTokens</c> removed.
        /// </returns>
        public IEnumerable <Sentence> Parse(string corpus)
        {
            foreach (var sentenceText in _sentenceDetector.SentenceDetect(corpus))
            {
                // Drop the special tokens first, then lower-case what is left
                var loweredTokens = (from token in _tokenizer.Tokenize(sentenceText)
                                     where !specialTokens.Contains(token)
                                     select token.ToLower()).ToArray();

                yield return Sentence.Create(sentenceText, loweredTokens);
            }
        }
예제 #7
0
        /// <summary>
        /// Breaks a text into tokens (words, symbols or other meaningful elements) using the
        /// English maximum-entropy tokenizer model.
        /// </summary>
        /// <param name="txt">Text to tokenize.</param>
        /// <param name="path">
        /// Prefix that is concatenated verbatim with "EnglishTok.nbin" to locate the model file;
        /// no directory separator is inserted between the two.
        /// </param>
        /// <returns>The tokens found in <paramref name="txt"/>.</returns>
        public static string[] Tokenizer(this string txt, string path)
        {
            // English model files; they have to be downloaded separately.
            // Do not forget to add: using OpenNLP.Tools.Tokenize;
            return new EnglishMaximumEntropyTokenizer(path + "EnglishTok.nbin").Tokenize(txt);
        }
예제 #8
0
        /// <summary>
        /// Splits English text into tokens and keeps only the tokens that contain at least one
        /// word or whitespace character.
        /// </summary>
        /// <param name="text">Text to split.</param>
        /// <returns>The retained tokens, as a materialized list.</returns>
        /// <exception cref="Exception">
        /// Thrown when <c>modelPath</c> is null or empty, or does not point to an existing file.
        /// </exception>
        public IEnumerable <string> Tokenize(string text)
        {
            var modelAvailable = !string.IsNullOrEmpty(modelPath) && File.Exists(modelPath);

            if (!modelAvailable)
            {
                throw new Exception("Failed to access NLP tokenizer model");
            }

            // Tokens without any word/whitespace character (e.g. pure punctuation) are dropped
            return (from candidate in tok.Tokenize(text)
                    where Regex.IsMatch(candidate, @"[\w\s]")
                    select candidate).ToList();
        }
예제 #9
0
        /// <summary>
        /// Runs sentence detection, tokenization, POS tagging and chunking on a short review text
        /// and checks the number of items produced by each stage.
        /// </summary>
        public void Test()
        {
            const string review = "1980 was certainly a year for bad backwoods slasher movies. \"Friday The 13th\" and \"The Burning\" may have been the best ones but there were like always a couple of stinkers not far behind like \"Don't Go Into The Woods Alone\" and this one. But in all fairness \"The Prey\" is nowhere near as bad as \"Don't Go Into The Woods\" but it's still not great either.";

            // Stage 1: sentence detection
            var detectedSentences = sentenceDetector.SentenceDetect(review);
            Assert.AreEqual(3, detectedSentences.Length);

            // Stage 2: tokenization of the first sentence only
            var firstSentenceTokens = tokenizer.Tokenize(detectedSentences[0]);
            Assert.AreEqual(11, firstSentenceTokens.Length);

            // Stage 3: one POS tag is expected per token
            var posTags = postTagger.Tag(firstSentenceTokens);
            Assert.AreEqual(11, posTags.Length);

            // Stage 4: chunking of the tagged tokens
            var sentenceChunks = chunker.GetChunks(firstSentenceTokens, posTags);
            Assert.AreEqual(7, sentenceChunks.Length);
        }
예제 #10
0
        /// <summary>
        /// Reads a whole text file, splits it into sentences and returns the tokens of all
        /// sentences as one flat list, in reading order.
        /// </summary>
        /// <param name="path">Path of the text file to read.</param>
        /// <returns>All tokens of the file.</returns>
        public static List <string> WorkFile(string path)
        {
            string fileContent;
            EnglishMaximumEntropyTokenizer        wordTokenizer;
            EnglishMaximumEntropySentenceDetector sentenceSplitter;

            using (var reader = new StreamReader(path))
            {
                // The models are loaded while the file is open, then the file is read in one go;
                // leaving the using block closes the reader right after the read.
                wordTokenizer    = new EnglishMaximumEntropyTokenizer("EnglishTok.nbin");
                sentenceSplitter = new EnglishMaximumEntropySentenceDetector("EnglishSD.nbin");
                fileContent      = reader.ReadToEnd();
            }

            var allTokens = new List <string>();
            foreach (var sentence in sentenceSplitter.SentenceDetect(fileContent))
            {
                allTokens.AddRange(wordTokenizer.Tokenize(sentence));
            }

            return allTokens;
        }
        /// <summary>
        /// Checks whether a sentence contains the root of a phrasal verb followed, in order, by
        /// all of its particles. Other tokens may appear between the matched ones.
        /// </summary>
        /// <param name="sentence">The sentence to inspect.</param>
        /// <param name="phrasalVerb">The phrasal verb (root and particles) to look for.</param>
        /// <returns>
        /// True as soon as the last particle has been matched; false otherwise, including when
        /// the phrasal verb has no particles at all.
        /// </returns>
        public bool IsMatch(string sentence, PhrasalVerb phrasalVerb)
        {
            var rootFound         = false;
            var nextParticleIndex = 0;

            foreach (var token in tokenizer.Tokenize(sentence))
            {
                if (!rootFound)
                {
                    // Phase 1: look for the root, either verbatim or through its lemma
                    rootFound = string.Equals(token, phrasalVerb.Root, StringComparison.InvariantCultureIgnoreCase)
                                || string.Equals(lemmatizer.Lemmatize(token), phrasalVerb.Root,
                                                 StringComparison.InvariantCultureIgnoreCase);
                    continue;
                }

                // Phase 2: consume the particles one after the other
                if (nextParticleIndex >= phrasalVerb.Particles.Count)
                {
                    continue;
                }

                var expectedParticle = phrasalVerb.Particles[nextParticleIndex];
                if (!string.Equals(token, expectedParticle, StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                nextParticleIndex++;
                if (nextParticleIndex >= phrasalVerb.Particles.Count)
                {
                    // every particle has been found
                    return true;
                }
            }

            // the tokens ran out before the whole phrasal verb was matched
            return false;
        }
예제 #12
0
        /// <summary>
        /// Extracts the relevant words of a tweet: the text is tokenized and POS-tagged, then
        /// mentions, absolute URIs and single-character non-numeric tokens are discarded.
        /// </summary>
        /// <param name="tweetText">The raw text of the tweet.</param>
        /// <returns>
        /// The cleaned tokens (after removing everything matched by <c>pattern</c>) whose POS tag
        /// is listed in <c>ALLOWED_TAGS</c> and which are not empty after cleaning.
        /// </returns>
        public List <String> ParseTweet(String tweetText)
        {
            // Capitalize the stand-alone pronoun "i" when it is followed by a space. The word
            // boundary keeps words that merely end in "i" ("hi ", "taxi ") intact, which a plain
            // Replace("i ", "I ") would turn into "hI " / "taxI ".
            tweetText = Regex.Replace(tweetText, @"\bi(?= )", "I");

            var      tokenizer = new EnglishMaximumEntropyTokenizer(TOK_MODEL_PATH);
            string[] tokens    = tokenizer.Tokenize(tweetText);

            var posTagger = new EnglishMaximumEntropyPosTagger(POS_MODEL_PATH, TAG_DICT_PATH);
            // NOTE(review): wordTags is indexed in parallel with tokens below - assumes the tagger
            // returns exactly one tag per token; confirm against the tagger's contract.
            string[] wordTags = posTagger.Tag(tokens);

            List <String> result = new List <string>();

            for (int i = 0; i < tokens.Length; i++)
            {
                string rawToken = tokens[i];

                if (rawToken.StartsWith("@", StringComparison.Ordinal))
                {
                    // user mention
                    continue;
                }

                if (Uri.IsWellFormedUriString(rawToken, UriKind.Absolute))
                {
                    // link
                    continue;
                }

                if (rawToken.Length <= 1 && !int.TryParse(rawToken, out _))
                {
                    // single characters are only kept when they are digits
                    continue;
                }

                String token = Regex.Replace(rawToken, pattern, "");
                if (ALLOWED_TAGS.Contains(wordTags[i]) && token.Length > 0)
                {
                    result.Add(token);
                }
            }

            return result;
        }
예제 #13
0
 /// <summary>
 /// Tokenizes a sentence by delegating to <c>mTokenizer</c>.
 /// </summary>
 /// <param name="sentence">The sentence to tokenize.</param>
 /// <returns>The tokens returned by the tokenizer.</returns>
 public string[] TokenizeSentence(string sentence)
 {
     string[] sentenceTokens = mTokenizer.Tokenize(sentence);

     return sentenceTokens;
 }
예제 #14
0
 /// <summary>
 /// Tokenizes a sentence by delegating to <c>tokenizer</c>.
 /// </summary>
 /// <param name="sentence">The sentence to tokenize.</param>
 /// <returns>The tokens returned by the tokenizer.</returns>
 public IEnumerable <string> Tokenize(string sentence)
 {
     IEnumerable <string> sentenceTokens = tokenizer.Tokenize(sentence);

     return sentenceTokens;
 }
        /*public bool IsMatch(string sentence, PhrasalVerb phrasalVerb)
         * {
         *  var tokens = tokenizer.Tokenize(sentence);
         *  var pv = MatchingPhrasalVerbs(sentence, new List<PhrasalVerb>() {phrasalVerb});
         *  return pv.Any();
         * }*/

        /*public List<PhrasalVerb> MatchingPhrasalVerbs(string sentence, List<PhrasalVerb> phrasalVerbs)
         * {
         *  // tokenize sentence
         *  var tokens = tokenizer.Tokenize(sentence);
         *  var taggedWords = tagger.Tag(tokens)/*.Where(t => Regex.IsMatch(t, "[A-Z]+")).ToList()#1#;
         *  // create parse tree
         *  var parse = parser.DoParse(tokens);
         *  // retrieve dependencies
         *  var dependencies = ComputeDependencies(parse).ToList();
         *
         *  // compute matching phrasal verbs
         *  var matchingPhrasalVerbs = new List<PhrasalVerb>();
         *  foreach (var phrasalVerb in phrasalVerbs)
         *  {
         *      // get relevant dependencies found
         *      var parts = phrasalVerb.Name.Split(' ');
         *      var root = parts.First();
         *      // find dependencies for this root
         *      var relevantDepedencies = dependencies
         *          .Where(
         *              d =>
         *                  ((string.Equals(root, lemmatizer.Lemmatize(d.Gov().GetWord()),
         *                      StringComparison.InvariantCultureIgnoreCase) && d.Gov().Index() < d.Dep().Index())
         ||
         ||                  (string.Equals(root, lemmatizer.Lemmatize(d.Dep().GetWord()),
         ||                      StringComparison.InvariantCultureIgnoreCase) && d.Dep().Index() < d.Gov().Index()))
         ||                 && (!phrasalVerb.Inseparable || Math.Abs(d.Dep().Index() - d.Gov().Index()) == 1)
         ||                     // for non separable verbs
         ||                 && (!phrasalVerb.SeparableMandatory || Math.Abs(d.Dep().Index() - d.Gov().Index()) > 1)
         ||         // for separable mandatory verbs
         ||         //&& d.Gov().Index() >= 1 && IsVerb(taggedWords[d.Gov().Index() - 1])
         ||         )
         ||         .ToList();
         ||
         ||     // We take only the 2nd part
         ||     // For phrasal verbs with several particles, that's a good approximation for now
         ||     // (we could check that all the particles are also linked)
         ||     if (relevantDepedencies.Any() && parts.Count() > 1)
         ||     {
         ||         var particle1 = parts[1];
         ||         var prtDependencies = relevantDepedencies.Where(d => d.Reln().GetShortName() == "prt").ToList();
         ||         if (prtDependencies.Any())
         ||         {
         ||             // if root has a prt dependency, don't look at other relations
         ||             if (prtDependencies
         ||                 .Any(d => string.Equals(particle1, d.Dep().GetWord(),StringComparison.InvariantCultureIgnoreCase)
         || string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase)))
         ||             {
         ||                 matchingPhrasalVerbs.Add(phrasalVerb);
         ||             }
         ||         }
         ||         else
         ||         {
         ||             // otherwise, look at all the other relations
         ||             var relevantRelationships = relevantDepedencies
         ||                 .Where(d => string.Equals(particle1, d.Dep().GetWord(), StringComparison.InvariantCultureIgnoreCase)
         || string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase))
         ||                 .ToList();
         ||             if (relevantRelationships.Any())
         ||             {
         ||                 matchingPhrasalVerbs.Add(phrasalVerb);
         ||             }
         ||         }
         ||     }
         || }
         || return matchingPhrasalVerbs;
         ||}
         ||
         ||private IEnumerable<TypedDependency> ComputeDependencies(Parse parse)
         ||{
         || // Extract dependencies from lexical tree
         || var tlp = new PennTreebankLanguagePack();
         || var gsf = tlp.GrammaticalStructureFactory();
         || var tree = new ParseTree(parse);
         || try
         || {
         ||     var gs = gsf.NewGrammaticalStructure(tree);
         ||     return gs.TypedDependencies();
         || }
         || catch (Exception)
         || {
         ||     Console.WriteLine("Exception when computing deps for {0}", parse);
         ||     return new List<TypedDependency>();
         || }
         ||}*/

        /// <summary>
        /// Returns the phrasal verbs of <paramref name="phrasalVerbs"/> that are found in
        /// <paramref name="sentence"/>, based on the typed dependencies of its parse tree.
        /// </summary>
        /// <remarks>
        /// A phrasal verb matches when:
        /// its root (first word of <c>Name</c>, compared against the lemmatized words) takes part
        /// in a dependency in which the root side comes first; that dependency respects the
        /// <c>Inseparable</c> / <c>SeparableMandatory</c> constraints; the first particle is the
        /// other side of such a dependency ("prt" relations are preferred when there are any);
        /// <c>AreWordSeparatedInSentence</c> does not reject the dependency; and the remaining
        /// particles, if any, are exactly the tokens that directly follow in the sentence.
        /// Each phrasal verb is added to the result at most once per occurrence in
        /// <paramref name="phrasalVerbs"/>.
        /// </remarks>
        /// <param name="sentence">The sentence to analyze.</param>
        /// <param name="phrasalVerbs">The candidate phrasal verbs.</param>
        /// <returns>The candidates that match the sentence, in input order.</returns>
        public List <PhrasalVerb> MatchingPhrasalVerbs(string sentence, List <PhrasalVerb> phrasalVerbs)
        {
            // tokenize sentence
            var tokens = tokenizer.Tokenize(sentence);
            // create parse tree
            var parse = parser.DoParse(tokens);
            // retrieve dependencies
            var dependencies = ComputeDependencies(parse).ToList();

            var matchingPhrasalVerbs = new List <PhrasalVerb>();

            foreach (var phrasalVerb in phrasalVerbs)
            {
                var parts = phrasalVerb.Name.Split(' ').ToList();
                var root  = parts.First();

                // find dependencies for this root
                var rootRelatedDependencies = dependencies
                                              .Where(d => // the (lemmatized) token must be equal to the gov/dep of the dependency
                                                     ((string.Equals(root, lemmatizer.Lemmatize(d.Gov().GetWord()), StringComparison.InvariantCultureIgnoreCase) &&
                                                       d.Gov().Index() < d.Dep().Index()) ||
                                                      (string.Equals(root, lemmatizer.Lemmatize(d.Dep().GetWord()), StringComparison.InvariantCultureIgnoreCase) &&
                                                       d.Dep().Index() < d.Gov().Index()))
                                                     // if the phrasal verb is inseparable, no word must be between the root and the particle
                                                     && (!phrasalVerb.Inseparable.HasValue || (!phrasalVerb.Inseparable.Value || Math.Abs(d.Dep().Index() - d.Gov().Index()) == 1))
                                                     // if the phrasal verb is mandatorily separable, at least one word must be between the root and the particle
                                                     && (!phrasalVerb.SeparableMandatory.HasValue || (!phrasalVerb.SeparableMandatory.Value || Math.Abs(d.Dep().Index() - d.Gov().Index()) > 1))
                                                     )
                                              .ToList();

                // A phrasal verb needs at least one particle and one dependency involving its root
                if (!rootRelatedDependencies.Any() || parts.Count <= 1)
                {
                    continue;
                }

                // Only the first particle is checked through the dependencies; the other
                // particles are checked positionally below.
                var particle1            = parts[1];
                var relevantDependencies = rootRelatedDependencies.Where(d => d.Reln().GetShortName() == "prt").ToList();
                if (!relevantDependencies.Any())
                {
                    // if no "prt" relation, take all relations whatsoever.
                    relevantDependencies = rootRelatedDependencies;
                }

                // if one of the relevant dependencies has the particle as gov/dep, it's good!
                var rootParticle1Dependency = relevantDependencies
                                              .FirstOrDefault(d => string.Equals(particle1, d.Dep().GetWord(), StringComparison.InvariantCultureIgnoreCase) ||
                                                              string.Equals(particle1, d.Gov().GetWord(), StringComparison.InvariantCultureIgnoreCase));
                if (rootParticle1Dependency == null || AreWordSeparatedInSentence(rootParticle1Dependency, dependencies))
                {
                    continue;
                }

                var remainingParts = parts.Skip(2).ToList();
                // NOTE(review): the "- 1" presumably converts a 1-based dependency index into a
                // 0-based token index - confirm against the Index() contract.
                var lastTokenIndex      = Math.Max(rootParticle1Dependency.Gov().Index(), rootParticle1Dependency.Dep().Index()) - 1;
                var endOfSentenceTokens = tokens.Skip(lastTokenIndex + 1).ToList();

                // The remaining particles must directly follow the matched root/particle pair.
                // The phrasal verb is added once, whatever the number of trailing tokens, and
                // it also matches when the particles are the very last tokens of the sentence.
                if (TokensStartWithParts(endOfSentenceTokens, remainingParts))
                {
                    matchingPhrasalVerbs.Add(phrasalVerb);
                }
            }

            return(matchingPhrasalVerbs);
        }

        /// <summary>
        /// Tells whether <paramref name="candidateTokens"/> begins with all the
        /// <paramref name="expectedParts"/>, in order, ignoring case (always true for no parts).
        /// </summary>
        private static bool TokensStartWithParts(List <string> candidateTokens, List <string> expectedParts)
        {
            if (expectedParts.Count > candidateTokens.Count)
            {
                return false;
            }

            for (var i = 0; i < expectedParts.Count; i++)
            {
                if (!string.Equals(expectedParts[i], candidateTokens[i], StringComparison.InvariantCultureIgnoreCase))
                {
                    return false;
                }
            }

            return true;
        }
 /// <summary>
 /// Splits a text into tokens by delegating to <c>_tokenizer</c>.
 /// </summary>
 /// <param name="text">The text to tokenize.</param>
 /// <returns>The tokens returned by the tokenizer.</returns>
 public string[] GetTokens(string text)
 {
     string[] textTokens = _tokenizer.Tokenize(text);

     return textTokens;
 }
예제 #17
0
파일: Program.cs 프로젝트: quangfox/OpenNlp
        /// <summary>
        /// Scratch entry point: tokenizes and parses a sample sentence, prints its parse tree and
        /// its typed dependencies, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        private static void Main(string[] args)
        {
            // Earlier experiments (console output redirection, tokenizer and sentence-detector
            // training, detokenization, tokenization checks) used to live here as commented-out
            // code.

            // parsing
            var input            = "This is a generic bank response, which indicates simply that they are not willing to accept the transaction.";
            var englishTokenizer = new EnglishMaximumEntropyTokenizer(currentDirectory + "../Resources/Models/EnglishTok.nbin");
            var inputTokens      = englishTokenizer.Tokenize(input);

            var modelsFolder   = currentDirectory + "../Resources/Models/";
            var treebankParser = new OpenNLP.Tools.Parser.EnglishTreebankParser(modelsFolder, true, false);
            var parseResult    = treebankParser.DoParse(inputTokens);

            // Extract dependencies from the lexical tree
            var structureFactory = new PennTreebankLanguagePack().GrammaticalStructureFactory();
            var parseTree        = new ParseTree(parseResult);

            Console.WriteLine(parseTree);

            foreach (var dependency in structureFactory.NewGrammaticalStructure(parseTree).TypedDependencies())
            {
                Console.WriteLine(dependency);
            }

            Console.WriteLine("===========");
            Console.WriteLine("OK");
            Console.ReadKey();
        }