Example #1
        public POSTagger()
        {
            string modelsPath = Directory.GetParent(Directory.GetCurrentDirectory()).Parent.Parent.FullName;

            modelsPath = Path.Combine(modelsPath, "models");
            _posTagger = new EnglishMaximumEntropyPosTagger(Path.Combine(modelsPath, "EnglishPOS.nbin"), Path.Combine(modelsPath, "tagdict"));
        }
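A tagger constructed this way is used by passing an already-tokenized sentence to Tag, which returns one Penn Treebank tag per token. A minimal usage sketch (the TagSample method and its sample tokens are illustrative; only the Tag(string[]) call is taken from the examples below):

        public string[] TagSample()
        {
            // Tokens would normally come from a tokenizer; this array is just an illustration.
            string[] tokens = { "The", "quick", "brown", "fox", "jumps", "." };

            // Tag returns one part-of-speech tag per input token, e.g. "DT" or "VBZ".
            return(_posTagger.Tag(tokens));
        }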
Example #2
        /// <summary>
        /// Loads all the necessary resources
        /// for the library
        /// </summary>
        public void LoadLibrary()
        {
            try
            {
                HelperMethods.CreateResourceInFileSystem("EnglishSD.nbin");
                HelperMethods.CreateResourceInFileSystem("EnglishTok.nbin");
                HelperMethods.CreateResourceInFileSystem("EnglishPOS.nbin");

                sentenceDetector = new EnglishMaximumEntropySentenceDetector("EnglishSD.nbin");
                tokenizer        = new EnglishMaximumEntropyTokenizer("EnglishTok.nbin");
                posTagger        = new EnglishMaximumEntropyPosTagger("EnglishPOS.nbin");

                positive_words   = HelperMethods.Import_PositiveWords();  //data of positive words
                negative_words   = HelperMethods.Import_NegativeWords();  //data of negative words
                emotion_words    = HelperMethods.Import_EmotionWords();   //data of emotion words with 5 expression values
                inclusion_values = HelperMethods.Import_InclusiveWords(); //data of inclusion with rate
                exclusion_values = HelperMethods.Import_ExclusionWords(); //data of exclusion with rate

                isLibraryLoaded = true;
            }
            catch (Exception e)
            {
                isLibraryLoaded = false;
                throw new Exception("Error in library load state", e);
            }
        }
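Once LoadLibrary() has succeeded, the three models are typically chained: detect sentences, tokenize each sentence, then tag the tokens. A minimal sketch, assuming SharpNLP's SentenceDetect/Tokenize/Tag methods and an illustrative TagText helper:

        public void TagText(string text)
        {
            foreach (string sentence in sentenceDetector.SentenceDetect(text))
            {
                string[] tokens = tokenizer.Tokenize(sentence); // split the sentence into tokens
                string[] tags   = posTagger.Tag(tokens);        // one Penn Treebank tag per token

                for (int i = 0; i < tokens.Length; i++)
                {
                    Console.WriteLine("{0}/{1}", tokens[i], tags[i]);
                }
            }
        }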
Example #3
        // [Explicit-Dispose]
        public void Dispose()
        {
            if (_wn != null)
            {
                _wn.Dispose();
            }

            _tokenizer        = null;
            _sentenceDetector = null;
            _posTagger        = null;
            _chunker          = null;

            // Dispose CLI/C++ Dll
            ap = null;

            // Dispose all KB plugins
            if (PlugInsNumber > 0)
            {
                for (int i = 0; i < PlugInsNumber; i++)
                {
                    KBDrivers[i] = null;
                    KBDriversQueryPointers[i] = null;
                }
            }
        }
Example #4
 private void initComponents()
 {
     sentenceDetector = new EnglishMaximumEntropySentenceDetector(Path.Combine(ModelDir, "EnglishSD.nbin"));
     tokenizer        = new EnglishMaximumEntropyTokenizer(Path.Combine(ModelDir, "EnglishTok.nbin"));
     posTagger        = new EnglishMaximumEntropyPosTagger(Path.Combine(ModelDir, "EnglishPOS.nbin"));
     chunker          = new EnglishTreebankChunker(Path.Combine(ModelDir, "EnglishChunk.nbin"));
     parser           = new EnglishTreebankParser(FileUtils.WithSeparator(ModelDir), true, false);
 }
Example #5
 public WordNetBoostrap(string nlpModelsPath, string wordNetPath)
 {
     this._wordNetPath = wordNetPath;
     _wn               = new WordNetEngine(_wordNetPath, true);
     _tokenizer        = new EnglishRuleBasedTokenizer(false);
     _sentenceDetector = new EnglishMaximumEntropySentenceDetector(nlpModelsPath + "EnglishSD.nbin");
     _posTagger        = new EnglishMaximumEntropyPosTagger(nlpModelsPath + "EnglishPOS.nbin", nlpModelsPath + @"\Parser\tagdict");
 }
Example #6
 public ParseBasedPhrasalVerbDetector(EnglishTreebankParser parser, Lemmatizer lemmatizer,
                                      EnglishMaximumEntropyTokenizer tokenizer, EnglishMaximumEntropyPosTagger tagger)
 {
     this.parser     = parser;
     this.lemmatizer = lemmatizer;
     this.tokenizer  = tokenizer;
     this.tagger     = tagger;
 }
Example #7
        private string[] PosTagTokens(string[] tokens)
        {
            if (_posTagger == null)
            {
                _posTagger = new EnglishMaximumEntropyPosTagger(_modelPath + "EnglishPOS.nbin", _modelPath + @"\Parser\tagdict");
            }

            return(_posTagger.Tag(tokens));
        }
Example #8
        public static string[] PosTagTokens(string[] tokens)
        {
            if (mPosTagger == null)
            {
                mPosTagger = new OpenNLP.Tools.PosTagger.EnglishMaximumEntropyPosTagger(mModelPath + "EnglishPOS.nbin", mModelPath + @"\Parser\tagdict");
            }

            return(mPosTagger.Tag(tokens));
        }
Example #9
        private string[] MyPosTagger(string str)
        {
            var modelPath  = Path.GetDirectoryName(Process.GetCurrentProcess().MainModule.FileName) + @"\Models\EnglishPOS.nbin";
            var tagDictDir = Path.GetDirectoryName(Process.GetCurrentProcess().MainModule.FileName) + @"\Models\Parser\tagdict";
            //var posTagger = EnglishMaximumEntropyPosTagger(modelPath, tagDictDir);
            var posTagger = new EnglishMaximumEntropyPosTagger(modelPath, tagDictDir);
            var tokens    = MyTokenizer(str);
            var pos       = posTagger.Tag(tokens);

            return(pos);
        }
Example #10
        public void Setup()
        {
            var path = Path.Combine(TestContext.CurrentContext.TestDirectory, @"..\..\..\Resources\Models\");

            sentenceDetector = new EnglishMaximumEntropySentenceDetector(Path.Combine(path, "EnglishSD.nbin"));
            postTagger       = new EnglishMaximumEntropyPosTagger(
                Path.Combine(path, @"EnglishPOS.nbin"),
                Path.Combine(path, @"Parser\tagdict"));
            tokenizer = new EnglishMaximumEntropyTokenizer(Path.Combine(path, "EnglishTok.nbin"));
            chunker   = new EnglishTreebankChunker(Path.Combine(path, @"EnglishChunk.nbin"));
        }
Example #11
        /// <summary>
        /// Initializes the processor
        /// </summary>
        /// <param name="rootPath">The root path</param>
        private static void Initialize(string rootPath)
        {
            string tkModelPath  = Path.Combine(rootPath, "EnglishTok.nbin");
            string sdModelPath  = Path.Combine(rootPath, "EnglishSD.nbin");
            string posModelPath = Path.Combine(rootPath, "EnglishPOS.nbin");

            _tokenizer        = new EnglishMaximumEntropyTokenizer(tkModelPath);
            _sentenceDetector = new EnglishMaximumEntropySentenceDetector(sdModelPath);
            _posTagger        = new EnglishMaximumEntropyPosTagger(posModelPath);
            _isInitialized    = true;
        }
Example #12
        public void Dispose()
        {
            _tokenizer        = null;
            _sentenceDetector = null;
            _posTagger        = null;

            SynsetArray = null;
            IsStopWord  = null;

            _wn.Dispose();
        }
Example #13
        /// <summary>
        /// A part of speech tagger assigns a part of speech (noun, verb etc.) to each token in a sentence. For the full list of part of speech abbreviations, please refer to the Penn Treebank Project (https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
        /// </summary>
        /// <param name="txt">Input text</param>
        /// <returns>one part-of-speech tag per token</returns>
        public static string[] PoS(this string txt)
        {
            //Models for English. I had to download them
            //Don't forget to add using OpenNLP.Tools.PosTagger;
            var modelPath2 = Path + "EnglishPOS.nbin";
            var posTagger  = new EnglishMaximumEntropyPosTagger(modelPath2);

            string[] tokens = txt.Tokenizer_RuleBased(); //Calls the tokenizer method above
            string[] pos    = posTagger.Tag(tokens);

            return(pos);
        }
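A short usage sketch for this extension (the sample sentence is illustrative); each element of the returned array lines up index-by-index with the tokens produced by Tokenizer_RuleBased():

            string[] tags = "The dog barked at the mailman.".PoS();
            // tags[i] is the Penn Treebank tag of the i-th token, e.g. "DT", "NN" or "VBD".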
Example #14
        private IList <Tuple <string, string> > GetPosTaggedTokens(string sentence)
        {
            var posTagger  = new EnglishMaximumEntropyPosTagger(_modelPath + "/EnglishPOS.nbin", _modelPath + @"/Parser/tagdict");
            var tokens     = _tokenizer.Tokenize(sentence);
            var taggedList = posTagger.Tag(tokens);
            IList <Tuple <string, string> > tagged = new List <Tuple <string, string> >();

            for (int i = 0; i < tokens.Length; i++)
            {
                tagged.Add(Tuple.Create(tokens[i], taggedList[i]));
            }
            return(GetFilteredTokens(tagged));
        }
Example #15
 private static EnglishMaximumEntropyPosTagger GetTagger()
 {
     if (englishMaximumEntropyPosTagger == null)
     {
         if (!File.Exists(ENGLISH_POS_DICTIONARY))
         {
             throw new DexterRuntimeException("Cannot perform naming analysis. Dictionary not found in directory: " +
                                              ENGLISH_POS_DICTIONARY);
         }
         englishMaximumEntropyPosTagger = new EnglishMaximumEntropyPosTagger(ENGLISH_POS_DICTIONARY);
     }
     return(englishMaximumEntropyPosTagger);
 }
Example #16
        public static IList <Tuple <string, string> > GetPosTaggedTokens(string sentence)
        {
            var posTagger =
                new EnglishMaximumEntropyPosTagger(Path.Combine(ModelPath, "EnglishPOS.nbin"), Path.Combine(ModelPath, "Parser", "tagdict"));
            var tokens     = Tokenizer.Tokenize(sentence);
            var taggedList = posTagger.Tag(tokens);
            IList <Tuple <string, string> > tagged = new List <Tuple <string, string> >();

            for (int i = 0; i < tokens.Length; i++)
            {
                tagged.Add(Tuple.Create(tokens[i], taggedList[i]));
            }
            return(GetFilteredTokens(tagged));
        }
Example #17
        public Result <IEnumerable <string> > Filter(HashSet <string> partOfSpeechToFilter, IEnumerable <string> words)
        {
            var modelPath  = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data\", "EnglishPOS.nbin");
            var tagDictDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data\", "tagdict");
            var posTagger  = new EnglishMaximumEntropyPosTagger(modelPath, tagDictDir);
            var outList    = new List <string>();

            foreach (var word in words)
            {
                var speechPart = posTagger.Tag(new[] { word });
                if (!partOfSpeechToFilter.Contains(speechPart[0]))
                {
                    outList.Add(word);
                }
            }
            return(Result.Ok((IEnumerable <string>)outList));
        }
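Because a maximum-entropy tagger conditions on neighbouring tokens, tagging one word at a time as above gives it no sentence context. When the words come from a single sentence, a variant that tags the whole sequence in one call may be preferable (a sketch reusing the same EnglishMaximumEntropyPosTagger and Result APIs; the FilterWithContext name is illustrative):

        public Result <IEnumerable <string> > FilterWithContext(HashSet <string> partOfSpeechToFilter, IEnumerable <string> words)
        {
            var modelPath  = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data\", "EnglishPOS.nbin");
            var tagDictDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data\", "tagdict");
            var posTagger  = new EnglishMaximumEntropyPosTagger(modelPath, tagDictDir);

            var wordArray = words.ToArray();
            var tags      = posTagger.Tag(wordArray); // tag the whole sequence so neighbouring words provide context
            var outList   = new List <string>();

            for (int i = 0; i < wordArray.Length; i++)
            {
                if (!partOfSpeechToFilter.Contains(tags[i]))
                {
                    outList.Add(wordArray[i]);
                }
            }
            return(Result.Ok((IEnumerable <string>)outList));
        }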
Example #18
        // Default constructor
        public SemCluster(string DataFolder)
        {
            try
            {
                Console.WriteLine("\tSemCluster Text Analytics Tool");
                Console.WriteLine("\t------------------------------");
                Console.WriteLine("\t-Wikipedia local server couldn't be found!");
                Console.WriteLine("\t-Seeds SemAve is in manual mode!");
                Console.WriteLine();
                Console.WriteLine();
                Console.WriteLine("-> Resources loading ...");
                Console.WriteLine();

                #region Loading External Resources
                _wn               = new WordNetEngine(DataFolder + "WordNet", InMemoryWordNet);
                _tokenizer        = new EnglishRuleBasedTokenizer(TokenizeHyphen);
                _sentenceDetector = new EnglishMaximumEntropySentenceDetector(DataFolder + "EnglishSD.nbin");
                _posTagger        = new EnglishMaximumEntropyPosTagger(DataFolder + "EnglishPOS.nbin", DataFolder + "\\Build\\tagdict");
                _chunker          = new EnglishTreebankChunker(DataFolder + "EnglishChunk.nbin");
                #endregion

                PlugInsManager(DataFolder);

                Console.WriteLine("\tResources loaded successfully");
                Console.WriteLine("\t" + PlugInsNumber + " KB plug-ins found in the repository");
                Console.WriteLine("\tPress any key to continue ...");
                Console.ReadKey();
                Console.WriteLine();

                RootVirtualNode = _wn.GetSynSet("Noun:1740");
                ap = new AffinityPropagationClustering();

                SynSetRelationTypes    = new WordNetApi.Core.WordNetEngine.SynSetRelation[2];
                SynSetRelationTypes[0] = WordNetApi.Core.WordNetEngine.SynSetRelation.Hypernym;
                SynSetRelationTypes[1] = WordNetApi.Core.WordNetEngine.SynSetRelation.InstanceHypernym;
            }
            catch (Exception ex)
            {
                Dispose();
                throw new Exception(ex.Message, ex);
            }
        }
Example #19
        public List <String> ParseTweet(String tweetText)
        {
            List <String> result = new List <string>();

            tweetText = tweetText.Replace("i ", "I ");
            EnglishMaximumEntropyTokenizer tokenizer = new EnglishMaximumEntropyTokenizer(TOK_MODEL_PATH);

            string[] tokens    = tokenizer.Tokenize(tweetText);
            var      posTagger = new EnglishMaximumEntropyPosTagger(POS_MODEL_PATH, TAG_DICT_PATH);

            string[] wordTags = posTagger.Tag(tokens);
            for (int i = 0; i < tokens.Length; i++)
            {
                if (tokens[i].StartsWith("@"))
                {
                    continue;
                }
                else if (Uri.IsWellFormedUriString(tokens[i], UriKind.Absolute))
                {
                    continue;
                }
                else if (tokens[i].Length <= 1 && !int.TryParse(tokens[i], out int n))
                {
                    continue;
                }
                else
                {
                    String token = Regex.Replace(tokens[i], pattern, "");
                    if (ALLOWED_TAGS.Contains(wordTags[i]) && token.Length > 0)
                    {
                        result.Add(token);
                    }
                }
            }
            return(result);
        }
Example #20
        public static string[] TagPartsOfSpeech(this string[] tokens)
        {
            var posTagger = new EnglishMaximumEntropyPosTagger(_englishPOSPath, _englishTagDictPath);

            return(posTagger.Tag(tokens));
        }
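Being an extension method on string[], it reads naturally at the end of a tokenization step (usage sketch; the tokenizer variable and sentence are illustrative):

            string[] tokens = tokenizer.Tokenize("She gave up smoking last year.");
            string[] tags   = tokens.TagPartsOfSpeech();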
Example #21
        static void Main(string[] args)
        {
            // parse phrasal verb on usingEnglish.com

            /*var usingEnglishParser = new UsingEnglishParser();
             * var allPhrasalVerbs = usingEnglishParser.ParseAllPhrasalVerbs();
             * Console.Write("Parsed {0} phrasal verbs on using english", allPhrasalVerbs);*/

            // Persist phrasal verbs

            /*var phrasalVerbFilePath = PathToApplication + "Resources/phrasalVerbs";
             * PersistPhrasalVerbs(allPhrasalVerbs, phrasalVerbFilePath);
             * Console.WriteLine("Phrasal verbs persisted");*/

            // Lemmatizer
            var fullPathToSrcData = PathToApplication + "Resources/lemmatizer/en_lemmatizer_data.lem";
            var stream            = File.OpenRead(fullPathToSrcData);
            var lemmatizer        = new Lemmatizer(stream);

            // load phrasal verbs & examples
            var phrasalVerbFilePath = PathToApplication + "Resources/phrasalVerbs";
            var phrasalVerbs        = ReadFleexPhrasalVerbs()
                                      .Where(pv => !pv.IsMissingOnWordreference && pv.Name != "go to")
                                      .ToList();
            //var phrasalVerbs = ReadPhrasalVerbs(phrasalVerbFilePath);

            //

            /*var results = new List<Tuple<PhrasalVerb, List<TypedDependency>>>();
             * foreach (var phrasalVerb in phrasalVerbs)
             * {
             *  foreach (var usage in phrasalVerb.Usages)
             *  {
             *      var example = LowerCaseAllUpperCasedWords(usage.Example);
             *
             *      var parse = GetParser().DoParse(example);
             *      var particle1 = phrasalVerb.Particle1;
             *
             *      var deps = ParseBasedPhrasalVerbDetector.ComputeDependencies(parse);
             *      var relDeps = deps.Where(d => (d.Gov().GetWord() == particle1 && d.Gov().Index() < d.Dep().Index())
             *                                    || (d.Dep().GetWord() == particle1 && d.Dep().Index() < d.Gov().Index()))
             *                        .ToList();
             *      results.Add(new Tuple<PhrasalVerb, List<TypedDependency>>(phrasalVerb, relDeps));
             *      /*if (relDeps.Count > 1)
             *      {
             *          Console.WriteLine("{0}|{1}", phrasalVerb.Name, example);
             *      }#1#
             *  }
             * }
             * var groups = results.Where(tup => tup.Item2.Count == 1).Select(tup => tup.Item2.First().Reln()).GroupBy(s => s);
             * foreach (var g in groups)
             * {
             *  Console.WriteLine("{0} -> {1} occurrences", g.Key, g.Count());
             *  Console.WriteLine("---");
             * }*/


            //
            var tokenizerModelPaths = PathToApplication + "Resources/OpenNlp/Models/EnglishTok.nbin";
            var tokenizer           = new EnglishMaximumEntropyTokenizer(tokenizerModelPaths);
            var englishPosPath      = PathToApplication + "Resources/OpenNlp/Models/EnglishPOS.nbin";
            var tagDictPath         = PathToApplication + "Resources/OpenNlp/Models/Parser/tagdict";
            var tagger             = new EnglishMaximumEntropyPosTagger(englishPosPath, tagDictPath);
            var basicDetector      = new BasicPhrasalVerbDetector(tokenizer, lemmatizer);
            var parseBasedDetector = new ParseBasedPhrasalVerbDetector(GetParser(), lemmatizer, tokenizer, tagger);

            var pathToManuallyValidatedPhrasalVerbs   = PathToApplication + "Resources/manual/good.txt";
            var pathToManuallyUnvalidatedPhrasalVerbs = PathToApplication + "Resources/manual/bad.txt";


            /*var sent = "And because fertility rates fell across that very same period that life expectancy was going up, that pyramid that has always represented the distribution of age in the population, with many young ones at the bottom winnowed to a tiny peak of older people who make it and survive to old age is being reshaped into a rectangle.";
             * var pvs = parseBasedDetector.MatchingPhrasalVerbs(sent, phrasalVerbs.ConvertAll(pv => (PhrasalVerb)pv));*/

            // missing pv detections
            var manuallyValidatedExamples = File.ReadAllLines(pathToManuallyValidatedPhrasalVerbs)
                                            .Where(line => phrasalVerbs.Select(pv => pv.Name).Contains(line.Split('|').First()))
                                            .ToList();

            Console.WriteLine("Phrasal verbs not detected:");
            var notDetected = new List <Tuple <string, string> >();

            foreach (var example in manuallyValidatedExamples)
            {
                var sentence    = example.Split('|').Last();
                var phrasalVerb = example.Split('|').First();
                var matchingPvs = parseBasedDetector.MatchingPhrasalVerbs(sentence, phrasalVerbs.ConvertAll(pv => (PhrasalVerb)pv));
                if (!matchingPvs.Any(p => p.Name == phrasalVerb))
                {
                    notDetected.Add(new Tuple <string, string>(sentence, phrasalVerb));
                }
            }
            Console.WriteLine("{0}% phrasal verbs not detected", (float)(notDetected.Count * 100) / manuallyValidatedExamples.Count());
            foreach (var tuple in notDetected)
            {
                Console.WriteLine("{0}; {1}", tuple.Item2, tuple.Item1);
            }
            Console.WriteLine("----------");

            // false positive detection
            var manuallyUnvalidatedExamples = File.ReadAllLines(pathToManuallyUnvalidatedPhrasalVerbs)
                                              .Where(line => phrasalVerbs.Select(pv => pv.Name).Contains(line.Split('|').First()))
                                              .ToList();

            Console.WriteLine("Wrongly detected PV ");
            var wronglyDetected = new List <Tuple <string, string> >();

            foreach (var example in manuallyUnvalidatedExamples)
            {
                var sentence    = example.Split('|').Last();
                var phrasalVerb = example.Split('|').First();
                var matchingPvs = parseBasedDetector.MatchingPhrasalVerbs(sentence, phrasalVerbs.ConvertAll(pv => (PhrasalVerb)pv));
                if (matchingPvs.Any(p => p.Name == phrasalVerb))
                {
                    wronglyDetected.Add(new Tuple <string, string>(sentence, phrasalVerb));
                }
            }
            Console.WriteLine("{0}% of wrongly detected examples:", (float)(wronglyDetected.Count * 100) / manuallyUnvalidatedExamples.Count());
            foreach (var tuple in wronglyDetected)
            {
                Console.WriteLine("'{0}'; {1}", tuple.Item2, tuple.Item1);
            }
            Console.WriteLine("----------");


            // manual input for loosely detected phrasal verb

            /*var pathToSentenceFile = PathToApplication + "Resources/fleex_sentences.txt";
             * var sentences = File.ReadAllLines(pathToSentenceFile);
             * foreach (var sentence in sentences)
             * {
             *  // detect all other phrasal verbs
             *  foreach (var pv in phrasalVerbs)
             *  {
             *      var isMatch = basicDetector.IsMatch(sentence, pv);
             *      if (isMatch)
             *      {
             *          var capitalizedRoot = Regex.Replace(sentence, "\\b" + pv.Root, pv.Root.ToUpper());
             *          var capitalizedParticle = Regex.Replace(capitalizedRoot, "\\b" + pv.Particle1 + "\\b", pv.Particle1.ToUpper());
             *          Console.WriteLine("{0} --> '{1}'; 'y' for OK, n otherwise", pv.Name, capitalizedParticle);
             *          var key = Console.ReadKey();
             *          while (key.KeyChar != 'y' && key.KeyChar != 'n')
             *          {
             *              Console.WriteLine("'y' / 'n' only");
             *              key = Console.ReadKey();
             *          }
             *          Console.WriteLine();
             *          string filePathToWrite = key.KeyChar == 'y'
             *              ? pathToManuallyValidatedPhrasalVerbs
             *              : pathToManuallyUnvalidatedPhrasalVerbs;
             *          using (var writer = new StreamWriter(filePathToWrite, true))
             *          {
             *              writer.WriteLine("{0}|{1}", pv.Name, sentence);
             *          }
             *      }
             *  }
             * }*/


            /*// persisting list of phrasal verbs
             * Console.WriteLine("============");
             * Console.WriteLine("Persisting phrasal verbs to {0}", phrasalVerbFilePath);
             * PersistPhrasalVerbs(phrasalVerbs, phrasalVerbFilePath);
             * Console.WriteLine("Persisted phrasal verbs");*/

            // persisting examples

            /*var pvExamplesFilePath = PathToApplication + "Resources/phrasalVerbsExamples.txt";
             * PersistPhrasalVerbsAndExamples(phrasalVerbs, pvExamplesFilePath);
             * Console.WriteLine("Persisted examples");
             * Console.WriteLine("-------------------------");*/

            // stats on usingenglish phrasal verbs

            /*var verbs = ReadPhrasalVerbs(phrasalVerbFilePath);
             * var nbOfSeparableVerbs = verbs.Count(v => v.Usages.All(u => u.SeparableMandatory));
             * var nbOfInseparableVerbs = verbs.Count(v => v.Usages.All(u => u.Inseparable));
             * Console.WriteLine("{0} separable verbs", nbOfSeparableVerbs);
             * Console.WriteLine("{0} inseparable verbs", nbOfInseparableVerbs);
             * Console.WriteLine("{0} verbs", verbs.Count);*/

            // write a file with examples of phrasal verbs

            /*var pathToOutputFile = PathToApplication + "Resources/phrasalVerbExamples.txt";
             * var verbs = ReadPhrasalVerbs(phrasalVerbFilePath);
             * var lines =
             *  verbs.SelectMany(v => v.Usages.Select(u => new {Usage = u, v.Name}))
             *      .Select(a => string.Format("{0}|{1}", a.Name, a.Usage.Example));
             * foreach (var line in lines)
             * {
             *  Console.WriteLine(line);
             * File.WriteAllLines(pathToOutputFile, lines);*/

            Console.WriteLine("=== END ===");
            Console.ReadKey();
        }