示例#1
0
        /// <summary>
        /// Tokenizes a sentence and spell-corrects every token recognized by
        /// <see cref="Program.Applywords"/>, then glues the tokens back together
        /// (word tokens get a trailing space, other tokens are concatenated directly).
        /// </summary>
        /// <param name="sent">The raw input sentence.</param>
        /// <returns>The sentence rebuilt from the (possibly corrected) tokens.</returns>
        public string CorrectSentence(string sent)
        {
            var modelPath = "EnglishTok.nbin";
            var tokenizer = new EnglishMaximumEntropyTokenizer(modelPath);
            // NOTE: the sentence detector built here previously was never used,
            // so the expensive EnglishSD.nbin load has been removed.

            var correct = new List<string>();
            foreach (var token in tokenizer.Tokenize(sent))
            {
                // Only "word" tokens (per Applywords) go through the corrector;
                // punctuation and other tokens are kept verbatim.
                correct.Add(Program.Applywords(token) ? Correct(token) : token);
            }

            // Pure concatenation — the original mutated the lambda parameter
            // (str += " ") as a side effect inside Select, which works but is
            // easy to misread; this builds the same string without mutation.
            string res = string.Concat(correct.Select(str => Program.Applywords(str) ? str + " " : str));

            return res;
        }
示例#2
0
        /// <summary>
        /// Loads all the necessary resources for the library: the OpenNLP model
        /// files (sentence detector, tokenizer, POS tagger) plus the word lists
        /// used by the analysis code. Sets <c>isLibraryLoaded</c> to reflect the outcome.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown, with the original failure as inner exception, when any resource
        /// fails to load. (Previously a bare <see cref="Exception"/>; existing
        /// <c>catch (Exception)</c> callers still catch this.)
        /// </exception>
        public void LoadLibrary()
        {
            try
            {
                HelperMethods.CreateResourceInFileSystem("EnglishSD.nbin");
                HelperMethods.CreateResourceInFileSystem("EnglishTok.nbin");
                HelperMethods.CreateResourceInFileSystem("EnglishPOS.nbin");

                sentenceDetector = new EnglishMaximumEntropySentenceDetector("EnglishSD.nbin");
                tokenizer        = new EnglishMaximumEntropyTokenizer("EnglishTok.nbin");
                posTagger        = new EnglishMaximumEntropyPosTagger("EnglishPOS.nbin");

                positive_words   = HelperMethods.Import_PositiveWords();  //data of positive words
                negative_words   = HelperMethods.Import_NegativeWords();  //data of negative words
                emotion_words    = HelperMethods.Import_EmotionWords();   //data of emotion words with 5 expression values
                inclusion_values = HelperMethods.Import_InclusiveWords(); //data of inclusion with rate
                exclusion_values = HelperMethods.Import_ExclusionWords(); //data of exclusion with rate

                isLibraryLoaded = true;
            }
            catch (Exception e)
            {
                // Record the failed state before surfacing the error so callers
                // can also consult the flag after catching.
                isLibraryLoaded = false;
                throw new InvalidOperationException("Error in library load state", e);
            }
        }
 /// <summary>
 /// Builds a parse-based detector wired with the NLP components it relies on.
 /// </summary>
 public ParseBasedPhrasalVerbDetector(EnglishTreebankParser parser, Lemmatizer lemmatizer,
                                      EnglishMaximumEntropyTokenizer tokenizer, EnglishMaximumEntropyPosTagger tagger)
 {
     // Plain dependency capture — assignments are independent of each other.
     this.tagger     = tagger;
     this.tokenizer  = tokenizer;
     this.lemmatizer = lemmatizer;
     this.parser     = parser;
 }
示例#4
0
 /// <summary>
 /// Instantiates every OpenNLP component from the model files found under ModelDir.
 /// </summary>
 private void initComponents()
 {
     // Small helper keeps each model path expression short and uniform.
     string Model(string file) => Path.Combine(ModelDir, file);

     sentenceDetector = new EnglishMaximumEntropySentenceDetector(Model("EnglishSD.nbin"));
     tokenizer        = new EnglishMaximumEntropyTokenizer(Model("EnglishTok.nbin"));
     posTagger        = new EnglishMaximumEntropyPosTagger(Model("EnglishPOS.nbin"));
     chunker          = new EnglishTreebankChunker(Model("EnglishChunk.nbin"));
     parser           = new EnglishTreebankParser(FileUtils.WithSeparator(ModelDir), true, false);
 }
示例#5
0
        /// <summary>
        /// Tokenizes a sentence with the English maximum-entropy model shipped
        /// next to the executable (Models\EnglishTok.nbin).
        /// </summary>
        /// <param name="sentence">Text to split into tokens.</param>
        /// <returns>The ordered array of tokens.</returns>
        private string[] MyTokenizer(string sentence)
        {
            // Resolve the model relative to the executable (not the current
            // working directory); Path.Combine replaces the hand-built
            // @"\Models\..." concatenation.
            var exeDir    = Path.GetDirectoryName(Process.GetCurrentProcess().MainModule.FileName);
            var modelPath = Path.Combine(exeDir, "Models", "EnglishTok.nbin");

            var tokenizer = new EnglishMaximumEntropyTokenizer(modelPath);

            return tokenizer.Tokenize(sentence);
        }
示例#6
0
        /// <summary>
        /// Tokenizes a sentence, lazily creating the shared tokenizer on first use.
        /// </summary>
        /// <param name="sentence">Text to split into tokens.</param>
        /// <returns>The ordered array of tokens.</returns>
        public string[] TokenizeSentence(string sentence)
        {
            if (mTokenizer == null)
            {
                // Path.Combine copes with mModelPath both with and without a
                // trailing separator (the raw @"\" concatenation did not).
                mTokenizer = new EnglishMaximumEntropyTokenizer(Path.Combine(mModelPath, "EnglishTok.nbin"));
            }

            return mTokenizer.Tokenize(sentence);
        }
示例#7
0
        /// <summary>
        /// Tokenizes a sentence, lazily creating the shared tokenizer on first use.
        /// </summary>
        /// <param name="sentence">Text to split into tokens.</param>
        /// <returns>The ordered array of tokens.</returns>
        internal string[] TokenizeSentence(string sentence)
        {
            if (_mTokenizer == null)
            {
                // Path.Combine inserts the separator the raw concatenation
                // silently required _modelPath to already carry.
                _mTokenizer = new EnglishMaximumEntropyTokenizer(Path.Combine(_modelPath, "EnglishTok.nbin"));
            }

            return _mTokenizer.Tokenize(sentence);
        }
示例#8
0
        /// <summary>
        /// Tokenizes a sentence, lazily creating the shared static tokenizer on first use.
        /// </summary>
        /// <param name="sentence">Text to split into tokens.</param>
        /// <returns>The ordered array of tokens.</returns>
        public static string[] TokenizeSentence(string sentence)
        {
            if (mTokenizer == null)
            {
                // Path.Combine inserts the separator the raw concatenation
                // silently required mModelPath to already carry.
                mTokenizer = new OpenNLP.Tools.Tokenize.EnglishMaximumEntropyTokenizer(
                    System.IO.Path.Combine(mModelPath, "EnglishTok.nbin"));
            }

            return mTokenizer.Tokenize(sentence);
        }
示例#9
0
        /// <summary>
        /// Creates the OpenNLP components used by the tests from the model files
        /// under the test directory's Resources\Models folder.
        /// </summary>
        public void Setup()
        {
            var modelsDir = Path.Combine(TestContext.CurrentContext.TestDirectory, @"..\..\..\Resources\Models\");

            // Assignments are independent; ordering is cosmetic.
            sentenceDetector = new EnglishMaximumEntropySentenceDetector(Path.Combine(modelsDir, "EnglishSD.nbin"));
            tokenizer        = new EnglishMaximumEntropyTokenizer(Path.Combine(modelsDir, "EnglishTok.nbin"));
            chunker          = new EnglishTreebankChunker(Path.Combine(modelsDir, @"EnglishChunk.nbin"));
            postTagger       = new EnglishMaximumEntropyPosTagger(
                Path.Combine(modelsDir, @"EnglishPOS.nbin"),
                Path.Combine(modelsDir, @"Parser\tagdict"));
        }
示例#10
0
        /// <summary>
        /// A tokenizer breaks a text into words, symbols or other meaningful elements.
        /// The historical tokenizers are based on the maxent algorithm.
        /// </summary>
        /// <param name="txt">Text to tokenize.</param>
        /// <param name="path">Directory containing the English model files.</param>
        /// <returns>a set of tokens</returns>
        public static string[] Tokenizer(this string txt, string path)
        {
            // English models must be downloaded separately.
            // Requires `using OpenNLP.Tools.Tokenize;`.
            // Path.Combine replaces the raw concatenation, which silently
            // required `path` to end with a separator.
            var modelPath = Path.Combine(path, "EnglishTok.nbin");
            var tokenizer = new EnglishMaximumEntropyTokenizer(modelPath);

            return tokenizer.Tokenize(txt);
        }
        /// <summary>
        /// Initializes the processor by loading the three English OpenNLP models
        /// located under the given root path, then flags the processor as ready.
        /// </summary>
        /// <param name="rootPath">The root path</param>
        private static void Initialize(string rootPath)
        {
            // Paths are built inline; each model file sits directly under rootPath.
            _tokenizer        = new EnglishMaximumEntropyTokenizer(Path.Combine(rootPath, "EnglishTok.nbin"));
            _sentenceDetector = new EnglishMaximumEntropySentenceDetector(Path.Combine(rootPath, "EnglishSD.nbin"));
            _posTagger        = new EnglishMaximumEntropyPosTagger(Path.Combine(rootPath, "EnglishPOS.nbin"));
            _isInitialized    = true;
        }
示例#12
0
        /// <summary>
        /// Reads a text file, splits it into sentences and returns all tokens
        /// produced by the English maximum-entropy tokenizer, in order.
        /// </summary>
        /// <param name="path">Path of the text file to process.</param>
        /// <returns>Every token of every detected sentence.</returns>
        public static List<string> WorkFile(string path)
        {
            var tokenizer    = new EnglishMaximumEntropyTokenizer("EnglishTok.nbin");
            var sentDetector = new EnglishMaximumEntropySentenceDetector("EnglishSD.nbin");

            // File.ReadAllText replaces the StreamReader + redundant Close()
            // inside `using`, and still guarantees the handle is released.
            var text = File.ReadAllText(path);

            var tokens = new List<string>();
            foreach (var sentence in sentDetector.SentenceDetect(text))
            {
                tokens.AddRange(tokenizer.Tokenize(sentence));
            }
            return tokens;
        }
示例#13
0
        /// <summary>
        /// Extracts the meaningful words from a tweet: tokenizes it, POS-tags the
        /// tokens and keeps only those whose tag is in ALLOWED_TAGS, skipping
        /// @mentions, URLs and single non-digit characters.
        /// </summary>
        /// <param name="tweetText">Raw tweet text.</param>
        /// <returns>The filtered, cleaned-up tokens.</returns>
        public List <String> ParseTweet(String tweetText)
        {
            List <String> result = new List <string>();

            // Normalize the lone lowercase pronoun so the tagger sees "I ".
            tweetText = tweetText.Replace("i ", "I ");
            EnglishMaximumEntropyTokenizer tokenizer = new EnglishMaximumEntropyTokenizer(TOK_MODEL_PATH);

            string[] tokens    = tokenizer.Tokenize(tweetText);
            var      posTagger = new EnglishMaximumEntropyPosTagger(POS_MODEL_PATH, TAG_DICT_PATH);

            string[] wordTags = posTagger.Tag(tokens);
            for (int i = 0; i < tokens.Length; i++)
            {
                if (tokens[i].StartsWith("@"))
                {
                    // @mention — skip.
                    continue;
                }
                else if (Uri.IsWellFormedUriString(tokens[i], UriKind.Absolute))
                {
                    // URL — skip.
                    continue;
                }
                else if (tokens[i].Length <= 1 && !int.TryParse(tokens[i], out _))
                {
                    // Single non-digit character (punctuation, stray letter) — skip.
                    // `out _` discards the parsed value the original bound to an
                    // unused local `n`.
                    continue;
                }
                else
                {
                    // Strip characters matched by `pattern` (defined elsewhere in
                    // this class) and keep the token only when its tag is allowed.
                    String token = Regex.Replace(tokens[i], pattern, "");
                    if (ALLOWED_TAGS.Contains(wordTags[i]) && token.Length > 0)
                    {
                        result.Add(token);
                    }
                }
            }
            return(result);
        }
示例#14
0
        /// <summary>
        /// Opens the SQLite database for the given category and prepares the
        /// import pipeline: database wrapper, reference extractor and the shared
        /// English tokenizer, then runs the initial commands.
        /// </summary>
        /// <param name="id">Category identifier; also used as the database file name.</param>
        public Import(int id)
        {
            _databasesDir = Path.Combine(Environment.GetFolderPath(
                                             Environment.SpecialFolder.ApplicationData), "YouWrite");
            _appDir     = Directory.GetCurrentDirectory();
            _categoryID = id;

            var dbName = Path.Combine(Environment.GetFolderPath(
                                          Environment.SpecialFolder.ApplicationData), "YouWrite", _categoryID + ".db");

            var source = new SQLiteConnection
                             ("Data Source=" + dbName + ";Version=3;New=False;Compress=True;");

            _database = new Databases(_appDir, _databasesDir);
            _database.SetConnection(source);
            _refsExtractor = new RefsExtractor(_database);
            // mModelPath keeps its trailing separator because other members
            // concatenate onto it directly.
            mModelPath     = _appDir + @"\";
            if (mTokenizer == null)
            {
                // Path.Combine avoids relying on mModelPath's trailing separator.
                mTokenizer = new EnglishMaximumEntropyTokenizer(Path.Combine(mModelPath, "EnglishTok.nbin"));
            }

            initialCommands();
        }
示例#15
0
 /// <summary>
 /// Creates an English tokenizer backed by the maxent model resolved for the
 /// English language.
 /// </summary>
 public EnglishTokenizer() : base(Language.English)
 {
     // Resolve the model once, remember it, then build the tokenizer from it.
     var path = Utility.GetModelPath(OpenNLPModel.Tokenizer, Language);

     modelPath = path;
     tok       = new EnglishMaximumEntropyTokenizer(path);
 }
 /// <summary>
 /// Builds a basic detector from the tokenizer and lemmatizer it relies on.
 /// </summary>
 public BasicPhrasalVerbDetector(EnglishMaximumEntropyTokenizer tokenizer, Lemmatizer lemmatizer)
 {
     // Plain dependency capture — assignment order is irrelevant.
     this.lemmatizer = lemmatizer;
     this.tokenizer  = tokenizer;
 }
示例#17
0
        /// <summary>
        /// Console entry point / scratch pad. The commented-out regions are
        /// earlier experiments (console redirection, tokenizer training,
        /// detokenization, sentence-detector training, tokenization tests);
        /// the live code parses one hard-coded sentence and prints its parse
        /// tree and typed dependencies.
        /// </summary>
        private static void Main(string[] args)
        {
            /*FileStream ostrm;
             * StreamWriter writer;
             * TextWriter oldOut = Console.Out;
             * try
             * {
             *  ostrm = new FileStream("C:\\Users\\Alexandre\\Desktop\\vs_output_2.txt", FileMode.OpenOrCreate, FileAccess.Write);
             *  writer = new StreamWriter(ostrm);
             * }
             * catch (Exception e)
             * {
             *  Console.WriteLine("Cannot open Redirect.txt for writing");
             *  Console.WriteLine(e.Message);
             *  return;
             * }
             * Console.SetOut(writer);*/


            /*// read file
             * var tokenizerTrainingFilePath = currentDirectory + "Input/tokenizer.train";
             * var outputFilePath = currentDirectory + "Output/EnglishTok.nbin";
             * MaximumEntropyTokenizer.Train(tokenizerTrainingFilePath, outputFilePath);*/

            // test detokenization

            /*var tokens = new List<string>() {"do", "n't", "commit"};
             * var detokenizer = new DictionaryDetokenizer();
             * var result = detokenizer.Detokenize(tokens.ToArray());
             * Console.WriteLine(result);*/

            /*// train model file
             * var inputFilePath = currentDirectory + "Input/sentences.train";
             * var outputFilePath = currentDirectory + "Output/" + Path.GetFileNameWithoutExtension(inputFilePath) + ".nbin";
             * var iterations = 100;
             * var cut = 5;
             * var endOfSentenceScanner = new CharactersSpecificEndOfSentenceScanner();
             * Console.WriteLine("Training model...");
             * var model = MaximumEntropySentenceDetector.TrainModel(inputFilePath, iterations, cut, endOfSentenceScanner);
             * Console.WriteLine("Writing output file '{0}'...", outputFilePath);
             * new BinaryGisModelWriter().Persist(model, outputFilePath);
             * Console.WriteLine("Output file written.");*/

            /*// tokenize tests
             * var modelPath = currentDirectory + "../Resources/Models/";
             * var tokenizer = new EnglishMaximumEntropyTokenizer(modelPath + "EnglishTok.nbin");
             *
             * var input = "It was built of a bright brick throughout; its skyline was fantastic, and even its ground plan was wild.";
             * var tokens = tokenizer.Tokenize(input);
             * Console.WriteLine(string.Join(" | ", tokens));*/


            // detect tokenization issues

            /*var pathToFile = currentDirectory + "Input/tokenizerIssues.txt";
             * var modelPath = currentDirectory + "../Resources/Models/";
             * var tokenizer = new EnglishMaximumEntropyTokenizer(modelPath + "EnglishTok.nbin");
             * var allLines = File.ReadAllLines(pathToFile);
             * foreach (var line in allLines)
             * {
             *  var tokens = tokenizer.Tokenize(line);
             *  Console.WriteLine(string.Join(" | ", tokens));
             * }*/

            // parsing
            // NOTE(review): currentDirectory is a member defined elsewhere in
            // this class — presumably the app's working folder; confirm there.
            var sentence  = "This is a generic bank response, which indicates simply that they are not willing to accept the transaction.";
            var tokenizer = new EnglishMaximumEntropyTokenizer(currentDirectory + "../Resources/Models/EnglishTok.nbin");
            var tokens    = tokenizer.Tokenize(sentence);
            var modelPath = currentDirectory + "../Resources/Models/";
            var parser    = new OpenNLP.Tools.Parser.EnglishTreebankParser(modelPath, true, false);
            var parse     = parser.DoParse(tokens);
            // Extract dependencies from lexical tree
            var tlp  = new PennTreebankLanguagePack();
            var gsf  = tlp.GrammaticalStructureFactory();
            var tree = new ParseTree(parse);

            Console.WriteLine(tree);
            var gs           = gsf.NewGrammaticalStructure(tree);
            var dependencies = gs.TypedDependencies();

            // One line per typed dependency (governor, relation, dependent).
            foreach (var dep in dependencies)
            {
                Console.WriteLine(dep);
            }

            Console.WriteLine("===========");
            Console.WriteLine("OK");
            Console.ReadKey();
        }
示例#18
0
        /// <summary>
        /// Console entry point for phrasal-verb detection experiments: loads a
        /// lemmatizer and the phrasal-verb list, builds basic and parse-based
        /// detectors, then measures (1) missed detections on manually validated
        /// examples and (2) false positives on manually invalidated ones.
        /// The commented-out regions are earlier one-off experiments kept for
        /// reference. NOTE(review): PathToApplication, GetParser and
        /// ReadFleexPhrasalVerbs are defined elsewhere in this class.
        /// </summary>
        static void Main(string[] args)
        {
            // parse phrasal verb on usingEnglish.com

            /*var usingEnglishParser = new UsingEnglishParser();
             * var allPhrasalVerbs = usingEnglishParser.ParseAllPhrasalVerbs();
             * Console.Write("Parsed {0} phrasal verbs on using english", allPhrasalVerbs);*/

            // Persist phrasal verbs

            /*var phrasalVerbFilePath = PathToApplication + "Resources/phrasalVerbs";
             * PersistPhrasalVerbs(allPhrasalVerbs, phrasalVerbFilePath);
             * Console.WriteLine("Phrasal verbs persisted");*/

            // Lemmatizer
            var fullPathToSrcData = PathToApplication + "Resources/lemmatizer/en_lemmatizer_data.lem";
            var stream            = File.OpenRead(fullPathToSrcData);
            var lemmatizer        = new Lemmatizer(stream);

            // load phrasal verbs & examples
            // "go to" is excluded explicitly; wordreference-missing entries are dropped.
            var phrasalVerbFilePath = PathToApplication + "Resources/phrasalVerbs";
            var phrasalVerbs        = ReadFleexPhrasalVerbs()
                                      .Where(pv => !pv.IsMissingOnWordreference && pv.Name != "go to")
                                      .ToList();
            //var phrasalVerbs = ReadPhrasalVerbs(phrasalVerbFilePath);

            //

            /*var results = new List<Tuple<PhrasalVerb, List<TypedDependency>>>();
             * foreach (var phrasalVerb in phrasalVerbs)
             * {
             *  foreach (var usage in phrasalVerb.Usages)
             *  {
             *      var example = LowerCaseAllUpperCasedWords(usage.Example);
             *
             *      var parse = GetParser().DoParse(example);
             *      var particle1 = phrasalVerb.Particle1;
             *
             *      var deps = ParseBasedPhrasalVerbDetector.ComputeDependencies(parse);
             *      var relDeps = deps.Where(d => (d.Gov().GetWord() == particle1 && d.Gov().Index() < d.Dep().Index())
             || (d.Dep().GetWord() == particle1 && d.Dep().Index() < d.Gov().Index()))
             ||                                   .ToList();
             ||     results.Add(new Tuple<PhrasalVerb, List<TypedDependency>>(phrasalVerb, relDeps));
             ||     /*if (relDeps.Count > 1)
             ||     {
             ||         Console.WriteLine("{0}|{1}", phrasalVerb.Name, example);
             ||     }#1#
             || }
             ||}
             ||var groups = results.Where(tup => tup.Item2.Count == 1).Select(tup => tup.Item2.First().Reln()).GroupBy(s => s);
             ||foreach (var g in groups)
             ||{
             || Console.WriteLine("{0} -> {1} occurences", g.Key, g.Count());
             || Console.WriteLine("---");
             ||}*/


            // Build the two detectors from the OpenNLP tokenizer/tagger models.
            var tokenizerModelPaths = PathToApplication + "Resources/OpenNlp/Models/EnglishTok.nbin";
            var tokenizer           = new EnglishMaximumEntropyTokenizer(tokenizerModelPaths);
            var englishPosPath      = PathToApplication + "Resources/OpenNlp/Models/EnglishPOS.nbin";
            var tagDictPath         = PathToApplication + "Resources/OpenNlp/Models/Parser/tagdict";
            var tagger             = new EnglishMaximumEntropyPosTagger(englishPosPath, tagDictPath);
            var basicDetector      = new BasicPhrasalVerbDetector(tokenizer, lemmatizer);
            var parseBasedDetector = new ParseBasedPhrasalVerbDetector(GetParser(), lemmatizer, tokenizer, tagger);

            // Ground-truth files: "good" = confirmed matches, "bad" = confirmed non-matches.
            var pathToManuallyValidatedPhrasalVerbs   = PathToApplication + "Resources/manual/good.txt";
            var pathToManuallyUnvalidatedPhrasalVerbs = PathToApplication + "Resources/manual/bad.txt";


            /*var sent = "And because fertility rates fell across that very same period that life expectancy was going up, that pyramid that has always represented the distribution of age in the population, with many young ones at the bottom winnowed to a tiny peak of older people who make it and survive to old age is being reshaped into a rectangle.";
             * var pvs = parseBasedDetector.MatchingPhrasalVerbs(sent, phrasalVerbs.ConvertAll(pv => (PhrasalVerb)pv));*/

            // missing pv detections
            // Each line is "phrasalVerb|example sentence"; keep only lines whose
            // verb is in the loaded phrasal-verb list.
            var manuallyValidatedExamples = File.ReadAllLines(pathToManuallyValidatedPhrasalVerbs)
                                            .Where(line => phrasalVerbs.Select(pv => pv.Name).Contains(line.Split('|').First()))
                                            .ToList();

            Console.WriteLine("Phrasal verbs not detected:");
            var notDetected = new List <Tuple <string, string> >();

            foreach (var example in manuallyValidatedExamples)
            {
                var sentence    = example.Split('|').Last();
                var phrasalVerb = example.Split('|').First();
                var matchingPvs = parseBasedDetector.MatchingPhrasalVerbs(sentence, phrasalVerbs.ConvertAll(pv => (PhrasalVerb)pv));
                if (!matchingPvs.Any(p => p.Name == phrasalVerb))
                {
                    notDetected.Add(new Tuple <string, string>(sentence, phrasalVerb));
                }
            }
            Console.WriteLine("{0}% phrasal verbs not detected", (float)(notDetected.Count * 100) / manuallyValidatedExamples.Count());
            foreach (var tuple in notDetected)
            {
                Console.WriteLine("{0}; {1}", tuple.Item2, tuple.Item1);
            }
            Console.WriteLine("----------");

            // false positive detection
            var manuallyUnvalidatedExamples = File.ReadAllLines(pathToManuallyUnvalidatedPhrasalVerbs)
                                              .Where(line => phrasalVerbs.Select(pv => pv.Name).Contains(line.Split('|').First()))
                                              .ToList();

            Console.WriteLine("Wrongly detected PV ");
            var wronglyDetected = new List <Tuple <string, string> >();

            foreach (var example in manuallyUnvalidatedExamples)
            {
                var sentence    = example.Split('|').Last();
                var phrasalVerb = example.Split('|').First();
                var matchingPvs = parseBasedDetector.MatchingPhrasalVerbs(sentence, phrasalVerbs.ConvertAll(pv => (PhrasalVerb)pv));
                if (matchingPvs.Any(p => p.Name == phrasalVerb))
                {
                    wronglyDetected.Add(new Tuple <string, string>(sentence, phrasalVerb));
                }
            }
            Console.WriteLine("{0}% of wrongly detected examples:", (float)(wronglyDetected.Count * 100) / manuallyUnvalidatedExamples.Count());
            foreach (var tuple in wronglyDetected)
            {
                Console.WriteLine("'{0}'; {1}", tuple.Item2, tuple.Item1);
            }
            Console.WriteLine("----------");


            // manual input for loosely detected phrasal verb

            /*var pathToSentenceFile = PathToApplication + "Resources/fleex_sentences.txt";
             * var sentences = File.ReadAllLines(pathToSentenceFile);
             * foreach (var sentence in sentences)
             * {
             *  // detect all other phrasal verbs
             *  foreach (var pv in phrasalVerbs)
             *  {
             *      var isMatch = basicDetector.IsMatch(sentence, pv);
             *      if (isMatch)
             *      {
             *          var capitalizedRoot = Regex.Replace(sentence, "\\b" + pv.Root, pv.Root.ToUpper());
             *          var capitalizedParticle = Regex.Replace(capitalizedRoot, "\\b" + pv.Particle1 + "\\b", pv.Particle1.ToUpper());
             *          Console.WriteLine("{0} --> '{1}'; 'y' for OK, n otherwise", pv.Name, capitalizedParticle);
             *          var key = Console.ReadKey();
             *          while (key.KeyChar != 'y' && key.KeyChar != 'n')
             *          {
             *              Console.WriteLine("'y' / 'n' only");
             *              key = Console.ReadKey();
             *          }
             *          Console.WriteLine();
             *          string filePathToWrite = key.KeyChar == 'y'
             *              ? pathToManuallyValidatedPhrasalVerbs
             *              : pathToManuallyUnvalidatedPhrasalVerbs;
             *          using (var writer = new StreamWriter(filePathToWrite, true))
             *          {
             *              writer.WriteLine("{0}|{1}", pv.Name, sentence);
             *          }
             *      }
             *  }
             * }*/


            /*// persisting list of phrasal verbs
             * Console.WriteLine("============");
             * Console.WriteLine("Persisting phrasal verbs to {0}", phrasalVerbFilePath);
             * PersistPhrasalVerbs(phrasalVerbs, phrasalVerbFilePath);
             * Console.WriteLine("Persisted phrasal verbs");*/

            // persisting examples

            /*var pvExamplesFilePath = PathToApplication + "Resources/phrasalVerbsExamples.txt";
             * PersistPhrasalVerbsAndExamples(phrasalVerbs, pvExamplesFilePath);
             * Console.WriteLine("Persisted examples");
             * Console.WriteLine("-------------------------");*/

            // stats on usingenglish phrasal verbs

            /*var verbs = ReadPhrasalVerbs(phrasalVerbFilePath);
             * var nbOfSeparableVerbs = verbs.Count(v => v.Usages.All(u => u.SeparableMandatory));
             * var nbOfInseparableVerbs = verbs.Count(v => v.Usages.All(u => u.Inseparable));
             * Console.WriteLine("{0} separable verbs", nbOfSeparableVerbs);
             * Console.WriteLine("{0} inseparable verbs", nbOfInseparableVerbs);
             * Console.WriteLine("{0} verbs", verbs.Count);*/

            // write a file with examples of phrasal verbs

            /*var pathToOutputFile = PathToApplication + "Resources/phrasalVerbExamples.txt";
             * var verbs = ReadPhrasalVerbs(phrasalVerbFilePath);
             * var lines =
             *  verbs.SelectMany(v => v.Usages.Select(u => new {Usage = u, v.Name}))
             *      .Select(a => string.Format("{0}|{1}", a.Name, a.Usage.Example));
             * foreach (var line in lines)
             * {
             *  Console.WriteLine(line);
             * File.WriteAllLines(pathToOutputFile, lines);*/

            Console.WriteLine("=== END ===");
            Console.ReadKey();
        }
示例#19
0
 /// <summary>
 /// Wires up the English tokenizer and sentence detector from their model paths.
 /// </summary>
 public Tokenizer()
 {
     // Independent assignments; construction order is cosmetic.
     _tokenizer        = new EnglishMaximumEntropyTokenizer(modelTokenPath);
     _sentenceDetector = new EnglishMaximumEntropySentenceDetector(modelPath);
 }