Example #1
        // [Explicit-Dispose]
        public void Dispose()
        {
            // Dispose the WordNet engine first, then drop the reference
            _wn?.Dispose();
            _wn = null;

            _tokenizer        = null;
            _sentenceDetector = null;
            _posTagger        = null;
            _chunker          = null;

            // Release the C++/CLI clustering DLL wrapper
            ap = null;

            // Release all KB plug-in references
            for (int i = 0; i < PlugInsNumber; i++)
            {
                KBDrivers[i] = null;
                KBDriversQueryPointers[i] = null;
            }
        }
Example #2
        public TextSentenceSplitter()
        {
            // Extract the embedded sentence-detection model to a file on disk
            var fullPath = Path.Combine(GetBinariesPath(), "Data", "EnglishSD.nbin");

            SaveResourceToFile(fullPath, "kyciti.Data.EnglishSD.nbin");

            // Load the maximum-entropy sentence detector from the extracted model
            _maximumEntropySentenceDetector = new EnglishMaximumEntropySentenceDetector(fullPath);
        }
Example #3
        public WordNetBoostrap(string nlpModelsPath, string wordNetPath)
        {
            this._wordNetPath = wordNetPath;
            _wn               = new WordNetEngine(_wordNetPath, true); // true = load WordNet in memory
            _tokenizer        = new EnglishRuleBasedTokenizer(false);
            _sentenceDetector = new EnglishMaximumEntropySentenceDetector(nlpModelsPath + "EnglishSD.nbin");
            _posTagger        = new EnglishMaximumEntropyPosTagger(nlpModelsPath + "EnglishPOS.nbin", nlpModelsPath + @"\Parser\tagdict");
        }
Example #4
        public string[] SplitSentences(string paragraph)
        {
            // Lazily create the sentence detector on first use
            if (mSentenceDetector == null)
            {
                mSentenceDetector = new EnglishMaximumEntropySentenceDetector(mModelPath + @"\EnglishSD.nbin");
            }

            return mSentenceDetector.SentenceDetect(paragraph);
        }
Example #5
        // NLP methods -------------------------------------------

        private string[] SplitSentences(string paragraph)
        {
            // Lazily create the sentence detector on first use
            if (_sentenceDetector == null)
            {
                _sentenceDetector = new EnglishMaximumEntropySentenceDetector(_modelPath + "EnglishSD.nbin");
            }

            return _sentenceDetector.SentenceDetect(paragraph);
        }
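For context, here is a minimal sketch of how the lazy-initialization pattern from Examples #4 and #5 might be driven end to end. The wrapper class SentenceSplitterDemo, the model directory, and the OpenNLP.Tools.SentenceDetect namespace (SharpNLP's usual home for this class) are assumptions; the EnglishMaximumEntropySentenceDetector constructor and the SentenceDetect call are taken directly from the examples above.

    using System;
    using OpenNLP.Tools.SentenceDetect;

    public class SentenceSplitterDemo
    {
        // Hypothetical folder holding EnglishSD.nbin
        private readonly string _modelPath = @"C:\Models\";
        private EnglishMaximumEntropySentenceDetector _sentenceDetector;

        public string[] SplitSentences(string paragraph)
        {
            // Same lazy-initialization pattern as Examples #4 and #5
            if (_sentenceDetector == null)
            {
                _sentenceDetector = new EnglishMaximumEntropySentenceDetector(_modelPath + "EnglishSD.nbin");
            }
            return _sentenceDetector.SentenceDetect(paragraph);
        }

        public static void Main()
        {
            var splitter = new SentenceSplitterDemo();
            foreach (var sentence in splitter.SplitSentences("This is one sentence. This is another."))
            {
                Console.WriteLine(sentence);
            }
        }
    }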
Example #6
        public void Dispose()
        {
            _tokenizer        = null;
            _sentenceDetector = null;
            _posTagger        = null;

            SynsetArray = null;
            IsStopWord  = null;

            // Null-conditional call guards against a double dispose
            // or a partially-constructed instance
            _wn?.Dispose();
        }
Example #7
        // Default constructor
        public SemCluster(string DataFolder)
        {
            try
            {
                Console.WriteLine("\tSemCluster Text Analytics Tool");
                Console.WriteLine("\t------------------------------");
                Console.WriteLine("\t-Wikipedia local server couldn't be found!");
                Console.WriteLine("\t-Seeds SemAve is in manual mode!");
                Console.WriteLine();
                Console.WriteLine();
                Console.WriteLine("-> Resources loading ...");
                Console.WriteLine();

                #region Loading External Resources
                _wn               = new WordNetEngine(DataFolder + "WordNet", InMemoryWordNet);
                _tokenizer        = new EnglishRuleBasedTokenizer(TokenizeHyphen);
                _sentenceDetector = new EnglishMaximumEntropySentenceDetector(DataFolder + "EnglishSD.nbin");
                _posTagger        = new EnglishMaximumEntropyPosTagger(DataFolder + "EnglishPOS.nbin", DataFolder + "\\Build\\tagdict");
                _chunker          = new EnglishTreebankChunker(DataFolder + "EnglishChunk.nbin");
                #endregion

                PlugInsManager(DataFolder);

                Console.WriteLine("\tResources loaded successfully");
                Console.WriteLine("\t" + PlugInsNumber + " KB plug-ins found in the repository");
                Console.WriteLine("\tPress any key to continue ...");
                Console.ReadKey();
                Console.WriteLine();

                RootVirtualNode = _wn.GetSynSet("Noun:1740");
                ap = new AffinityPropagationClustering();

                SynSetRelationTypes    = new WordNetApi.Core.WordNetEngine.SynSetRelation[2];
                SynSetRelationTypes[0] = WordNetApi.Core.WordNetEngine.SynSetRelation.Hypernym;
                SynSetRelationTypes[1] = WordNetApi.Core.WordNetEngine.SynSetRelation.InstanceHypernym;
            }
            catch (Exception ex)
            {
                // Clean up partially-loaded resources, then rethrow with the
                // original exception preserved as InnerException
                Dispose();
                throw new Exception(ex.Message, ex);
            }
        }
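A minimal sketch of how this constructor might be consumed; the data folder path is an assumption, and the explicit try/finally matches the [Explicit-Dispose] pattern from Example #1 (which nulls the same fields this constructor initializes) without assuming the class declares IDisposable.

    public static class SemClusterDemo
    {
        public static void Main()
        {
            // Hypothetical resources folder containing the WordNet data,
            // the .nbin models, and the KB plug-in repository
            var semCluster = new SemCluster(@"C:\SemCluster\Data\");
            try
            {
                // ... run clustering here ...
            }
            finally
            {
                // Explicit cleanup, mirroring Example #1
                semCluster.Dispose();
            }
        }
    }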
Example #8
        private static void OptimizeSentenceDetectionTraining()
        {
            // all directories in Input folder
            var inputFolderPath = CurrentDirectory + "Input/";
            var allDirectories  = Directory.GetDirectories(inputFolderPath);

            Console.WriteLine("Pick the model to train:");
            for (var i = 0; i < allDirectories.Length; i++)
            {
                Console.WriteLine("{0} - {1}", i, Path.GetFileName(allDirectories[i]));
            }

            // read directory chosen by user
            int directoryIndexPicked = LoopUntilValidUserInput(input => int.Parse(input),
                                                               i => i >= 0 && i < allDirectories.Length, string.Format("Please enter a number in [0..{0}]", allDirectories.Length - 1));

            // read user parameters
            Console.WriteLine("Enter the iteration values to test, separated by a comma (ex: 10,100,200)");
            var iterations = LoopUntilValidUserInput(input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
                                                     li => li != null && li.Any(),
                                                     "At least one iteration value is required");

            Console.WriteLine("Enter the cut values to test, separated by a comma (ex: 1,2,5)");
            var cuts = LoopUntilValidUserInput(input => input.Split(',').Select(s => int.Parse(s.Trim())).ToList(),
                                               li => li != null && li.Any(),
                                               "At least one cut value is required");

            // train model file
            var directory     = allDirectories[directoryIndexPicked];
            var allTrainFiles = Directory.GetFiles(directory, "*.train");

            Console.WriteLine("Training model with files {0}", string.Join(", ", allTrainFiles.Select(f => Path.GetFileNameWithoutExtension(f))));

            // load training data
            var allSentences = new List<string>();

            foreach (var file in allTrainFiles.Where(f => !f.Contains("wsj")))
            {
                allSentences.AddRange(File.ReadAllLines(file));
            }
            var testData = string.Join(" ", allSentences);

            var bestIterationValue = iterations.First();
            var bestCutValue       = cuts.First();
            var bestAccuracy       = 0.0d;

            var endOfSentenceScanner = new CharactersSpecificEndOfSentenceScanner('.', '?', '!', '"', '-', '…');

            foreach (var iteration in iterations)
            {
                foreach (var cut in cuts)
                {
                    var model = MaximumEntropySentenceDetector.TrainModel(allTrainFiles, iteration, cut, endOfSentenceScanner);
                    // compute accuracy
                    var sentenceDetector = new MaximumEntropySentenceDetector(model, endOfSentenceScanner);
                    var results          = sentenceDetector.SentenceDetect(testData);

                    // not perfect for comparing files but it gives a very good approximation
                    //var commonValues = allSentences.Intersect(results).Count();
                    var nbOfCommonSentences = 0;
                    foreach (var result in results)
                    {
                        if (allSentences.Contains(result))
                        {
                            nbOfCommonSentences++;
                        }
                        else
                        {
                            //Console.WriteLine(result);
                        }
                    }
                    // Dice / F1-style score: 2 * |common| / (|expected| + |detected|)
                    var accuracyScore = (float)2 * nbOfCommonSentences / (allSentences.Count + results.Count());
                    Console.WriteLine("Accuracy for iteration={0} and cut={1}: {2}", iteration, cut, accuracyScore);
                    if (accuracyScore > bestAccuracy)
                    {
                        bestAccuracy       = accuracyScore;
                        bestIterationValue = iteration;
                        bestCutValue       = cut;
                    }
                }
            }

            // Persist the best model
            var outputFilePath = CurrentDirectory + "Output/" + Path.GetFileName(directory) + ".nbin";

            Console.WriteLine("Persisting model for iteration={0} and cut={1} to file '{2}'...", bestIterationValue, bestCutValue, outputFilePath);
            var bestModel = MaximumEntropySentenceDetector.TrainModel(allTrainFiles, bestIterationValue, bestCutValue, endOfSentenceScanner);

            new BinaryGisModelWriter().Persist(bestModel, outputFilePath);
            Console.WriteLine("Output file written.");
        }
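As a follow-on to Example #8, here is a minimal sketch of loading the persisted .nbin file and splitting text with it. The output path and demo class are assumptions; the path-based EnglishMaximumEntropySentenceDetector constructor and the SentenceDetect call are the same ones used in Examples #2 through #5.

    using System;
    using OpenNLP.Tools.SentenceDetect;

    public static class TrainedModelDemo
    {
        public static void Main()
        {
            // Hypothetical path to a model produced by OptimizeSentenceDetectionTraining
            var modelPath = "Output/MyCorpus.nbin";

            // The path-based constructor reads a persisted maximum-entropy model
            var detector = new EnglishMaximumEntropySentenceDetector(modelPath);

            foreach (var sentence in detector.SentenceDetect("First sentence. Second sentence!"))
            {
                Console.WriteLine(sentence);
            }
        }
    }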