示例#1
0
文件: Parser.cs 项目: fossabot/atpr
        /// <summary>
        /// Parses the document looking for the sentences in which the entity is found.
        /// Produces one CSV-style row per matching sentence, holding the original file,
        /// the entity, the sentence and the syntax analysis of that sentence.
        /// </summary>
        /// <param name="text">Document text.</param>
        /// <param name="entity">Entity to look for.</param>
        /// <param name="origFile">Original file.</param>
        /// <param name="language">Language used to select the parser model files.</param>
        /// <returns>
        /// A list of rows of the form { origFile, entity, sentence, parse tree as text };
        /// empty when no line is returned by <c>GetEntitiesLines</c>.
        /// </returns>
        public static List <string[]> Parse(string text, string entity, string origFile, string language)
        {
            var results = new List <string[]>();

            // Load the parser model that matches the requested language.
            // NOTE(review): the model is loaded on every call; consider caching it if Parse is called often.
            var modelsDirectory    = StanfordEnv.PARSER_MODELS;
            var lexparserDirectory = modelsDirectory + StanfordEnv.GetParserLanguageFiles(language);
            var lp = LexicalizedParser.loadModel(lexparserDirectory);

            string[]      splittedText = SplitText(text);
            List <string> entityLines  = GetEntitiesLines(splittedText, entity);

            // The factory does not depend on the sentence, so it is built once and reused.
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

            foreach (var line in entityLines)
            {
                var sentenceReader = new java.io.StringReader(line);
                try
                {
                    // Tokenize the sentence and run the lexicalized parser over the tokens.
                    var rawWords  = tokenizerFactory.getTokenizer(sentenceReader).tokenize();
                    var parseTree = lp.apply(rawWords);

                    results.Add(new string[] { origFile, entity, line, parseTree.ToString() });
                }
                finally
                {
                    // Close the reader even when tokenizing or parsing throws.
                    sentenceReader.close();
                }
            }

            return(results);
        }
示例#2
0
        /// <summary>
        /// Global method for entities generation.
        /// Writes a <c>&lt;wis&gt;</c> root element to the output and, for every file obtained from
        /// the input path, the XML produced by the classifier for that file's text.
        /// Files whose text cannot be extracted are reported on standard error and skipped.
        /// </summary>
        /// <param name="inputPath">The input path</param>
        /// <param name="output">Output stream</param>
        /// <param name="language">Language used to select the classifier.</param>
        static void GenerateEntities(string inputPath, TextWriter output, string language)
        {
            output.WriteLine("<wis>");

            string[] fileEntries = FilesUtils.GetFiles(inputPath);

            foreach (var document in fileEntries)
            {
                string text = FilesUtils.FileToText(document);
                // XXX: Better a NullObject, but string can't be inherited I think.
                if (text == null)
                {
                    // Use the shared stderr writer instead of creating and closing a new
                    // StreamWriter over the standard error stream for every unsupported file.
                    Console.Error.WriteLine($"The file '{document}' is not supported");
                    continue;
                }

                // Resolved only after the support check, so nothing is requested for skipped files.
                var classifier = CRFClassifiers.GetClassifierByLang(language);

                output.WriteLine(classifier.classifyToString(text, "xml", true));
            }
            output.WriteLine("</wis>");
        }
示例#3
0
 /// <summary>
 /// Returns the classifier registered for the given language. On the first request for a
 /// language the classifier is loaded with <c>CRFClassifier.getClassifierNoExceptions</c>
 /// and stored in <c>classifiers</c>, so later requests reuse the same instance.
 /// </summary>
 /// <param name="lang">Language key used both for the cache and to pick the model files.</param>
 /// <returns>The cached (or newly loaded) classifier for <paramref name="lang"/>.</returns>
 public static CRFClassifier GetClassifierByLang(string lang)
 {
     CRFClassifier classifier;

     // Single lookup on the hit path instead of ContainsKey followed by the indexer.
     if (!classifiers.TryGetValue(lang, out classifier))
     {
         classifier = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + StanfordEnv.GetNerLanguageFiles(lang));
         classifiers.Add(lang, classifier);
     }
     return(classifier);
 }
示例#4
0
 /// <summary>
 /// Checks the model file for the given language by passing its path to
 /// <c>FilesUtils.ExistsModels</c>. The path is the Stanford home followed by the
 /// classifiers directory, a slash, the language and the fixed model file suffix.
 /// </summary>
 /// <param name="language">Language prefix of the model file name.</param>
 /// <returns>The result of <c>FilesUtils.ExistsModels</c> for the composed path.</returns>
 public static bool CheckLangFiles(String language)
 {
     var modelFileName = language + ".ancora.distsim.s512.crf.ser.gz";
     var modelPath     = StanfordEnv.GetStanfordHome() + StanfordEnv.CLASIFIERS + "/" + modelFileName;

     return FilesUtils.ExistsModels(modelPath);
 }