/// <summary>
/// Parses the document, searching for the sentences in which the entity is found.
/// Produces one record per matching sentence containing the original file, the entity,
/// the sentence and the syntax analysis (parse tree) of that sentence.
/// </summary>
/// <param name="text">Document text.</param>
/// <param name="entity">Entity to look for.</param>
/// <param name="origFile">Original file; copied verbatim into every record.</param>
/// <param name="language">Language used to pick the parser model files.</param>
/// <returns>
/// A list of four-element arrays: <c>{ origFile, entity, sentence, parse tree as text }</c>.
/// The list is empty when no sentence is selected for the entity.
/// </returns>
public static List<string[]> Parse(string text, string entity, string origFile, string language)
{
    var results = new List<string[]>();

    // Load the parser model for the requested language: the model path is the
    // models directory followed by the language-specific file name.
    var modelsDirectory = StanfordEnv.PARSER_MODELS;
    var lexparserDirectory = modelsDirectory + StanfordEnv.GetParserLanguageFiles(language);
    var lp = LexicalizedParser.loadModel(lexparserDirectory);

    string[] splittedText = SplitText(text);
    List<string> entityLines = GetEntitiesLines(splittedText, entity);

    // The tokenizer factory does not depend on the sentence, so build it once
    // instead of once per sentence.
    var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

    foreach (var line in entityLines)
    {
        var sentenceReader = new java.io.StringReader(line);
        try
        {
            // Tokenize the sentence and run the lexicalized parser over the tokens.
            var rawWords = tokenizerFactory.getTokenizer(sentenceReader).tokenize();
            var tree = lp.apply(rawWords);

            results.Add(new string[] { origFile, entity, line, tree.ToString() });
        }
        finally
        {
            // Close the reader even when tokenizing or parsing throws.
            sentenceReader.close();
        }
    }

    return results;
}
/// <summary>
/// Global method for entities generation. Writes a <c>&lt;wis&gt;</c> element to
/// <paramref name="output"/> containing, for every readable file under
/// <paramref name="inputPath"/>, the classifier's XML rendering of that file's text.
/// </summary>
/// <param name="inputPath">The input path whose files are processed.</param>
/// <param name="output">Output stream the XML is written to.</param>
/// <param name="language">Language used to select the classifier.</param>
static void GenerateEntities(string inputPath, TextWriter output, string language)
{
    output.WriteLine("<wis>");

    string[] fileEntries = FilesUtils.GetFiles(inputPath);
    foreach (var document in fileEntries)
    {
        string text = FilesUtils.FileToText(document);

        // XXX: Better a NullObject, but string can't be inherited I think.
        if (text == null)
        {
            // A null text marks a file that could not be converted: report it on
            // standard error and carry on with the remaining files.
            Console.Error.WriteLine($"The file '{document}' is not supported");
            continue;
        }

        // Fetched only once a readable document is found, so nothing is requested
        // when every file is skipped.
        var classifier = CRFClassifiers.GetClassifierByLang(language);
        output.WriteLine(classifier.classifyToString(text, "xml", true));
    }

    output.WriteLine("</wis>");
}
/// <summary>
/// Returns the CRF classifier registered for the given language. On the first request
/// for a language the classifier is loaded from the classifiers directory plus the
/// language-specific NER file name, and stored in <c>classifiers</c> for later calls.
/// </summary>
/// <param name="lang">Language key used both for the cache and to pick the NER files.</param>
/// <returns>The cached or newly loaded classifier for <paramref name="lang"/>.</returns>
public static CRFClassifier GetClassifierByLang(string lang)
{
    // Single lookup instead of ContainsKey + Add + indexer.
    CRFClassifier classifier;
    if (!classifiers.TryGetValue(lang, out classifier))
    {
        classifier = CRFClassifier.getClassifierNoExceptions(classifiersDirectory + StanfordEnv.GetNerLanguageFiles(lang));
        classifiers.Add(lang, classifier);
    }

    return classifier;
}
/// <summary>
/// Checks the classifier model file for a language. The path tested is the Stanford home
/// followed by the classifiers folder, a slash, and
/// <c>&lt;language&gt;.ancora.distsim.s512.crf.ser.gz</c>.
/// </summary>
/// <param name="language">Language prefix of the model file name.</param>
/// <returns>The result of <c>FilesUtils.ExistsModels</c> for that path.</returns>
public static bool CheckLangFiles(String language)
{
    string classifiersRoot = StanfordEnv.GetStanfordHome() + StanfordEnv.CLASIFIERS;
    string modelFileName = language + ".ancora.distsim.s512.crf.ser.gz";
    string modelPath = classifiersRoot + "/" + modelFileName;

    return FilesUtils.ExistsModels(modelPath);
}