public SUTimePipeline(Properties props)
{
    // Default to tokenize/ssplit/pos only: a fresh SUTime annotator is created
    // per query (its options differ per request), and that creation is cheap.
    if (props.GetProperty("annotators") == null)
    {
        props.SetProperty("annotators", "tokenize, ssplit, pos");
    }
    // Replicate the tokenizer behaviour of the standard StanfordCoreNLP pipeline.
    props.SetProperty("tokenize.options", "invertible,ptb3Escaping=true");
    this.pipeline = new StanfordCoreNLP(props);
}
static ChinesePOSExtractor()
{
    var properties = new java.util.Properties();
    properties.setProperty("annotators", "segment, ssplit, pos");
    // Chinese word segmentation, registered as a custom annotator.
    properties.setProperty("customAnnotatorClass.segment", "edu.stanford.nlp.pipeline.ChineseSegmenterAnnotator");
    properties.setProperty("segment.model", "edu/stanford/nlp/models/segmenter/chinese/ctb.gz");
    properties.setProperty("segment.sighanCorporaDict", "edu/stanford/nlp/models/segmenter/chinese");
    properties.setProperty("segment.serDictionary", "edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz");
    properties.setProperty("segment.sighanPostProcessing", "true");
    // Sentence splitting on ASCII and full-width sentence terminators.
    properties.setProperty("ssplit.boundaryTokenRegex", "[.]|[!?]+|[。]|[!?]+");
    // Part-of-speech tagging model.
    properties.setProperty("pos.model", "edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger");
    // NOTE(review): the ner.* and parse.* settings below appear inert — neither
    // "ner" nor "parse" is in the "annotators" list above. Confirm before removing.
    properties.setProperty("ner.model", "edu/stanford/nlp/models/ner/chinese.misc.distsim.crf.ser.gz");
    properties.setProperty("ner.applyNumericClassifiers", "false");
    properties.setProperty("ner.useSUTime", "false");
    properties.setProperty("parse.model", "edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz");
    pipeline = new StanfordCoreNLP(properties);
}
/// <summary>
/// Training and configuration takes several seconds. Training will happen automatically if you just call parse, but I've broken
/// this out to enable front-loading. this way, you can train once and re-use the instance. Or perhaps build a singleton.
/// </summary>
public void TrainAndConfigure()
{
    // Build the (expensive) pipeline once and flag the instance as ready.
    Pipeline = new PipelineConfigurator().GetPipeline();
    IsTrainedAndReady = true;
}
/// <summary>
/// Checks that EnsurePrerequisiteAnnotators expands a requested annotator set into
/// the full, correctly ordered dependency chain without duplicating entries.
/// </summary>
public virtual void TestPrereqAnnotatorsBasic()
{
    // parse requires tokenize, ssplit and pos.
    NUnit.Framework.Assert.AreEqual("tokenize,ssplit,pos,parse", StanfordCoreNLP.EnsurePrerequisiteAnnotators(new string[] { "parse" }, new Properties()));
    // depparse has the same prerequisites.
    NUnit.Framework.Assert.AreEqual("tokenize,ssplit,pos,depparse", StanfordCoreNLP.EnsurePrerequisiteAnnotators(new string[] { "depparse" }, new Properties()));
    // Explicitly listing an already-required annotator must not duplicate it.
    NUnit.Framework.Assert.AreEqual("tokenize,ssplit,pos,depparse", StanfordCoreNLP.EnsurePrerequisiteAnnotators(new string[] { "depparse", "tokenize" }, new Properties()));
    // natlog additionally pulls in lemma and depparse.
    NUnit.Framework.Assert.AreEqual("tokenize,ssplit,pos,lemma,depparse,natlog", StanfordCoreNLP.EnsurePrerequisiteAnnotators(new string[] { "natlog", "tokenize" }, new Properties()));
}
/// <summary>Set the properties to the paths they appear at on the servlet.</summary>
/// <remarks>
/// Set the properties to the paths they appear at on the servlet.
/// See build.xml for where these paths get copied.
/// </remarks>
/// <exception cref="Javax.Servlet.ServletException">Thrown by the implementation</exception>
public override void Init()
{
    // _Properties_43 is a converter-generated anonymous Properties subclass
    // (declared elsewhere in this file) carrying the default configuration.
    Properties commonProps = new _Properties_43();
    try
    {
        string dataDir = GetServletContext().GetRealPath("/WEB-INF/data");
        // Point the jollyday holiday library at its bundled configuration file.
        Runtime.SetProperty("de.jollyday.config", GetServletContext().GetRealPath("/WEB-INF/classes/holidays/jollyday.properties"));
        commonProps.SetProperty("pos.model", dataDir + "/english-left3words-distsim.tagger");
        commonProps.SetProperty("ner.model", dataDir + "/english.all.3class.distsim.crf.ser.gz," + dataDir + "/english.conll.4class.distsim.crf.ser.gz," + dataDir + "/english.muc.7class.distsim.crf.ser.gz");
        commonProps.SetProperty("depparse.model", dataDir + "/english_SD.gz");
        commonProps.SetProperty("parse.model", dataDir + "/englishPCFG.ser.gz");
        // NOTE(review): "english.hollidays.sutime.txt" looks misspelled ("holidays");
        // confirm against the file names actually copied by build.xml before changing.
        commonProps.SetProperty("sutime.rules", dataDir + "/defs.sutime.txt," + dataDir + "/english.sutime.txt," + dataDir + "/english.hollidays.sutime.txt");
        commonProps.SetProperty("openie.splitter.model", dataDir + "/clauseSplitterModel.ser.gz");
        commonProps.SetProperty("openie.affinity_models", dataDir);
    }
    catch (ArgumentNullException)
    {
        // GetRealPath returned null — presumably running outside a servlet container.
        log.Info("Could not load servlet context. Are you on the command line?");
    }
    if (this.pipeline == null)
    {
        // Primary pipeline: dependency-parse based OpenIE.
        Properties fullProps = new Properties(commonProps);
        fullProps.SetProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,ner,natlog,openie");
        this.pipeline = new StanfordCoreNLP(fullProps);
    }
    if (this.backoff == null)
    {
        // Backoff pipeline: constituency-parse based, with requirement checks disabled.
        Properties backoffProps = new Properties(commonProps);
        backoffProps.SetProperty("annotators", "parse,natlog,openie");
        backoffProps.SetProperty("enforceRequirements", "false");
        this.backoff = new StanfordCoreNLP(backoffProps);
    }
}
public StanfordCoreNLP LoadNlp(IContentProvider jsonContentProvider)
{
    _jsonContentProvider = jsonContentProvider;

    // Annotation pipeline configuration.
    var props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, entitymentions");
    props.setProperty("ner.useSUTime", "0");

    if (_jarRoot.IsNullOrEmpty())
    {
        throw new NullReferenceException("ERROR: Core NLP Java model directory not specified. Set string JarRootDir.");
    }

    // CoreNLP resolves model files relative to the current directory, so switch
    // there momentarily and restore afterwards.
    var previousDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(_jarRoot);
    // To suppress CoreNLP logging, wrap the next line with
    // RedwoodConfiguration.empty().capture(...) / .current().clear().apply().
    var pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(previousDir);
    return pipeline;
}
public NLPProcessor()
{
    // NOTE(review): model paths are hard-coded; verify these files (in particular
    // the "english.all.7class" NER model name) actually exist under C:\NLPModels.
    const string pathToCoreModels = @"C:\NLPModels\";
    const string pathToTaggerModel = @"C:\NLPModels\english-left3words-distsim.tagger";
    const string pathToNerTagger = @"C:\NLPModels\english.all.7class.distsim.crf.ser.gz";
    const string sutimeRules = pathToCoreModels + @"sutime\defs.sutime.txt," + pathToCoreModels + @"sutime\english.holidays.sutime.txt," + pathToCoreModels + @"sutime\english.sutime.txt";

    // Annotation pipeline configuration.
    var props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
    props.setProperty("sutime.rules", sutimeRules);
    props.setProperty("sutime.binders", "0");
    props.setProperty("pos.model", pathToTaggerModel);
    props.setProperty("ner.model", pathToNerTagger);

    // Switch the current directory so CoreNLP can find remaining model files
    // automatically, then restore it.
    var previousDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(pathToCoreModels);
    _pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(previousDir);
}
/// <summary>
/// Loads the Spanish CoreNLP properties file, overrides a few settings, and builds
/// the pipeline with the model folder as the current directory.
/// </summary>
/// <exception cref="SpanishCoreNLPSetupException">Wraps any failure during setup.</exception>
private void SetupCoreNLP()
{
    try
    {
        var propsFile = System.IO.Path.Combine(_modelPath, "StanfordCoreNLP-spanish.properties");

        // Annotation pipeline configuration.
        var props = new Properties();
        // FIX: close the FileReader once the properties are loaded (it leaked before).
        var reader = new FileReader(propsFile);
        try
        {
            props.load(reader);
        }
        finally
        {
            reader.close();
        }
        props.put("ner.useSUTime", "0");
        props.put("threads", "10");
        props.put("tokenize.verbose", "true");
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, depparse, kbp, coref,entitymentions, quote");

        // CoreNLP resolves model paths relative to the current directory.
        // FIX: restore the directory even if pipeline construction throws.
        var curDir = Environment.CurrentDirectory;
        Directory.SetCurrentDirectory(_modelPath);
        try
        {
            _pipeline = new StanfordCoreNLP(props);
        }
        finally
        {
            Directory.SetCurrentDirectory(curDir);
        }
    }
    catch (Exception e)
    {
        throw new SpanishCoreNLPSetupException(e.Message, e);
    }
}
/// <summary>
/// Verifies that the default tokenizer ignores newlines: the token stream of a
/// text containing embedded newlines must match the expected word list exactly,
/// both via Word() and via the TextAnnotation key.
/// </summary>
public virtual void TestDefaultNoNLsPipeline()
{
    string t = "Text with \n\n a new \nline.";
    IList<string> tWords = Arrays.AsList("Text", "with", "a", "new", "line", ".");
    Properties props = new Properties();
    props.SetProperty("annotators", "tokenize");
    Annotation ann = new Annotation(t);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.Annotate(ann);

    // BUG FIX: IEnumerator<T>.Current is undefined before the first MoveNext()
    // call; the original loops compared every token against it.Current without
    // ever advancing the iterator. Advance before each comparison.
    IEnumerator<string> it = tWords.GetEnumerator();
    foreach (CoreLabel word in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        NUnit.Framework.Assert.IsTrue("Too many tokens in new CoreLabel usage", it.MoveNext());
        NUnit.Framework.Assert.AreEqual("Bung token in new CoreLabel usage", it.Current, word.Word());
    }
    NUnit.Framework.Assert.IsFalse("Too few tokens in new CoreLabel usage", it.MoveNext());

    IEnumerator<string> it2 = tWords.GetEnumerator();
    foreach (CoreLabel word_1 in ann.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        NUnit.Framework.Assert.IsTrue("Too many tokens in new CoreLabel usage", it2.MoveNext());
        NUnit.Framework.Assert.AreEqual("Bung token in new CoreLabel usage", it2.Current, word_1.Get(typeof(CoreAnnotations.TextAnnotation)));
    }
    NUnit.Framework.Assert.IsFalse("Too few tokens in new CoreLabel usage", it2.MoveNext());
}
/// <summary>
/// Re-runs CoreNLP NER over an annotated document and re-types each existing
/// entity mention with the most frequent NER tag across its token span.
/// </summary>
private static void ModifyUsingCoreNLPNER(Annotation doc)
{
    // (The original named this Properties variable "ann"; renamed for clarity.)
    Properties props = new Properties();
    props.SetProperty("annotators", "pos, lemma, ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props, false);
    pipeline.Annotate(doc);
    foreach (ICoreMap sentence in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
        if (mentions == null)
        {
            continue;
        }
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        foreach (EntityMention mention in mentions)
        {
            // Count the NER tags over the mention's extent and keep the majority tag.
            Span extent = mention.GetExtent();
            ICounter<string> tagCounts = new ClassicCounter<string>();
            for (int i = extent.Start(); i < extent.End(); i++)
            {
                tagCounts.IncrementCount(tokens[i].Ner());
            }
            mention.SetType(Counters.Argmax(tagCounts));
        }
    }
}
static void Main()
{
    // Path to the folder with models extracted from `stanford-corenlp-3.9.1-models.jar`
    var jarRoot = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-corenlp-full-2018-10-05\models";

    // Text for processing.
    var text = "Kosgi Santosh sent an email to Stanford University. He didn't get a reply.";

    // Annotation pipeline configuration.
    var props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, ner,dcoref");
    props.setProperty("ner.useSUTime", "0");

    // CoreNLP resolves model files relative to the current directory; switch
    // there momentarily and restore afterwards.
    var previousDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(jarRoot);
    var pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(previousDir);

    // Annotate and pretty-print the result to the console.
    var annotation = new Annotation(text);
    pipeline.annotate(annotation);
    using (var stream = new ByteArrayOutputStream())
    {
        pipeline.prettyPrint(annotation, new PrintWriter(stream));
        Console.WriteLine(stream.toString());
        stream.close();
    }
}
/// <summary>
/// Executes Sentiment and EntitiesMentioned analysis.
/// </summary>
public IOutcome<AnalysisResult> Analyze(StanfordCoreNLP pipeline, string text)
{
    // Annotate the raw text.
    Annotation doc = new Annotation(text);
    pipeline.annotate(doc);

    // Guard: without sentence annotations there is nothing to analyze.
    var sentences = doc.get(typeof(CoreAnnotations.SentencesAnnotation));
    if (sentences == null)
    {
        return Outcomes.Outcomes
               .Failure<AnalysisResult>()
               .WithMessage("No sentences detected.");
    }

    var result = new AnalysisResult()
    {
        Sentiment = GetSentiment((ArrayList)sentences),
        MentionedEntities = GetMentions(doc)
    };
    return Outcomes.Outcomes
           .Success<AnalysisResult>()
           .WithValue(result);
}
/// <summary>
/// Annotates the given text and prints the top-level annotation summary plus
/// per-token word/lemma/POS/NER details to the supplied writer.
/// </summary>
private static void RunPipeline(StanfordCoreNLP pipeline, string text, PrintWriter @out)
{
    Annotation annotation = new Annotation(text);
    pipeline.Annotate(annotation);

    // An Annotation is a Map; ToShorterString() reveals its contents, whereas
    // ToString() would only print the raw text.
    @out.Println();
    @out.Println("The top level annotation");
    @out.Println(annotation.ToShorterString());

    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    foreach (ICoreMap sentence in sentences)
    {
        foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
        {
            // Dump word, lemma, NER tag and normalized NER value per token.
            string word = token.Get(typeof(CoreAnnotations.TextAnnotation));
            string lemma = token.Get(typeof(CoreAnnotations.LemmaAnnotation));
            string pos = token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
            string ne = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            string normalized = token.Get(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation));
            @out.Println("token: " + "word=" + word + ", lemma=" + lemma + ", pos=" + pos + ", ne=" + ne + ", normalized=" + normalized);
        }
    }
    @out.Flush();
}
} = false; // I'd, don't, you're, ...

/// <summary>
/// Configures the tokenize/ssplit/pos/lemma pipeline and loads the stopword list.
/// </summary>
protected void InitializeProcessor()
{
    // lemma needs ssplit and pos, pos needs ssplit
    this.props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
    this.pipeline = new StanfordCoreNLP(this.props);
    this.InitializeStopwords();
}
/// <summary>
/// Command-line driver: with -loadFile it deserializes a saved annotation and
/// prints its summary; with -file it annotates the file's text and serializes
/// the result to "&lt;file&gt;.ser"; otherwise it prints usage.
/// </summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    Properties props = StringUtils.ArgsToProperties(args);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    string file = props.GetProperty("file");
    string loadFile = props.GetProperty("loadFile");
    if (loadFile != null && !loadFile.IsEmpty())
    {
        // Deserialize a previously saved annotation and print its summary.
        Edu.Stanford.Nlp.Pipeline.CustomAnnotationSerializer ser = new Edu.Stanford.Nlp.Pipeline.CustomAnnotationSerializer(false, false);
        InputStream @is = new FileInputStream(loadFile);
        Pair<Annotation, InputStream> pair = ser.Read(@is);
        pair.second.Close();
        Annotation anno = pair.first;
        System.Console.Out.WriteLine(anno.ToShorterString(StringUtils.EmptyStringArray));
        @is.Close();
    }
    else
    {
        if (file != null && !file.Equals(string.Empty))
        {
            // Annotate the file's text and persist the serialized result beside it.
            string text = IOUtils.SlurpFile(file);
            Annotation doc = new Annotation(text);
            pipeline.Annotate(doc);
            Edu.Stanford.Nlp.Pipeline.CustomAnnotationSerializer ser = new Edu.Stanford.Nlp.Pipeline.CustomAnnotationSerializer(false, false);
            TextWriter os = new TextWriter(new FileOutputStream(file + ".ser"));
            ser.Write(doc, os).Close();
            log.Info("Serialized annotation saved in " + file + ".ser");
        }
        else
        {
            log.Info("usage: CustomAnnotationSerializer [-file file] [-loadFile file]");
        }
    }
}
static void Main()
{
    // Path to the folder with models extracted from `stanford-corenlp-3.7.0-models.jar`
    var jarRoot = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-corenlp-full-2016-10-31\models";

    // Text for processing.
    var text = "Kosgi Santosh sent an email to Stanford University. He didn't get a reply.";

    // Annotation pipeline configuration.
    var props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, ner,dcoref");
    props.setProperty("ner.useSUTime", "0");

    // CoreNLP resolves model files relative to the current directory; switch
    // there momentarily and restore afterwards.
    var previousDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(jarRoot);
    var pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(previousDir);

    // Annotate and pretty-print the result to the console.
    var annotation = new Annotation(text);
    pipeline.annotate(annotation);
    using (var stream = new ByteArrayOutputStream())
    {
        pipeline.prettyPrint(annotation, new PrintWriter(stream));
        Console.WriteLine(stream.toString());
        stream.close();
    }
}
/// <summary>Annotate a document (which is usually just a sentence).</summary>
public virtual void Annotate(StanfordCoreNLP pipeline, Annotation ann)
{
    if (ann.Get(typeof(CoreAnnotations.SentencesAnnotation)) == null)
    {
        // Never annotated before: run the full pipeline.
        pipeline.Annotate(ann);
    }
    else
    {
        if (ann.Get(typeof(CoreAnnotations.SentencesAnnotation)).Count == 1)
        {
            // Re-annotating a single-sentence document: strip stale natural-logic,
            // OpenIE and dependency annotations first so the pipeline recomputes them.
            ICoreMap sentence = ann.Get(typeof(CoreAnnotations.SentencesAnnotation))[0];
            foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
            {
                token.Remove(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                token.Remove(typeof(NaturalLogicAnnotations.PolarityAnnotation));
            }
            sentence.Remove(typeof(NaturalLogicAnnotations.RelationTriplesAnnotation));
            sentence.Remove(typeof(NaturalLogicAnnotations.EntailedSentencesAnnotation));
            sentence.Remove(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
            sentence.Remove(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
            sentence.Remove(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation));
            pipeline.Annotate(ann);
        }
        // NOTE(review): already-annotated documents with more than one sentence are
        // silently left untouched — confirm this is intentional.
    }
}
/// <summary>
/// Setup extended tagger that includes POS, lemma and entity analysis.
/// Loads the CoreNLP models from the resources folder into <c>Tagger</c>.
/// </summary>
private void SetupExtendedTagger()
{
    PerformanceTester.StartMET("NLP");
    // Get path to Stanford NLP models.
    var jarRoot = Path.Combine(Utility.GetResourcesFolder(), @"stanford-corenlp-3.9.2-models");
    // Turn off logging.
    RedwoodConfiguration.current().clear().apply();
    var props = new java.util.Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
    // Makes Named Entity Recognition work in the library.
    props.setProperty("ner.useSUTime", "0");
    // FIX: use setProperty consistently (the original mixed put/setProperty).
    props.setProperty("ner.applyFineGrained", "0");
    props.setProperty("ner.fine.regexner.mapping", jarRoot + @"\edu\stanford\nlp\models\kbp\english\");
    // FIX: removed the unused `modelsDirectory` local the original computed here.
    // NOTE(review): the current directory is changed but never restored — confirm
    // later code does not rely on relative paths before adding a restore.
    Directory.SetCurrentDirectory(jarRoot);
    // Load Stanford NLP.
    Tagger = new StanfordCoreNLP(props);
    PerformanceTester.StopMET("NLP");
}
/// <summary>
/// Demo driver: annotates two hard-coded sentences, runs relation extraction on
/// top, and prints every detected relation mention per sentence.
/// </summary>
public static void Main(string[] args)
{
    try
    {
        Properties props = StringUtils.ArgsToProperties(args);
        props.SetProperty("annotators", "tokenize,ssplit,lemma,pos,parse,ner");
        // BUG FIX: the configured properties were never handed to the pipeline —
        // the original called the parameterless constructor, so the annotator
        // list above was silently ignored.
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        string sentence = "Barack Obama lives in America. Obama works for the Federal Goverment.";
        Annotation doc = new Annotation(sentence);
        pipeline.Annotate(doc);
        // Layer relation extraction on top of the base annotations.
        Edu.Stanford.Nlp.Pipeline.RelationExtractorAnnotator r = new Edu.Stanford.Nlp.Pipeline.RelationExtractorAnnotator(props);
        r.Annotate(doc);
        foreach (ICoreMap s in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            System.Console.Out.WriteLine("For sentence " + s.Get(typeof(CoreAnnotations.TextAnnotation)));
            IList<RelationMention> rls = s.Get(typeof(MachineReadingAnnotations.RelationMentionsAnnotation));
            foreach (RelationMention rl in rls)
            {
                System.Console.Out.WriteLine(rl.ToString());
            }
        }
    }
    catch (Exception e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Demo driver for Chinese coreference: annotates a pre-segmented Chinese news
/// text, then prints all coref chains, all mentions per sentence, and the total
/// running time.
/// </summary>
/// <exception cref="System.Exception"/>
public static void Main(string[] args)
{
    long startTime = Runtime.CurrentTimeMillis();
    // Pre-segmented (space-delimited) Chinese input text.
    string text = "俄罗斯 航空 公司 一 名 官员 在 9号 说 , " + "米洛舍维奇 的 儿子 马可·米洛舍维奇 9号 早上 持 外交 护照 从 俄国 首都 莫斯科 搭机 飞往 中国 大陆 北京 , " + "可是 就 在 稍后 就 返回 莫斯科 。 " + "这 名 俄国 航空 公司 官员 说 马可 是 因为 护照 问题 而 在 北京 机场 被 中共 遣返 莫斯科 。 " + "北京 机场 方面 的 这 项 举动 清楚 显示 中共 有意 放弃 在 总统 大选 落败 的 前 南斯拉夫 总统 米洛舍维奇 , " + "因此 他 在 南斯拉夫 受到 民众 厌恶 的 儿子 马可 才 会 在 北京 机场 被 中共 当局 送回 莫斯科 。 " + "马可 持 外交 护照 能够 顺利 搭机 离开 莫斯科 , 但是 却 在 北京 受阻 , 可 算是 踢到 了 铁板 。 " + "可是 这 项 消息 和 先前 外界 谣传 中共 当局 准备 提供 米洛舍维奇 和 他 的 家人 安全 庇护所 有 着 很 大 的 出入 ," + " 一般 认为 在 去年 米洛舍维奇 挥兵 攻打 科索沃 境内 阿尔巴尼亚 一 分离主义 分子 的 时候 , " + "强力 反对 北约 组织 攻击 南斯拉夫 的 中共 , 会 全力 保护 米洛舍维奇 和 他 的 家人 及 亲信 。 " + "可是 从 9号 马可 被 送回 莫斯科 一 事 看 起来 , 中共 很 可能 会 放弃 米洛舍维奇 。";
    // NOTE(review): command-line args are deliberately overwritten here — this demo
    // always uses the default Chinese coref properties file.
    args = new string[] { "-props", "edu/stanford/nlp/hcoref/properties/zh-coref-default.properties" };
    Annotation document = new Annotation(text);
    Properties props = StringUtils.ArgsToProperties(args);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    pipeline.Annotate(document);
    // Dump all coreference chains found in the document.
    System.Console.Out.WriteLine("---");
    System.Console.Out.WriteLine("coref chains");
    foreach (CorefChain cc in document.Get(typeof(CorefCoreAnnotations.CorefChainAnnotation)).Values)
    {
        System.Console.Out.WriteLine("\t" + cc);
    }
    // Dump every candidate mention, grouped by sentence.
    foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        System.Console.Out.WriteLine("---");
        System.Console.Out.WriteLine("mentions");
        foreach (Mention m in sentence.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation)))
        {
            System.Console.Out.WriteLine("\t" + m);
        }
    }
    long endTime = Runtime.CurrentTimeMillis();
    long time = (endTime - startTime) / 1000;
    System.Console.Out.WriteLine("Running time " + time / 60 + "min " + time % 60 + "s");
}
/// <summary>
/// Lazily builds the parsing pipeline: reuses a registered "parse" annotator when
/// one exists, otherwise constructs a default ParserAnnotator; when the parser
/// requires POS tags, a registered "pos" annotator is chained in front of it.
/// </summary>
private IAnnotator GetParser()
{
    if (parserProcessor == null)
    {
        IAnnotator parser = StanfordCoreNLP.GetExistingAnnotator("parse");
        if (parser == null)
        {
            // No shared parser registered; build one with default settings.
            Properties emptyProperties = new Properties();
            parser = new ParserAnnotator("coref.parse.md", emptyProperties);
        }
        if (parser == null)
        {
            // TODO: these assertions rule out the possibility of alternately named parse/pos annotators
            throw new AssertionError("Failed to get parser - this should not be possible");
        }
        if (parser.Requires().Contains(typeof(CoreAnnotations.PartOfSpeechAnnotation)))
        {
            // The parser consumes POS tags, so prepend the shared tagger.
            IAnnotator tagger = StanfordCoreNLP.GetExistingAnnotator("pos");
            if (tagger == null)
            {
                throw new AssertionError("Parser required tagger, but failed to find the pos annotator");
            }
            IList<IAnnotator> annotators = Generics.NewArrayList();
            annotators.Add(tagger);
            annotators.Add(parser);
            parserProcessor = new AnnotationPipeline(annotators);
        }
        else
        {
            parserProcessor = parser;
        }
    }
    return (parserProcessor);
}
//-------------------------------------------------------------------------------------- Stanford Core NLP -----------------------------------------
//-- Better for Entity recognition
/// <summary>
/// Builds a full CoreNLP pipeline (POS, lemma, NER, parse, dcoref), annotates the
/// given text and pretty-prints the result to the debug output.
/// See https://interviewbubble.com/getting-started-with-stanford-corenlp-a-stanford-corenlp-tutorial/
/// </summary>
public static void buildPipeline(string text)
{
    // Path to the folder with models extracted from `stanford-corenlp-3.7.0-models.jar`
    var jarRoot = @"..\..\..\..\data\paket-files\nlp.stanford.edu\stanford-corenlp-full-2016-10-31\models";
    // creates a StanfordCoreNLP object, with POS tagging, lemmatization,
    // NER, parsing, and coreference resolution
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    // BUG FIX: jarRoot was declared but never used, so CoreNLP could not locate
    // its models unless the process happened to run from the model folder.
    // Temporarily switch the current directory, as the sibling demos do.
    var curDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(jarRoot);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(curDir);
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline.annotate(document);
    // Result - Pretty Print
    using (var stream = new ByteArrayOutputStream())
    {
        pipeline.prettyPrint(document, new PrintWriter(stream));
        Debug.WriteLine(stream.toString());
        stream.close();
    }
}
/// <summary>
/// Builds the tokenize/ssplit/pos/lemma pipeline used for lemmatization.
/// </summary>
/// <param name="jarRootPath">Path to the folder with models extracted from 'stanford-corenlp-3.5.2-models.jar'</param>
public StanfordLemmatizer(string jarRootPath)
{
    // Fail fast with a descriptive message if the model folder is missing.
    if (!Directory.Exists(jarRootPath))
    {
        string fullPath = Path.GetFullPath(jarRootPath);
        throw new DirectoryNotFoundException("Folder(s) extracted from 'stanford-corenlp-3.5.2-models.jar' was not found in path: . " + "-->" + fullPath + "<--. " + "Please make sure correct path is listed in .config file.");
    }

    // Set properties required for lemma.
    java.util.Properties props = new java.util.Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
    props.setProperty("ner.useSUTime", "0");

    // CoreNLP resolves model files relative to the current directory; switch
    // there momentarily and restore afterwards.
    string previousDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(jarRootPath);
    _pipeLine = new StanfordCoreNLP(props);
    Directory.SetCurrentDirectory(previousDir);

    // Cache the annotation keys used when reading results back out.
    _sentencesAnnotation = new CoreAnnotations.SentencesAnnotation();
    _tokensAnnotation = new CoreAnnotations.TokensAnnotation();
    _lemmaAnnotation = new CoreAnnotations.LemmaAnnotation();
}
/// <summary>
/// Checks tokenizer hyphen handling: the default options keep hyphenated words
/// as single tokens (21 total), while splitHyphenated=true splits them (27 total).
/// </summary>
public virtual void TestHyphens()
{
    string test = "Hyphen-ated words should be split except when school-aged-children eat " + "anti-disestablishmentariansm for breakfast at the o-kay choral infront of some explor-o-toriums.";

    // Default tokenizer options.
    Properties defaultProps = new Properties();
    defaultProps.SetProperty("annotators", "tokenize");
    Annotation defaultAnn = new Annotation(test);
    StanfordCoreNLP defaultPipeline = new StanfordCoreNLP(defaultProps);
    defaultPipeline.Annotate(defaultAnn);
    IList<CoreLabel> defaultToks = defaultAnn.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(21, defaultToks.Count);

    // splitHyphenated=true produces extra tokens for the hyphen pieces.
    Properties splitProps = new Properties();
    splitProps.SetProperty("annotators", "tokenize");
    splitProps.SetProperty("tokenize.options", "splitHyphenated=true");
    Annotation splitAnn = new Annotation(test);
    StanfordCoreNLP splitPipeline = new StanfordCoreNLP(splitProps);
    splitPipeline.Annotate(splitAnn);
    IList<CoreLabel> splitToks = splitAnn.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(27, splitToks.Count);
}
// static demo class
/// <summary>
/// Demo driver: annotates a text file, compiles one TokensRegex pattern per line
/// of the rules file, and prints all non-overlapping matches per sentence.
/// Usage: TokensRegexMatcher rules file [outFile]
/// </summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    if (args.Length < 2)
    {
        System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
        return;
    }
    string rules = args[0];
    // Optional third argument redirects output to a file instead of stdout.
    PrintWriter @out;
    if (args.Length > 2)
    {
        @out = new PrintWriter(args[2]);
    }
    else
    {
        @out = new PrintWriter(System.Console.Out);
    }
    StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
    pipeline.Annotate(annotation);
    // Load lines of file as TokenSequencePatterns
    IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
    foreach (string line in ObjectBank.GetLineIterator(rules))
    {
        TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
        tokenSequencePatterns.Add(pattern);
    }
    IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    int i = 0;
    foreach (ICoreMap sentence in sentences)
    {
        IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        @out.Println("Sentence #" + ++i);
        @out.Print(" Tokens:");
        foreach (CoreLabel token in tokens)
        {
            @out.Print(' ');
            @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
        }
        @out.Println();
        // Find all non-overlapping matches of any pattern within this sentence.
        MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
        IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
        int j = 0;
        foreach (ISequenceMatchResult<ICoreMap> matched in answers)
        {
            @out.Println(" Match #" + ++j);
            // Group 0 is the whole match; groups 1..N are capture groups.
            for (int k = 0; k <= matched.GroupCount(); k++)
            {
                @out.Println(" group " + k + " = " + matched.Group(k));
            }
        }
    }
    @out.Flush();
}
/// <summary>
/// Builds a ParserAnnotator from properties scoped by <paramref name="annotatorName"/>
/// (e.g. "parse.model", "parse.maxlen", "parse.buildgraphs", ...).
/// </summary>
public ParserAnnotator(string annotatorName, Properties props)
{
    string model = props.GetProperty(annotatorName + ".model", LexicalizedParser.DefaultParserLoc);
    if (model == null)
    {
        throw new ArgumentException("No model specified for Parser annotator " + annotatorName);
    }
    this.Verbose = PropertiesUtils.GetBool(props, annotatorName + ".debug", false);
    string[] flags = ConvertFlagsToArray(props.GetProperty(annotatorName + ".flags"));
    this.parser = LoadModel(model, Verbose, flags);
    // -1 means unlimited sentence length.
    this.maxSentenceLength = PropertiesUtils.GetInt(props, annotatorName + ".maxlen", -1);
    // Optional post-processing transform applied to each parse tree, loaded by reflection.
    string treeMapClass = props.GetProperty(annotatorName + ".treemap");
    if (treeMapClass == null)
    {
        this.treeMap = null;
    }
    else
    {
        this.treeMap = ReflectionLoading.LoadByReflection(treeMapClass, props);
    }
    this.maxParseTime = PropertiesUtils.GetLong(props, annotatorName + ".maxtime", -1);
    this.kBest = PropertiesUtils.GetInt(props, annotatorName + ".kbest", 1);
    this.keepPunct = PropertiesUtils.GetBool(props, annotatorName + ".keepPunct", true);
    // Dependency graphs can only be built when the language pack supports them.
    string buildGraphsProperty = annotatorName + ".buildgraphs";
    if (!this.parser.GetTLPParams().SupportsBasicDependencies())
    {
        if (PropertiesUtils.GetBool(props, buildGraphsProperty))
        {
            log.Info("WARNING: " + buildGraphsProperty + " set to true, but " + this.parser.GetTLPParams().GetType() + " does not support dependencies");
        }
        this.BuildGraphs = false;
    }
    else
    {
        this.BuildGraphs = PropertiesUtils.GetBool(props, buildGraphsProperty, true);
    }
    if (this.BuildGraphs)
    {
        bool generateOriginalDependencies = PropertiesUtils.GetBool(props, annotatorName + ".originalDependencies", false);
        parser.GetTLPParams().SetGenerateOriginalDependencies(generateOriginalDependencies);
        ITreebankLanguagePack tlp = parser.GetTLPParams().TreebankLanguagePack();
        // The punctuation filter decides whether punctuation appears in dependencies.
        IPredicate<string> punctFilter = this.keepPunct ? Filters.AcceptFilter() : tlp.PunctuationWordRejectFilter();
        this.gsf = tlp.GrammaticalStructureFactory(punctFilter, parser.GetTLPParams().TypedDependencyHeadFinder());
    }
    else
    {
        this.gsf = null;
    }
    // Annotator-specific thread count, falling back to the global "nthreads" setting.
    this.nThreads = PropertiesUtils.GetInt(props, annotatorName + ".nthreads", PropertiesUtils.GetInt(props, "nthreads", 1));
    bool usesBinary = StanfordCoreNLP.UsesBinaryTrees(props);
    this.saveBinaryTrees = PropertiesUtils.GetBool(props, annotatorName + ".binaryTrees", usesBinary);
    this.noSquash = PropertiesUtils.GetBool(props, annotatorName + ".nosquash", false);
    this.extraDependencies = MetaClass.Cast(props.GetProperty(annotatorName + ".extradependencies", "NONE"), typeof(GrammaticalStructure.Extras));
}
/// <summary>A debugging method to try entity linking sentences from the console.</summary>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    // Build an entity-linking pipeline on top of any command-line properties,
    // then enter the interactive console loop.
    Properties props = StringUtils.ArgsToProperties(args);
    props.SetProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    IOUtils.Console("sentence> ", null);
}
public virtual IAnnotator GetParser()
{
    // Lazily resolve and cache the shared "parse" annotator from the pipeline registry.
    if (parserProcessor == null)
    {
        parserProcessor = StanfordCoreNLP.GetExistingAnnotator("parse");
        System.Diagnostics.Debug.Assert((parserProcessor != null));
    }
    return parserProcessor;
}
public void StanfordCore(string jarRoot = @"..\..\models")
{
    // Minimal lemmatization pipeline.
    var properties = new java.util.Properties();
    properties.setProperty("annotators", "tokenize, ssplit, pos, lemma");

    // CoreNLP loads models relative to the current directory; switch there
    // momentarily and restore afterwards.
    var previousDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(jarRoot);
    pipeline = new StanfordCoreNLP(properties);
    Directory.SetCurrentDirectory(previousDir);
}
/// <summary>
/// Regression check: constructing a Sentence from an annotated ICoreMap must not throw.
/// </summary>
public virtual void TestFromCoreMapCrashCheck()
{
    // _Properties_107 is a converter-generated anonymous Properties subclass
    // (declared elsewhere in this file) holding the pipeline configuration.
    StanfordCoreNLP pipeline = new StanfordCoreNLP(new _Properties_107());
    Annotation ann = new Annotation("This is a sentence.");
    pipeline.Annotate(ann);
    ICoreMap map = ann.Get(typeof(CoreAnnotations.SentencesAnnotation))[0];
    new Sentence(map);
}
/// <summary>A debugging method to try relation extraction from the console.</summary>
/// <exception cref="System.IO.IOException">If any IO problem</exception>
public static void Main(string[] args)
{
    // Build a full KBP pipeline (with a caseless+cased regexner mapping) on top of
    // any command-line properties, then enter the interactive console loop.
    Properties props = StringUtils.ArgsToProperties(args);
    props.SetProperty("annotators", "tokenize,ssplit,pos,lemma,ner,regexner,parse,mention,coref,kbp");
    props.SetProperty("regexner.mapping", "ignorecase=true,validpospattern=^(NN|JJ).*,edu/stanford/nlp/models/kbp/regexner_caseless.tab;edu/stanford/nlp/models/kbp/regexner_cased.tab");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    IOUtils.Console("sentence> ", null);
}
/// <summary>
/// Runs the base CoreNLP pipeline over the text, then layers hybrid coreference
/// on top, and returns the annotated document.
/// NOTE(review): the method name is misspelled ("Annoation") but kept for callers.
/// </summary>
private static Annotation TestAnnoation(string text, string[] args)
{
    Annotation document = new Annotation(text);
    Properties props = StringUtils.ArgsToProperties(args);
    StanfordCoreNLP corenlp = new StanfordCoreNLP(props);
    corenlp.Annotate(document);
    var hcoref = new Edu.Stanford.Nlp.Pipeline.HybridCorefAnnotator(props);
    hcoref.Annotate(document);
    return document;
}
/// <summary>Populates options from StanfordCoreNLP pipeline.</summary>
public static AnnotationOutputter.Options GetOptions(StanfordCoreNLP pipeline)
{
    // Mirror the pipeline's printing configuration onto a fresh Options instance.
    var options = new AnnotationOutputter.Options();
    options.encoding = pipeline.GetEncoding();
    options.pretty = pipeline.GetPrettyPrint();
    options.includeText = pipeline.GetIncludeText();
    options.printSingletons = pipeline.GetPrintSingletons();
    options.constituentTreePrinter = pipeline.GetConstituentTreePrinter();
    // Both beam settings come from the same pipeline option.
    options.relationsBeam = pipeline.GetBeamPrintingOption();
    options.beamPrintingOption = pipeline.GetBeamPrintingOption();
    return options;
}
/// <summary>
/// Annotates the raw text and folds every sentence's analysis into a single
/// ProcessedEntity via ProcessedEntity.Union.
/// </summary>
public static ProcessedEntity Classify(this StanfordCoreNLP nlp, string source)
{
    var document = new CoreDocument(source);
    nlp.annotate(document);

    var sentences = document.sentences().toArray().OfType<CoreSentence>();
    return sentences
        .Select(sentence => new ParsedSentence(sentence))
        .Aggregate(new ProcessedEntity(), (accumulated, parsed) => ProcessedEntity.Union(accumulated, parsed.ToProcessedEntity()));
}
/// <summary>
/// Configures a tokenize→parse CoreNLP pipeline with explicit model paths
/// under <c>modelsDir</c> and annotates the instance's <c>text</c>.
/// </summary>
/// <returns>The annotated text.</returns>
private Annotation PrepareAnnotation()
{
    var props = new Properties();

    // Explicit model locations, all rooted at modelsDir.
    // (setProperty and put are equivalent for string keys/values.)
    props.setProperty("pos.model", modelsDir + "pos-tagger/english-left3words/english-left3words-distsim.tagger");
    props.setProperty("ner.model", modelsDir + "ner/english.conll.4class.distsim.crf.ser.gz");
    props.setProperty("parse.model", modelsDir + "lexparser/englishPCFG.ser.gz");
    props.setProperty("sutime.rules", modelsDir + "sutime/defs.sutime.txt, " + modelsDir + "sutime/english.sutime.txt");

    // Annotator chain; SUTime is disabled for NER.
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse");
    props.setProperty("sutime.binders", "0");
    props.setProperty("ner.useSUTime", "0");

    var pipeline = new StanfordCoreNLP(props);
    var annotatedText = new Annotation(text);
    pipeline.annotate(annotatedText);
    return annotatedText;
}
/// <summary>
/// Creates the StanfordCoreNLP instance, which annotates chunks of text. We then use the
/// annotations to perform our analysis. The pipeline can be configured with various annotators;
/// we use only the ones we need, for perf reasons.
/// </summary>
/// <returns>A configured pipeline instance.</returns>
public StanfordCoreNLP GetPipeline()
{
    var modelFolder = ConfigurationManager.AppSettings["CoreNLP.ModelDirectory"];
    ValidateModelFolder(modelFolder);

    // Switch the current directory momentarily so the pipeline can find the models.
    // Wish there was a better way. The try/finally guarantees the original
    // directory is restored even if pipeline construction throws — previously
    // an exception here left the process in the model folder.
    var currentDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(modelFolder);
    try
    {
        var props = GetPipelineProperties();
        return new StanfordCoreNLP(props);
    }
    finally
    {
        Directory.SetCurrentDirectory(currentDir);
    }
}
/// <summary>
/// Builds the tokenize→ner annotation pipeline, loading models from a
/// hard-coded local extraction of `stanford-corenlp-3.6.0-models.jar`.
/// </summary>
public StanfordLemmatizer()
{
    // Path to the folder with models extracted from `stanford-corenlp-3.6.0-models.jar`
    var jarRoot = @"C:\Work\NLP\Stanford\stanford-corenlp-full-2015-12-09\stanford-corenlp-3.6.0-models";
    _separator = Guid.NewGuid().ToString();

    // Annotation pipeline configuration
    var props = new Properties();
    //props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, ner,dcoref");
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, ner");
    props.setProperty("ner.useSUTime", "0");

    // We should change current directory, so StanfordCoreNLP could find all the
    // model files automatically. The try/finally guarantees the original
    // directory is restored even if model loading throws — previously an
    // exception here left the process's current directory changed.
    var curDir = Environment.CurrentDirectory;
    Directory.SetCurrentDirectory(jarRoot);
    try
    {
        _pipeline = new StanfordCoreNLP(props);
    }
    finally
    {
        Directory.SetCurrentDirectory(curDir);
    }
}
/// <summary>
/// Builds the static CoreNLP pipeline (tokenize through dcoref), optionally
/// overriding the model location first.
/// </summary>
/// <param name="modelLocation">Optional model folder override; ignored when null/empty.</param>
public static void Start(string modelLocation = null)
{
    var originalDirectory = Environment.CurrentDirectory;

    if (!string.IsNullOrEmpty(modelLocation))
    {
        _modelLocation = modelLocation;
    }

    try
    {
        // Annotation pipeline configuration; SUTime is disabled.
        var props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        props.setProperty("sutime.binders", "false");
        props.setProperty("ner.useSUTime", "false");

        // CoreNLP resolves model files relative to the current directory.
        // NOTE(review): this maps the ModelLocation property — presumably it
        // reflects the _modelLocation field set above; verify.
        Directory.SetCurrentDirectory(HostingEnvironment.MapPath(ModelLocation));
        pipeline = new StanfordCoreNLP(props);
    }
    finally
    {
        // Always restore the process's original directory.
        Directory.SetCurrentDirectory(originalDirectory);
    }
}
/// <summary>
/// Initializes this instance: lazily builds the shared CoreNLP pipeline and
/// loads the list of relevant POS tags from configuration.
/// </summary>
private void Initialize()
{
    if (pipeline == null)
    {
        // Todo: How to get this ourselves
        //var jarRoot = @"C:\Users\karlbuha\Documents\Visual Studio 2012\Projects\ServiceMe\RestServiceV1\NLPModules\";
        var jarRoot = ConfigurationManager.AppSettings["NlpModulePath"];
        //var jarRoot = @"F:\sitesroot\0\bin\NlpModules\";

        // Annotation pipeline configuration
        var props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma");
        props.setProperty("sutime.binders", "0");
        props.setProperty("ner.useSUTime", "false");

        // We should change current directory, so StanfordCoreNLP could find all
        // the model files automatically. The try/finally guarantees the original
        // directory is restored even if model loading throws — previously an
        // exception here left the process's current directory changed.
        var curDir = Environment.CurrentDirectory;
        Directory.SetCurrentDirectory(jarRoot);
        try
        {
            NlpProvider.pipeline = new StanfordCoreNLP(props);
        }
        finally
        {
            Directory.SetCurrentDirectory(curDir);
        }

        // Comma-separated POS whitelist from configuration.
        NlpProvider.relevantPos = ConfigurationManager.AppSettings["NlpFos"]
            .Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries)
            .ToList();
    }
}
// Analyzes the sentence with CoreNLP and returns the extracted keys.
private List<string> nlp(string sentence)
{
    List<string> return_key = new List<string>();
    string Relay_file = ".\\xml";
    string Relay_name = "Relay.xml";
    string Relay_path = Relay_file + "\\" + Relay_name;

    // Path to the folder with models extracted from `stanford-corenlp-3.5.2-models.jar`
    var jarRoot = @"stanford-corenlp-3.5.2-models\";

    // Annotation pipeline configuration; SUTime is disabled.
    var props = new java.util.Properties();
    props.setProperty("ner.useSUTime", "false");
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    props.setProperty("sutime.binders", "0");

    // CoreNLP resolves model files relative to the current directory. The
    // try/finally guarantees the original directory is restored even if model
    // loading throws — previously an exception here left the directory changed.
    var curDir = Environment.CurrentDirectory;
    System.IO.Directory.SetCurrentDirectory(jarRoot);
    StanfordCoreNLP pipeline;
    try
    {
        pipeline = new StanfordCoreNLP(props);
    }
    finally
    {
        System.IO.Directory.SetCurrentDirectory(curDir);
    }

    // Annotate the input sentence.
    var annotation = new Annotation(sentence);
    pipeline.annotate(annotation);

    // Write the analysis result to Relay.xml. Closing in finally fixes a
    // stream leak when xmlPrint throws.
    FileOutputStream os = new FileOutputStream(new File(Relay_file, Relay_name));
    try
    {
        pipeline.xmlPrint(annotation, os);
    }
    finally
    {
        os.close();
    }

    // Let ner() assemble individual words into meaningful keys.
    foreach (string k in ner(Relay_path))
    {
        return_key.Add(k);
    }
    return return_key;
}