// static demo class
/// <summary>
/// Command-line demo: compiles every line of a rules file into a
/// <c>TokenSequencePattern</c>, annotates the contents of a text file, and prints
/// the tokens of each sentence followed by every non-overlapping match and its groups.
/// </summary>
/// <param name="args">
/// <c>args[0]</c> is the rules file (one pattern per line), <c>args[1]</c> is the file
/// whose text is annotated, and the optional <c>args[2]</c> is the output file.
/// Without <c>args[2]</c> the report is written to standard output.
/// </param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    if (args.Length < 2)
    {
        System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
        return;
    }
    string rules = args[0];
    PrintWriter @out;
    if (args.Length > 2)
    {
        @out = new PrintWriter(args[2]);
    }
    else
    {
        @out = new PrintWriter(System.Console.Out);
    }
    try
    {
        StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
        Annotation annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
        pipeline.Annotate(annotation);
        // Load lines of file as TokenSequencePatterns
        IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
        foreach (string line in ObjectBank.GetLineIterator(rules))
        {
            TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
            tokenSequencePatterns.Add(pattern);
        }
        // The matcher depends only on the pattern list, so build it once instead of
        // once per sentence.
        MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
        IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
        int i = 0;
        foreach (ICoreMap sentence in sentences)
        {
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            @out.Println("Sentence #" + ++i);
            @out.Print(" Tokens:");
            foreach (CoreLabel token in tokens)
            {
                @out.Print(' ');
                @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
            }
            @out.Println();
            IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
            int j = 0;
            foreach (ISequenceMatchResult<ICoreMap> matched in answers)
            {
                @out.Println(" Match #" + ++j);
                // Group 0 is printed as well, hence the inclusive upper bound.
                for (int k = 0; k <= matched.GroupCount(); k++)
                {
                    @out.Println(" group " + k + " = " + matched.Group(k));
                }
            }
        }
    }
    finally
    {
        // Flush even when annotation or matching fails so already-written output is not lost.
        @out.Flush();
    }
}
// static main only
/// <summary>
/// Self-contained demo: annotates a fixed two-sentence text, compiles two hard-coded
/// patterns that capture the named groups <c>$who</c> and <c>$age</c>, and prints each
/// sentence's tokens followed by every non-overlapping match to standard output.
/// </summary>
/// <param name="args">Not read by this method.</param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    StanfordCoreNLP nlp = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
    Annotation document = new Annotation("Casey is 21. Sally Atkinson's age is 30.");
    nlp.Annotate(document);
    IList<ICoreMap> sentenceList = document.Get(typeof(CoreAnnotations.SentencesAnnotation));

    // Compile the textual rules in declaration order.
    string[] patternSources =
    {
        "(?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )",
        "(?$who [ ner: PERSON]+ ) /'s/ /age/ /is/ (?$age [ pos: CD ] )"
    };
    IList<TokenSequencePattern> compiledPatterns = new List<TokenSequencePattern>();
    for (int p = 0; p < patternSources.Length; p++)
    {
        compiledPatterns.Add(TokenSequencePattern.Compile(patternSources[p]));
    }
    MultiPatternMatcher<ICoreMap> matcher = TokenSequencePattern.GetMultiPatternMatcher(compiledPatterns);

    System.IO.TextWriter stdout = System.Console.Out;
    int sentenceNumber = 0;
    foreach (ICoreMap currentSentence in sentenceList)
    {
        sentenceNumber++;
        IList<CoreLabel> sentenceTokens = currentSentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        stdout.WriteLine("Sentence #" + sentenceNumber);
        stdout.Write(" Tokens:");
        foreach (CoreLabel label in sentenceTokens)
        {
            stdout.Write(' ');
            stdout.Write(label.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
        }
        stdout.WriteLine();

        int matchNumber = 0;
        foreach (ISequenceMatchResult<ICoreMap> hit in matcher.FindNonOverlapping(sentenceTokens))
        {
            matchNumber++;
            stdout.WriteLine(" Match #" + matchNumber);
            stdout.WriteLine(" match: " + hit.Group(0));
            stdout.WriteLine(" who: " + hit.Group("$who"));
            stdout.WriteLine(" age: " + hit.Group("$age"));
        }
    }
}
/// <summary>
/// Applies a single learned pattern to the given sentences, splitting the sentence ids
/// across worker tasks, and merges every task's result into the supplied accumulators.
/// </summary>
/// <param name="sents">Sentences keyed by sentence id.</param>
/// <param name="label">Label the pattern is applied for; also selects the entry of <c>constVars.env</c> used for compilation.</param>
/// <param name="pattern">Pattern to compile and apply.</param>
/// <param name="wordsandLemmaPatExtracted">Receives the first component of every task result.</param>
/// <param name="matchedTokensByPat">Receives the second component of every task result.</param>
/// <param name="alreadyLabeledWords">Receives the third component of every task result.</param>
/// <exception cref="NotSupportedException">If <c>constVars.patternType</c> is neither Surface nor Dep.</exception>
/// <exception cref="InvalidOperationException">If a worker task fails or its result cannot be merged; the cause is kept as the inner exception.</exception>
private void RunParallelApplyPats(IDictionary<string, DataInstance> sents, string label, E pattern, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat, ICollection<CandidatePhrase> alreadyLabeledWords)
{
    Redwood.Log(Redwood.Dbg, "Applying pattern " + pattern + " to a total of " + sents.Count + " sentences ");
    IList<string> notAllowedClasses = new List<string>();
    IList<string> sentids = CollectionUtils.ToList(sents.Keys);
    if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass)
    {
        foreach (string l in constVars.GetAnswerClass().Keys)
        {
            if (!l.Equals(label))
            {
                notAllowedClasses.Add(l);
            }
        }
        notAllowedClasses.Add("OTHERSEM");
    }
    IDictionary<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
    IDictionary<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;
    if (constVars.patternType.Equals(PatternFactory.PatternType.Surface))
    {
        surfacePatternsLearnedThisIterConverted = new Dictionary<TokenSequencePattern, E>();
        string patternStr = null;
        try
        {
            patternStr = pattern.ToString(notAllowedClasses);
            TokenSequencePattern pat = ((TokenSequencePattern)TokenSequencePattern.Compile(constVars.env[label], patternStr));
            surfacePatternsLearnedThisIterConverted[pat] = pattern;
        }
        catch (Exception)
        {
            log.Info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
            throw;
        }
    }
    else if (constVars.patternType.Equals(PatternFactory.PatternType.Dep))
    {
        depPatternsLearnedThisIterConverted = new Dictionary<SemgrexPattern, E>();
        SemgrexPattern pat = SemgrexPattern.Compile(pattern.ToString(notAllowedClasses), new Env(constVars.env[label].GetVariables()));
        depPatternsLearnedThisIterConverted[pat] = pattern;
    }
    else
    {
        throw new NotSupportedException();
    }

    //Apply the patterns and extract candidate phrases
    int numThreads = constVars.numThreads;
    //If number of sentences is less, do not create so many threads
    // (a non-positive configured thread count also falls back to a single worker)
    if (sents.Count < 50 || numThreads < 1)
    {
        numThreads = 1;
    }
    // Ceiling division: every sentence id lands in exactly one slice. The previous
    // "Count / (numThreads - 1)" scheme handed the complete id list to both workers when
    // numThreads == 2 and produced only empty slices when numThreads - 1 > Count.
    int num = (sents.Count + numThreads - 1) / numThreads;
    // Size the pool by the number of tasks actually submitted.
    IExecutorService executor = Executors.NewFixedThreadPool(numThreads);
    bool completed = false;
    try
    {
        IList<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>> list = new List<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>>();
        for (int i = 0; i < numThreads; i++)
        {
            // Clamp both ends so trailing workers receive an empty slice rather than an
            // out-of-range one.
            int from = Math.Min(sentids.Count, i * num);
            int to = Math.Min(sentids.Count, (i + 1) * num);
            ICallable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> task = null;
            if (pattern.type.Equals(PatternFactory.PatternType.Surface))
            {
                task = new ApplyPatterns(sents, numThreads == 1 ? sentids : sentids.SubList(from, to), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
            }
            else
            {
                task = new ApplyDepPatterns(sents, numThreads == 1 ? sentids : sentids.SubList(from, to), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords, constVars);
            }
            IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> submit = executor.Submit(task);
            list.Add(submit);
        }

        // Now retrieve the result
        foreach (IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> future in list)
        {
            try
            {
                Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> result = future.Get();
                Redwood.Log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.First());
                wordsandLemmaPatExtracted.AddAll(result.First());
                matchedTokensByPat.AddAll(result.Second());
                Sharpen.Collections.AddAll(alreadyLabeledWords, result.Third());
            }
            catch (Exception e)
            {
                // Keep the original failure as the inner exception; the executor is
                // stopped in the finally block below.
                throw new InvalidOperationException("Failed to apply pattern " + pattern, e);
            }
        }
        completed = true;
    }
    finally
    {
        // Orderly shutdown on success; on any failure (including one raised while
        // building or submitting tasks) stop outstanding work immediately.
        if (completed)
        {
            executor.Shutdown();
        }
        else
        {
            executor.ShutdownNow();
        }
    }
}
/// <summary>
/// Builds the annotator from configuration properties.
/// </summary>
/// <remarks>
/// Three configurations are distinguished, in this order:
/// the newline-splitter property (<c>StanfordCoreNLP.NewlineSplitterProperty</c>) is true;
/// otherwise <c>ssplit.isOneSentence</c> is true;
/// otherwise the processor is built from the remaining <c>ssplit.*</c> properties.
/// Boolean properties are read with the Java rule: only a case-insensitive "true" is true,
/// and a missing property is false.
/// </remarks>
/// <param name="properties">Configuration to read the settings from.</param>
public WordsToSentencesAnnotator(Properties properties)
{
    bool nlSplitting = IsTrueProperty(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
    if (nlSplitting)
    {
        bool whitespaceTokenization = IsTrueProperty(properties.GetProperty("tokenize.whitespace", "false"));
        string[] lineBoundaryTokens;
        if (!whitespaceTokenization)
        {
            lineBoundaryTokens = new string[] { PTBTokenizer.GetNewlineToken() };
        }
        else if (Runtime.LineSeparator().Equals("\n"))
        {
            lineBoundaryTokens = new string[] { "\n" };
        }
        else
        {
            // throw "\n" in just in case files use that instead of
            // the system separator
            lineBoundaryTokens = new string[] { Runtime.LineSeparator(), "\n" };
        }
        // this constructor will keep empty lines as empty sentences
        this.countLineNumbers = true;
        this.wts = new WordToSentenceProcessor<CoreLabel>(ArrayUtils.AsImmutableSet(lineBoundaryTokens));
    }
    else if (IsTrueProperty(properties.GetProperty("ssplit.isOneSentence")))
    {
        // A missing property is false here. bool.Parse would throw on null instead.
        // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
        this.countLineNumbers = false;
        this.wts = new WordToSentenceProcessor<CoreLabel>(true);
    }
    else
    {
        // multi token sentence boundaries
        string boundaryMultiTokenRegex = properties.GetProperty("ssplit.boundaryMultiTokenRegex");
        // Discard these tokens without marking them as sentence boundaries
        ICollection<string> tokenRegexesToDiscard = SplitCommaSeparatedProperty(properties.GetProperty("ssplit.tokenPatternsToDiscard"));
        // regular boundaries
        string boundaryTokenRegex = properties.GetProperty("ssplit.boundaryTokenRegex");
        string boundaryFollowersRegex = properties.GetProperty("ssplit.boundaryFollowersRegex");
        // newline boundaries which are discarded.
        ICollection<string> boundariesToDiscard = SplitCommaSeparatedProperty(properties.GetProperty("ssplit.boundariesToDiscard"));
        // HTML boundaries which are discarded
        ICollection<string> htmlElementsToDiscard = SplitCommaSeparatedProperty(properties.GetProperty("ssplit.htmlBoundariesToDiscard"));
        string nlsb = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);
        this.countLineNumbers = false;
        this.wts = new WordToSentenceProcessor<CoreLabel>(
            boundaryTokenRegex,
            boundaryFollowersRegex,
            boundariesToDiscard,
            htmlElementsToDiscard,
            WordToSentenceProcessor.StringToNewlineIsSentenceBreak(nlsb),
            (boundaryMultiTokenRegex != null) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null,
            tokenRegexesToDiscard);
    }
    Verbose = IsTrueProperty(properties.GetProperty("ssplit.verbose", "false"));
}

/// <summary>
/// Returns true only when <paramref name="value"/> equals "true" ignoring case.
/// Null and any other text yield false; this method never throws.
/// </summary>
private static bool IsTrueProperty(string value)
{
    return string.Equals(value, "true", System.StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Splits a comma-separated property value into a set of its parts.
/// Returns null when the property is not set.
/// </summary>
private static ICollection<string> SplitCommaSeparatedProperty(string value)
{
    if (value == null)
    {
        return null;
    }
    return Generics.NewHashSet(Arrays.AsList(value.Split(",")));
}
/// <summary>
/// Creates the annotator from explicit boundary settings. It wraps them in a
/// <c>WordToSentenceProcessor</c> and delegates to the
/// <c>(verbose, false, processor)</c> constructor.
/// </summary>
/// <param name="verbose">Forwarded unchanged to the delegated constructor.</param>
/// <param name="boundaryTokenRegex">First argument of the processor; the processor's second argument is always null here.</param>
/// <param name="boundaryToDiscard">Forwarded unchanged to the processor.</param>
/// <param name="htmlElementsToDiscard">Forwarded unchanged to the processor.</param>
/// <param name="newlineIsSentenceBreak">Converted with <c>WordToSentenceProcessor.StringToNewlineIsSentenceBreak</c> before being passed on.</param>
/// <param name="boundaryMultiTokenRegex">Compiled into a <c>TokenSequencePattern</c> when non-null; otherwise null is passed.</param>
/// <param name="tokenRegexesToDiscard">Forwarded unchanged to the processor.</param>
public WordsToSentencesAnnotator(
    bool verbose,
    string boundaryTokenRegex,
    ICollection<string> boundaryToDiscard,
    ICollection<string> htmlElementsToDiscard,
    string newlineIsSentenceBreak,
    string boundaryMultiTokenRegex,
    ICollection<string> tokenRegexesToDiscard)
    : this(
        verbose,
        // NOTE(review): presumably the countLineNumbers flag, by analogy with the
        // Properties-based constructor - confirm against the delegated constructor.
        false,
        new WordToSentenceProcessor<CoreLabel>(
            boundaryTokenRegex,
            null,
            boundaryToDiscard,
            htmlElementsToDiscard,
            WordToSentenceProcessor.StringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
            boundaryMultiTokenRegex == null ? null : TokenSequencePattern.Compile(boundaryMultiTokenRegex),
            tokenRegexesToDiscard))
{
}