Example #1
        // static demo class
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
                return;
            }
            string      rules = args[0];
            PrintWriter @out;

            if (args.Length > 2)
            {
                @out = new PrintWriter(args[2]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            StanfordCoreNLP pipeline   = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
            Annotation      annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));

            pipeline.Annotate(annotation);
            // Load lines of file as TokenSequencePatterns
            IList <TokenSequencePattern> tokenSequencePatterns = new List <TokenSequencePattern>();

            foreach (string line in ObjectBank.GetLineIterator(rules))
            {
                TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
                tokenSequencePatterns.Add(pattern);
            }
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            int i = 0;

            foreach (ICoreMap sentence in sentences)
            {
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                @out.Println("Sentence #" + ++i);
                @out.Print("  Tokens:");
                foreach (CoreLabel token in tokens)
                {
                    @out.Print(' ');
                    @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
                }
                @out.Println();
                MultiPatternMatcher <ICoreMap>           multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
                IList <ISequenceMatchResult <ICoreMap> > answers      = multiMatcher.FindNonOverlapping(tokens);
                int j = 0;
                foreach (ISequenceMatchResult <ICoreMap> matched in answers)
                {
                    @out.Println("  Match #" + ++j);
                    for (int k = 0; k <= matched.GroupCount(); k++)
                    {
                        @out.Println("    group " + k + " = " + matched.Group(k));
                    }
                }
            }
            @out.Flush();
        }
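In Example #1 the rules file passed as args[0] is read with ObjectBank.GetLineIterator and every line is compiled into its own TokenSequencePattern, while args[1] is the text file to annotate. As a purely illustrative rules file for this demo, the two age patterns hard-coded in Example #2 below could simply be listed one per line:

            (?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )
            (?$who [ ner: PERSON]+ ) /'s/ /age/ /is/ (?$age [ pos: CD ] )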
Example #2
        // static main only
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            StanfordCoreNLP pipeline   = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
            Annotation      annotation = new Annotation("Casey is 21. Sally Atkinson's age is 30.");

            pipeline.Annotate(annotation);
            IList <ICoreMap>             sentences             = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <TokenSequencePattern> tokenSequencePatterns = new List <TokenSequencePattern>();

            string[] patterns = new string[] { "(?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )", "(?$who [ ner: PERSON]+ ) /'s/ /age/ /is/ (?$age [ pos: CD ] )" };
            foreach (string line in patterns)
            {
                TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
                tokenSequencePatterns.Add(pattern);
            }
            MultiPatternMatcher <ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
            int i = 0;

            foreach (ICoreMap sentence in sentences)
            {
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                System.Console.Out.WriteLine("Sentence #" + ++i);
                System.Console.Out.Write("  Tokens:");
                foreach (CoreLabel token in tokens)
                {
                    System.Console.Out.Write(' ');
                    System.Console.Out.Write(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
                }
                System.Console.Out.WriteLine();
                IList <ISequenceMatchResult <ICoreMap> > answers = multiMatcher.FindNonOverlapping(tokens);
                int j = 0;
                foreach (ISequenceMatchResult <ICoreMap> matched in answers)
                {
                    System.Console.Out.WriteLine("  Match #" + ++j);
                    System.Console.Out.WriteLine("    match: " + matched.Group(0));
                    System.Console.Out.WriteLine("      who: " + matched.Group("$who"));
                    System.Console.Out.WriteLine("      age: " + matched.Group("$age"));
                }
            }
        }
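Example #2 routes everything through a MultiPatternMatcher built from several compiled patterns. The sketch below is not part of the original listing; it shows the single-pattern route instead, and it assumes the annotated `sentences` list built in Example #2 plus a TokenSequencePattern.GetMatcher / TokenSequenceMatcher.Find / Group surface that mirrors the Java TokensRegex API under the same PascalCase conversion used throughout this port.

            // Hedged sketch: match one compiled pattern at a time instead of building a
            // MultiPatternMatcher. GetMatcher/Find/Group are assumed to mirror the Java
            // TokensRegex API; `sentences` is the list produced in Example #2 above.
            TokenSequencePattern agePattern = TokenSequencePattern.Compile("(?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )");

            foreach (ICoreMap sentence in sentences)
            {
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                TokenSequenceMatcher matcher = agePattern.GetMatcher(tokens);
                while (matcher.Find())
                {
                    System.Console.Out.WriteLine("match: " + matcher.Group());
                    System.Console.Out.WriteLine("  who: " + matcher.Group("$who"));
                    System.Console.Out.WriteLine("  age: " + matcher.Group("$age"));
                }
            }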
Example #3
        private void RunParallelApplyPats(IDictionary <string, DataInstance> sents, string label, E pattern,
                                          TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted,
                                          CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat,
                                          ICollection <CandidatePhrase> alreadyLabeledWords)
        {
            Redwood.Log(Redwood.Dbg, "Applying pattern " + pattern + " to a total of " + sents.Count + " sentences ");
            IList <string> notAllowedClasses = new List <string>();
            IList <string> sentids           = CollectionUtils.ToList(sents.Keys);

            if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass)
            {
                foreach (string l in constVars.GetAnswerClass().Keys)
                {
                    if (!l.Equals(label))
                    {
                        notAllowedClasses.Add(l);
                    }
                }
                notAllowedClasses.Add("OTHERSEM");
            }
            IDictionary <TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
            IDictionary <SemgrexPattern, E>       depPatternsLearnedThisIterConverted     = null;

            if (constVars.patternType.Equals(PatternFactory.PatternType.Surface))
            {
                surfacePatternsLearnedThisIterConverted = new Dictionary <TokenSequencePattern, E>();
                string patternStr = null;
                try
                {
                    patternStr = pattern.ToString(notAllowedClasses);
                    TokenSequencePattern pat = ((TokenSequencePattern)TokenSequencePattern.Compile(constVars.env[label], patternStr));
                    surfacePatternsLearnedThisIterConverted[pat] = pattern;
                }
                catch (Exception e)
                {
                    log.Info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
                    throw;
                }
            }
            else
            {
                if (constVars.patternType.Equals(PatternFactory.PatternType.Dep))
                {
                    depPatternsLearnedThisIterConverted = new Dictionary <SemgrexPattern, E>();
                    SemgrexPattern pat = SemgrexPattern.Compile(pattern.ToString(notAllowedClasses), new Env(constVars.env[label].GetVariables()));
                    depPatternsLearnedThisIterConverted[pat] = pattern;
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            //Apply the patterns and extract candidate phrases
            int num;
            int numThreads = constVars.numThreads;

            // If the number of sentences is small, do not create that many threads
            if (sents.Count < 50)
            {
                numThreads = 1;
            }
            if (numThreads == 1)
            {
                num = sents.Count;
            }
            else
            {
                num = sents.Count / (numThreads - 1);
            }
            IExecutorService executor = Executors.NewFixedThreadPool(constVars.numThreads);
            IList <IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > > list =
                new List <IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > >();

            for (int i = 0; i < numThreads; i++)
            {
                ICallable <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > task = null;
                if (pattern.type.Equals(PatternFactory.PatternType.Surface))
                {
                    //Redwood.log(Redwood.DBG, "Applying pats: assigning sentences " + i*num + " to " +Math.min(sentids.size(), (i + 1) * num) + " to thread " + (i+1));
                    task = new ApplyPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                             , constVars);
                }
                else
                {
                    task = new ApplyDepPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                                , constVars);
                }
                IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > submit = executor.Submit(task);
                list.Add(submit);
            }
            // Now retrieve the result
            foreach (IFuture <Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > > future in list)
            {
                try
                {
                    Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > result = future.Get();
                    Redwood.Log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.First());
                    wordsandLemmaPatExtracted.AddAll(result.First());
                    matchedTokensByPat.AddAll(result.Second());
                    Sharpen.Collections.AddAll(alreadyLabeledWords, result.Third());
                }
                catch (Exception e)
                {
                    executor.ShutdownNow();
                    throw new Exception(e.Message, e);
                }
            }
            executor.Shutdown();
        }
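Example #3 compiles each learned surface pattern against a per-label environment (constVars.env[label]), which is what lets the pattern string refer to names defined outside the pattern itself. A minimal, hypothetical sketch of building such an environment by hand is shown below; GetNewEnv and Bind are assumed to be the PascalCase counterparts of the Java Env API, and the numtype binding with its NUMBER pattern is illustrative only.

            // Hedged sketch: compile a pattern against a hand-built environment.
            // TokenSequencePattern.GetNewEnv() and Env.Bind(...) are assumed to mirror
            // the Java TokensRegex API; the numtype binding is purely illustrative.
            Env env = TokenSequencePattern.GetNewEnv();
            env.Bind("numtype", typeof(CoreAnnotations.NumericTypeAnnotation));
            TokenSequencePattern numberPattern = TokenSequencePattern.Compile(env, "[ numtype: NUMBER ]+");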
Example #4
        public WordsToSentencesAnnotator(Properties properties)
        {
            bool nlSplitting = bool.Parse(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));

            if (nlSplitting)
            {
                bool whitespaceTokenization = bool.Parse(properties.GetProperty("tokenize.whitespace", "false"));
                if (whitespaceTokenization)
                {
                    if (Runtime.LineSeparator().Equals("\n"))
                    {
                        // this constructor will keep empty lines as empty sentences
                        WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { "\n" }));
                        this.countLineNumbers = true;
                        this.wts = wts1;
                    }
                    else
                    {
                        // throw "\n" in just in case files use that instead of
                        // the system separator
                        // this constructor will keep empty lines as empty sentences
                        WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { Runtime.LineSeparator(), "\n" }));
                        this.countLineNumbers = true;
                        this.wts = wts1;
                    }
                }
                else
                {
                    // this constructor will keep empty lines as empty sentences
                    WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { PTBTokenizer.GetNewlineToken() }));
                    this.countLineNumbers = true;
                    this.wts = wts1;
                }
            }
            else
            {
                string isOneSentence = properties.GetProperty("ssplit.isOneSentence");
                // a missing or unparseable property is treated as false
                if (bool.TryParse(isOneSentence, out bool oneSentence) && oneSentence)
                {
                    // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
                    WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(true);
                    this.countLineNumbers = false;
                    this.wts = wts1;
                }
                else
                {
                    // multi token sentence boundaries
                    string boundaryMultiTokenRegex = properties.GetProperty("ssplit.boundaryMultiTokenRegex");
                    // Discard these tokens without marking them as sentence boundaries
                    string tokenPatternsToDiscardProp          = properties.GetProperty("ssplit.tokenPatternsToDiscard");
                    ICollection <string> tokenRegexesToDiscard = null;
                    if (tokenPatternsToDiscardProp != null)
                    {
                        string[] toks = tokenPatternsToDiscardProp.Split(",");
                        tokenRegexesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
                    }
                    // regular boundaries
                    string boundaryTokenRegex     = properties.GetProperty("ssplit.boundaryTokenRegex");
                    string boundaryFollowersRegex = properties.GetProperty("ssplit.boundaryFollowersRegex");
                    // newline boundaries which are discarded.
                    ICollection <string> boundariesToDiscard = null;
                    string bounds = properties.GetProperty("ssplit.boundariesToDiscard");
                    if (bounds != null)
                    {
                        string[] toks = bounds.Split(",");
                        boundariesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
                    }
                    ICollection <string> htmlElementsToDiscard = null;
                    // HTML boundaries which are discarded
                    bounds = properties.GetProperty("ssplit.htmlBoundariesToDiscard");
                    if (bounds != null)
                    {
                        string[] elements = bounds.Split(",");
                        htmlElementsToDiscard = Generics.NewHashSet(Arrays.AsList(elements));
                    }
                    string nlsb = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);
                    this.countLineNumbers = false;
                    this.wts = new WordToSentenceProcessor <CoreLabel>(boundaryTokenRegex, boundaryFollowersRegex, boundariesToDiscard, htmlElementsToDiscard,
                                                                       WordToSentenceProcessor.StringToNewlineIsSentenceBreak(nlsb),
                                                                       (boundaryMultiTokenRegex != null) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null,
                                                                       tokenRegexesToDiscard);
                }
            }
            Verbose = bool.Parse(properties.GetProperty("ssplit.verbose", "false"));
        }
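The branch above that passes ssplit.boundaryMultiTokenRegex through TokenSequencePattern.Compile is driven entirely by properties. A small, illustrative way to reach it might look like the following; the property keys are the ones read by this constructor, but the regex values are made up for the sketch.

            // Hedged sketch: properties that steer this constructor into the branch that
            // compiles a multi-token sentence-boundary pattern. The keys are the ones
            // read above; the values are illustrative only.
            Properties props = PropertiesUtils.AsProperties(
                "ssplit.boundaryTokenRegex", "\\.|[!?]+",
                "ssplit.boundaryMultiTokenRegex", "/NEW/ /SECTION/",
                "ssplit.htmlBoundariesToDiscard", "p,br");
            WordsToSentencesAnnotator ssplit = new WordsToSentencesAnnotator(props);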
Example #5
 public WordsToSentencesAnnotator(bool verbose, string boundaryTokenRegex, ICollection <string> boundaryToDiscard, ICollection <string> htmlElementsToDiscard,
                                  string newlineIsSentenceBreak, string boundaryMultiTokenRegex, ICollection <string> tokenRegexesToDiscard)
     : this(verbose, false, new WordToSentenceProcessor <CoreLabel>(boundaryTokenRegex, null, boundaryToDiscard, htmlElementsToDiscard,
                                                                    WordToSentenceProcessor.StringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
                                                                    (boundaryMultiTokenRegex != null) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null,
                                                                    tokenRegexesToDiscard))
 {
 }