// Initializes the TokensRegex environment used for quantity/unit extraction:
// configures matching defaults, loads unit and prefix definitions from the
// configured files, and binds the options plus numeric-composite annotation
// keys so rule files can refer to them by name.
private void InitEnv()
 {
     env = TokenSequencePattern.GetNewEnv();
     // Patterns match over numerized tokens (numeric words already collapsed)
     env.SetDefaultTokensAnnotationKey(typeof(CoreAnnotations.NumerizedTokensAnnotation));
     // Do case insensitive matching
     env.SetDefaultStringMatchFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
     env.SetDefaultStringPatternFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
     try
     {
         Units.RegisterUnits(env, options.unitsFilename);
     }
     catch (IOException ex)
     {
         // Include the failing filename so misconfiguration is easy to diagnose
         throw new Exception("Error loading units from " + options.unitsFilename, ex);
     }
     try
     {
         UnitPrefix.RegisterPrefixes(env, options.prefixFilename);
     }
     catch (IOException ex)
     {
         throw new Exception("Error loading prefixes from " + options.prefixFilename, ex);
     }
     // Names usable from within rule files
     env.Bind("options", options);
     env.Bind("numcomptype", typeof(CoreAnnotations.NumericCompositeTypeAnnotation));
     env.Bind("numcompvalue", typeof(CoreAnnotations.NumericCompositeValueAnnotation));
 }
        /// <summary>
        /// Creates a <see cref="TokenSequencePattern"/> from the accumulated
        /// sequence, then resets this builder so it can be reused.
        /// </summary>
        public TokenSequencePattern Build()
        {
            TokenSequencePattern built = new TokenSequencePattern(Seq);
            Reset();
            return built;
        }
 /// <summary>
 /// Builds one CoreMapExpressionExtractor per KBP relation type by loading
 /// "&lt;relation&gt;.rules" (plus the shared "defs.rules") from the given
 /// directory. Relations without a rules file on the classpath/filesystem
 /// are silently skipped.
 /// </summary>
 public KBPTokensregexExtractor(string tokensregexDir, bool verbose)
 {
     if (verbose)
     {
         logger.Log("Creating TokensRegexExtractor");
     }
     // Create extractors
     foreach (KBPRelationExtractor.RelationType rel in KBPRelationExtractor.RelationType.Values())
     {
         // Sanitize the relation name so it forms a valid file name
         // (":" -> "_", "/" -> "SLASH")
         string relFileNameComponent = rel.canonicalName.ReplaceAll(":", "_");
         string path = tokensregexDir + File.separator + relFileNameComponent.ReplaceAll("/", "SLASH") + ".rules";
         if (IOUtils.ExistsInClasspathOrFileSystem(path))
         {
             // Shared definitions are loaded before the relation-specific rules
             IList <string> listFiles = new List <string>();
             listFiles.Add(tokensregexDir + File.separator + "defs.rules");
             listFiles.Add(path);
             if (verbose)
             {
                 logger.Log("Rule files for relation " + rel + " is " + path);
             }
             Env env = TokenSequencePattern.GetNewEnv();
             env.Bind("collapseExtractionRules", true);
             env.Bind("verbose", verbose);
             CoreMapExpressionExtractor extr = CoreMapExpressionExtractor.CreateExtractorFromFiles(env, listFiles).KeepTemporaryTags();
             rules[rel] = extr;
         }
     }
 }
 // Example #4
        // static demo class
        /// <summary>
        /// Demo: compiles TokenSequencePatterns (one per line) from a rules file,
        /// annotates the text of the given input file with a CoreNLP pipeline, and
        /// prints every non-overlapping match found in each sentence.
        /// Usage: TokensRegexMatcher rulesFile inputFile [outFile]
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                System.Console.Error.WriteLine("TokensRegexMatcher rules file [outFile]");
                return;
            }
            string rules = args[0];
            // Write to the optional third argument if given, otherwise to stdout
            PrintWriter @out;
            if (args.Length > 2)
            {
                @out = new PrintWriter(args[2]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
            Annotation annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
            pipeline.Annotate(annotation);
            // Load lines of file as TokenSequencePatterns
            IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
            foreach (string line in ObjectBank.GetLineIterator(rules))
            {
                TokenSequencePattern pattern = TokenSequencePattern.Compile(line);
                tokenSequencePatterns.Add(pattern);
            }
            // BUGFIX: build the multi-pattern matcher once up front; it is invariant
            // across sentences (the original rebuilt it inside the sentence loop).
            MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
            IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            int i = 0;
            foreach (ICoreMap sentence in sentences)
            {
                IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                @out.Println("Sentence #" + ++i);
                @out.Print("  Tokens:");
                foreach (CoreLabel token in tokens)
                {
                    @out.Print(' ');
                    @out.Print(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
                }
                @out.Println();
                IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
                int j = 0;
                foreach (ISequenceMatchResult<ICoreMap> matched in answers)
                {
                    @out.Println("  Match #" + ++j);
                    // Group 0 is the whole match; 1..GroupCount are capture groups
                    for (int k = 0; k <= matched.GroupCount(); k++)
                    {
                        @out.Println("    group " + k + " = " + matched.Group(k));
                    }
                }
            }
            @out.Flush();
        }
 /// <summary>
 /// Captures the configuration for applying multiple token-sequence patterns
 /// to the given sentences, precomputing a single multi-pattern matcher over
 /// the pattern keys so all patterns are applied in one pass.
 /// </summary>
 public ApplyPatternsMulti(IDictionary <string, DataInstance> sents, IList <string> sentids, IDictionary <TokenSequencePattern, E> patterns, string label, bool removeStopWordsFromSelectedPhrases, bool removePhrasesWithStopWords, ConstantsAndVariables
                           cv)
 {
     //Set<String> ignoreWords;
     this.sents = sents;
     this.sentids = sentids;
     this.patterns = patterns;
     this.label = label;
     this.constVars = cv;
     this.removeStopWordsFromSelectedPhrases = removeStopWordsFromSelectedPhrases;
     this.removePhrasesWithStopWords = removePhrasesWithStopWords;
     // One matcher for every pattern key in the dictionary
     multiPatternMatcher = TokenSequencePattern.GetMultiPatternMatcher(patterns.Keys);
 }
        // static main only
        /// <summary>
        /// Demo of multi-pattern TokensRegex matching: annotates a fixed sample
        /// text and prints the person and age captured by two hand-written
        /// patterns for each sentence.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
            Annotation annotation = new Annotation("Casey is 21. Sally Atkinson's age is 30.");
            pipeline.Annotate(annotation);
            IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            // Compile each rule string into a TokenSequencePattern
            string[] patterns = new string[] { "(?$who [ ner: PERSON]+ ) /is/ (?$age [ pos: CD ] )", "(?$who [ ner: PERSON]+ ) /'s/ /age/ /is/ (?$age [ pos: CD ] )" };
            IList<TokenSequencePattern> tokenSequencePatterns = new List<TokenSequencePattern>();
            foreach (string patternText in patterns)
            {
                tokenSequencePatterns.Add(TokenSequencePattern.Compile(patternText));
            }
            MultiPatternMatcher<ICoreMap> multiMatcher = TokenSequencePattern.GetMultiPatternMatcher(tokenSequencePatterns);
            int sentNum = 0;
            foreach (ICoreMap sentence in sentences)
            {
                IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                sentNum++;
                System.Console.Out.WriteLine("Sentence #" + sentNum);
                System.Console.Out.Write("  Tokens:");
                foreach (CoreLabel token in tokens)
                {
                    System.Console.Out.Write(' ');
                    System.Console.Out.Write(token.ToShortString("Text", "PartOfSpeech", "NamedEntityTag"));
                }
                System.Console.Out.WriteLine();
                // Find all non-overlapping matches of any pattern in this sentence
                IList<ISequenceMatchResult<ICoreMap>> answers = multiMatcher.FindNonOverlapping(tokens);
                int matchNum = 0;
                foreach (ISequenceMatchResult<ICoreMap> matched in answers)
                {
                    matchNum++;
                    System.Console.Out.WriteLine("  Match #" + matchNum);
                    System.Console.Out.WriteLine("    match: " + matched.Group(0));
                    System.Console.Out.WriteLine("      who: " + matched.Group("$who"));
                    System.Console.Out.WriteLine("      age: " + matched.Group("$age"));
                }
            }
        }
        /// <summary>
        /// Creates a TokensRegex annotator configured from Properties, reading
        /// settings under the given name prefix (e.g. "name.rules",
        /// "name.caseInsensitive"). Rule files are compiled into a
        /// CoreMapExpressionExtractor; with no rule files the extractor stays null.
        /// </summary>
        public TokensRegexAnnotator(string name, Properties props)
        {
            // Property keys are "<name>.<key>", or bare "<key>" when name is null
            string prefix = (name == null) ? string.Empty : name + '.';

            string[] files = PropertiesUtils.GetStringArray(props, prefix + "rules");
            env = TokenSequencePattern.GetNewEnv();
            env.Bind("options", options);
            if (PropertiesUtils.GetBool(props, prefix + "caseInsensitive"))
            {
                System.Console.Error.WriteLine("using case insensitive!");
                // NOTE(review): match flags use NodePattern.CaseInsensitive while
                // pattern flags use Pattern.CaseInsensitive — presumably the two
                // flag spaces differ, but confirm these are the intended constants
                // (other InitEnv code in this file uses Pattern.* for both).
                env.SetDefaultStringMatchFlags(NodePattern.CaseInsensitive | Pattern.UnicodeCase);
                env.SetDefaultStringPatternFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
            }
            if (files.Length != 0)
            {
                extractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(env, files);
            }
            else
            {
                extractor = null;
            }
            verbose = PropertiesUtils.GetBool(props, prefix + "verbose", false);
            // Option values fall back to the pre-existing defaults in `options`
            options.setTokenOffsets   = PropertiesUtils.GetBool(props, prefix + "setTokenOffsets", options.setTokenOffsets);
            options.extractWithTokens = PropertiesUtils.GetBool(props, prefix + "extractWithTokens", options.extractWithTokens);
            options.flatten           = PropertiesUtils.GetBool(props, prefix + "flatten", options.flatten);
            string matchedExpressionsAnnotationKeyName = props.GetProperty(prefix + "matchedExpressionsAnnotationKey");

            if (matchedExpressionsAnnotationKeyName != null)
            {
                // Resolve the class name to an annotation key; fail fast if unknown
                options.matchedExpressionsAnnotationKey = EnvLookup.LookupAnnotationKeyWithClassname(env, matchedExpressionsAnnotationKeyName);
                if (options.matchedExpressionsAnnotationKey == null)
                {
                    string propName = prefix + "matchedExpressionsAnnotationKey";
                    throw new Exception("Cannot determine annotation key for " + propName + '=' + matchedExpressionsAnnotationKeyName);
                }
            }
        }
 // Example #8
        /// <summary>
        /// Applies one learned pattern to all sentences, in parallel, merging the
        /// extracted candidate phrases, matched token spans, and already-labeled
        /// words into the supplied output collections.
        /// </summary>
        private void RunParallelApplyPats(IDictionary <string, DataInstance> sents, string label, E pattern, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICollection
                                          <CandidatePhrase> alreadyLabeledWords)
        {
            Redwood.Log(Redwood.Dbg, "Applying pattern " + pattern + " to a total of " + sents.Count + " sentences ");
            IList<string> notAllowedClasses = new List<string>();
            IList<string> sentids = CollectionUtils.ToList(sents.Keys);
            if (constVars.doNotExtractPhraseAnyWordLabeledOtherClass)
            {
                // Exclude phrases containing words labeled with any other class
                foreach (string l in constVars.GetAnswerClass().Keys)
                {
                    if (!l.Equals(label))
                    {
                        notAllowedClasses.Add(l);
                    }
                }
                notAllowedClasses.Add("OTHERSEM");
            }
            // Convert the abstract pattern into a concrete surface or dependency pattern
            IDictionary<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
            IDictionary<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;
            if (constVars.patternType.Equals(PatternFactory.PatternType.Surface))
            {
                surfacePatternsLearnedThisIterConverted = new Dictionary<TokenSequencePattern, E>();
                string patternStr = null;
                try
                {
                    patternStr = pattern.ToString(notAllowedClasses);
                    TokenSequencePattern pat = ((TokenSequencePattern)TokenSequencePattern.Compile(constVars.env[label], patternStr));
                    surfacePatternsLearnedThisIterConverted[pat] = pattern;
                }
                catch (Exception)
                {
                    log.Info("Error applying pattern " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
                    throw;
                }
            }
            else if (constVars.patternType.Equals(PatternFactory.PatternType.Dep))
            {
                depPatternsLearnedThisIterConverted = new Dictionary<SemgrexPattern, E>();
                SemgrexPattern pat = SemgrexPattern.Compile(pattern.ToString(notAllowedClasses), new Env(constVars.env[label].GetVariables()));
                depPatternsLearnedThisIterConverted[pat] = pattern;
            }
            else
            {
                throw new NotSupportedException();
            }
            //Apply the patterns and extract candidate phrases
            int num;
            int numThreads = constVars.numThreads;
            //If number of sentences is less, do not create so many threads
            if (sents.Count < 50)
            {
                numThreads = 1;
            }
            if (numThreads == 1)
            {
                num = sents.Count;
            }
            else
            {
                num = sents.Count / (numThreads - 1);
            }
            // BUGFIX: size the pool by the adjusted numThreads; the original used
            // constVars.numThreads, ignoring the small-input adjustment above.
            IExecutorService executor = Executors.NewFixedThreadPool(numThreads);
            IList<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>> list = new List<IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>>>();
            for (int i = 0; i < numThreads; i++)
            {
                // Each task gets an equal slice of the sentence ids
                // (all of them when running single-threaded)
                ICallable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> task;
                if (pattern.type.Equals(PatternFactory.PatternType.Surface))
                {
                    //Redwood.log(Redwood.DBG, "Applying pats: assigning sentences " + i*num + " to " +Math.min(sentids.size(), (i + 1) * num) + " to thread " + (i+1));
                    task = new ApplyPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                             , constVars);
                }
                else
                {
                    task = new ApplyDepPatterns(sents, num == sents.Count ? sentids : sentids.SubList(i * num, Math.Min(sentids.Count, (i + 1) * num)), depPatternsLearnedThisIterConverted, label, constVars.removeStopWordsFromSelectedPhrases, constVars.removePhrasesWithStopWords
                                                , constVars);
                }
                IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> submit = executor.Submit(task);
                list.Add(submit);
            }
            // Now retrieve the result
            foreach (IFuture<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>> future in list)
            {
                try
                {
                    Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> result = future.Get();
                    Redwood.Log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.First());
                    wordsandLemmaPatExtracted.AddAll(result.First());
                    matchedTokensByPat.AddAll(result.Second());
                    Sharpen.Collections.AddAll(alreadyLabeledWords, result.Third());
                }
                catch (Exception e)
                {
                    executor.ShutdownNow();
                    // BUGFIX: System.Exception has no (Exception) constructor;
                    // wrap with a message and keep the original as InnerException.
                    throw new Exception("Error retrieving result of parallel pattern application", e);
                }
            }
            executor.Shutdown();
        }
 /// <summary>
 /// Builds a non-verbose annotator whose extraction rules are loaded from the
 /// given rule files into a fresh TokensRegex environment.
 /// </summary>
 public TokensRegexAnnotator(params string[] files)
 {
     verbose = false;
     env = TokenSequencePattern.GetNewEnv();
     extractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(env, files);
 }
        /// <summary>
        /// Demo of CoreMapExpressionExtractor: loads extraction rules (from
        /// args[0] or a bundled demo rules file), annotates input text (from
        /// args[1] or a built-in arithmetic example), and prints each matched
        /// expression with its computed value and token details.
        /// Usage: [rulesFile [inputFile [outFile]]]
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string rules;

            if (args.Length > 0)
            {
                rules = args[0];
            }
            else
            {
                // Fall back to the bundled demo rules
                rules = "edu/stanford/nlp/ling/tokensregex/demo/rules/expr.rules.txt";
            }
            // Write to the optional third argument if given, otherwise to stdout
            PrintWriter @out;

            if (args.Length > 2)
            {
                @out = new PrintWriter(args[2]);
            }
            else
            {
                @out = new PrintWriter(System.Console.Out);
            }
            CoreMapExpressionExtractor <MatchedExpression> extractor = CoreMapExpressionExtractor.CreateExtractorFromFiles(TokenSequencePattern.GetNewEnv(), rules);
            StanfordCoreNLP pipeline = new StanfordCoreNLP(PropertiesUtils.AsProperties("annotators", "tokenize,ssplit,pos,lemma,ner"));
            Annotation      annotation;

            if (args.Length > 1)
            {
                annotation = new Annotation(IOUtils.SlurpFileNoExceptions(args[1]));
            }
            else
            {
                annotation = new Annotation("( ( five plus three plus four ) * 2 ) divided by three");
            }
            pipeline.Annotate(annotation);
            // An Annotation is a Map and you can get and use the various analyses individually.
            @out.Println();
            // The toString() method on an Annotation just prints the text of the Annotation
            // But you can see what is in it with other methods like toShorterString()
            @out.Println("The top level annotation");
            @out.Println(annotation.ToShorterString());
            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            int i = 0;

            foreach (ICoreMap sentence in sentences)
            {
                @out.Println("Sentence #" + ++i);
                foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
                {
                    @out.Println("  Token: " + "word=" + token.Get(typeof(CoreAnnotations.TextAnnotation)) + ",  pos=" + token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)) + ", ne=" + token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)));
                }
                // Apply the extraction rules to this sentence
                IList <MatchedExpression> matchedExpressions = extractor.ExtractExpressions(sentence);
                foreach (MatchedExpression matched in matchedExpressions)
                {
                    // Print out matched text and value
                    @out.Println("Matched expression: " + matched.GetText() + " with value " + matched.GetValue());
                    // Print out token information
                    ICoreMap cm = matched.GetAnnotation();
                    foreach (CoreLabel token_1 in cm.Get(typeof(CoreAnnotations.TokensAnnotation)))
                    {
                        string word  = token_1.Get(typeof(CoreAnnotations.TextAnnotation));
                        string lemma = token_1.Get(typeof(CoreAnnotations.LemmaAnnotation));
                        string pos   = token_1.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                        string ne    = token_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                        @out.Println("  Matched token: " + "word=" + word + ", lemma=" + lemma + ", pos=" + pos + ", ne=" + ne);
                    }
                }
            }
            @out.Flush();
        }
 // Initializes the TokensRegex environment for SUTime temporal extraction:
 // configures default annotation keys/aggregators, then binds the constants,
 // temporal operators, time units, node-pattern matchers, and value functions
 // that SUTime rule files refer to by name. Binding order matters only in
 // that all names must be registered before rule files are parsed.
 private void InitEnv()
 {
     env = TokenSequencePattern.GetNewEnv();
     env.SetDefaultResultsAnnotationExtractor(TimeExpression.TimeExpressionConverter);
     env.SetDefaultTokensAnnotationKey(typeof(CoreAnnotations.NumerizedTokensAnnotation));
     env.SetDefaultResultAnnotationKey(typeof(TimeExpression.Annotation));
     env.SetDefaultNestedResultsAnnotationKey(typeof(TimeExpression.ChildrenAnnotation));
     env.SetDefaultTokensAggregators(CoreMapAttributeAggregator.DefaultNumericTokensAggregators);
     env.Bind("nested", typeof(TimeExpression.ChildrenAnnotation));
     env.Bind("time", new TimeFormatter.TimePatternExtractRuleCreator());
     // Do case insensitive matching
     env.SetDefaultStringPatternFlags(Pattern.CaseInsensitive | Pattern.UnicodeCase);
     env.Bind("options", options);
     // SUTime constant values usable from rule files
     env.Bind("TIME_REF", SUTime.TimeRef);
     env.Bind("TIME_REF_UNKNOWN", SUTime.TimeRefUnknown);
     env.Bind("TIME_UNKNOWN", SUTime.TimeUnknown);
     env.Bind("TIME_NONE", SUTime.TimeNone);
     env.Bind("ERA_AD", SUTime.EraAd);
     env.Bind("ERA_BC", SUTime.EraBc);
     env.Bind("ERA_UNKNOWN", SUTime.EraUnknown);
     env.Bind("HALFDAY_AM", SUTime.HalfdayAm);
     env.Bind("HALFDAY_PM", SUTime.HalfdayPm);
     env.Bind("HALFDAY_UNKNOWN", SUTime.HalfdayUnknown);
     env.Bind("RESOLVE_TO_THIS", SUTime.ResolveToThis);
     env.Bind("RESOLVE_TO_PAST", SUTime.ResolveToPast);
     env.Bind("RESOLVE_TO_FUTURE", SUTime.ResolveToFuture);
     env.Bind("RESOLVE_TO_CLOSEST", SUTime.ResolveToClosest);
     env.Bind("numcomptype", typeof(CoreAnnotations.NumericCompositeTypeAnnotation));
     env.Bind("numcompvalue", typeof(CoreAnnotations.NumericCompositeValueAnnotation));
     env.Bind("temporal", typeof(TimeExpression.Annotation));
     //    env.bind("tags", SequenceMatchRules.Tags.TagsAnnotation.class);
     // Node-pattern matchers ("::NAME" syntax in rules) for timex types/values
     env.Bind("::IS_TIMEX_DATE", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Date));
     env.Bind("::IS_TIMEX_DURATION", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Duration));
     env.Bind("::IS_TIMEX_TIME", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Time));
     env.Bind("::IS_TIMEX_SET", new GenericTimeExpressionPatterns.TimexTypeMatchNodePattern(SUTime.TimexType.Set));
     env.Bind("::IS_TIME_UNIT", new GenericTimeExpressionPatterns.MatchedExpressionValueTypeMatchNodePattern("TIMEUNIT"));
     env.Bind("::MONTH", new GenericTimeExpressionPatterns.MatchedExpressionValueTypeMatchNodePattern("MONTH_OF_YEAR"));
     env.Bind("::DAYOFWEEK", new GenericTimeExpressionPatterns.MatchedExpressionValueTypeMatchNodePattern("DAY_OF_WEEK"));
     // BINDINGS for parsing from file!!!!!!!
     foreach (SUTime.TemporalOp t in SUTime.TemporalOp.Values())
     {
         env.Bind(t.ToString(), new Expressions.PrimitiveValue <SUTime.TemporalOp>("TemporalOp", t));
     }
     foreach (SUTime.TimeUnit t_1 in SUTime.TimeUnit.Values())
     {
         if (!t_1.Equals(SUTime.TimeUnit.Unknown))
         {
             //env.bind(t.name(), new SequenceMatchRules.PrimitiveValue<SUTime.Temporal>("DURATION", t.getDuration(), "TIMEUNIT"));
             env.Bind(t_1.ToString(), new Expressions.PrimitiveValue <SUTime.Temporal>("TIMEUNIT", t_1.GetDuration()));
         }
     }
     foreach (SUTime.StandardTemporalType t_2 in SUTime.StandardTemporalType.Values())
     {
         env.Bind(t_2.ToString(), new Expressions.PrimitiveValue <SUTime.StandardTemporalType>("TemporalType", t_2));
     }
     // Value functions callable from rule files (anonymous-class translations;
     // the _NamedValueFunction_NNN names come from the original Java line numbers)
     env.Bind("Duration", new Expressions.PrimitiveValue <IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_124("Duration")));
     // New so we get different time ids
     // TODO: Check args
     // TODO: Handle Strings...
     // TODO: This should already be in durations....
     //String durationUnitString = (durationUnitTokens != null)? durationUnitTokens.get(0).get(CoreAnnotations.TextAnnotation.class):null;
     //SUTime.Duration durationUnit = getDuration(durationUnitString);
     // TODO: Handle inexactness
     // Create duration range...
     // Add begin and end times
     env.Bind("DayOfWeek", new Expressions.PrimitiveValue <IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_212("DayOfWeek")));
     env.Bind("MonthOfYear", new Expressions.PrimitiveValue <IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_235("MonthOfYear")));
     env.Bind("MakePeriodicTemporalSet", new Expressions.PrimitiveValue <IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_258("MakePeriodicTemporalSet")));
     // First argument is the temporal acting as the base of the periodic set
     // Second argument is the quantifier (string)
     // Third argument is the multiple (how much to scale the natural period)
     /*"P1X"*/
     env.Bind("TemporalCompose", new Expressions.PrimitiveValue <IValueFunction>(Expressions.TypeFunction, new _NamedValueFunction_328("TemporalCompose")));
 }
 // Example #12
        /// <summary>
        /// Configures sentence splitting from CoreNLP properties. Depending on
        /// the settings, sentences are split on newlines (optionally counting
        /// line numbers), treated as one single sentence, or split by the regular
        /// boundary-token machinery of WordToSentenceProcessor.
        /// </summary>
        public WordsToSentencesAnnotator(Properties properties)
        {
            // BUGFIX throughout: `bool.ValueOf` does not exist in C#; use
            // bool.Parse (safe here: an explicit "false" default is supplied).
            bool nlSplitting = bool.Parse(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));

            if (nlSplitting)
            {
                bool whitespaceTokenization = bool.Parse(properties.GetProperty("tokenize.whitespace", "false"));
                if (whitespaceTokenization)
                {
                    if (Runtime.LineSeparator().Equals("\n"))
                    {
                        // this constructor will keep empty lines as empty sentences
                        WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { "\n" }));
                        this.countLineNumbers = true;
                        this.wts = wts1;
                    }
                    else
                    {
                        // throw "\n" in just in case files use that instead of
                        // the system separator
                        // this constructor will keep empty lines as empty sentences
                        WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { Runtime.LineSeparator(), "\n" }));
                        this.countLineNumbers = true;
                        this.wts = wts1;
                    }
                }
                else
                {
                    // this constructor will keep empty lines as empty sentences
                    WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { PTBTokenizer.GetNewlineToken() }));
                    this.countLineNumbers = true;
                    this.wts = wts1;
                }
            }
            else
            {
                string isOneSentence = properties.GetProperty("ssplit.isOneSentence");
                // BUGFIX: the property may be absent (null); bool.Parse would throw
                // ArgumentNullException. TryParse treats null/invalid as false,
                // matching the intended "null means false" semantics.
                if (bool.TryParse(isOneSentence, out bool oneSentence) && oneSentence)
                {
                    // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
                    WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(true);
                    this.countLineNumbers = false;
                    this.wts = wts1;
                }
                else
                {
                    // multi token sentence boundaries
                    string boundaryMultiTokenRegex = properties.GetProperty("ssplit.boundaryMultiTokenRegex");
                    // Discard these tokens without marking them as sentence boundaries
                    string tokenPatternsToDiscardProp          = properties.GetProperty("ssplit.tokenPatternsToDiscard");
                    ICollection <string> tokenRegexesToDiscard = null;
                    if (tokenPatternsToDiscardProp != null)
                    {
                        string[] toks = tokenPatternsToDiscardProp.Split(",");
                        tokenRegexesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
                    }
                    // regular boundaries
                    string boundaryTokenRegex     = properties.GetProperty("ssplit.boundaryTokenRegex");
                    string boundaryFollowersRegex = properties.GetProperty("ssplit.boundaryFollowersRegex");
                    // newline boundaries which are discarded.
                    ICollection <string> boundariesToDiscard = null;
                    string bounds = properties.GetProperty("ssplit.boundariesToDiscard");
                    if (bounds != null)
                    {
                        string[] toks = bounds.Split(",");
                        boundariesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
                    }
                    ICollection <string> htmlElementsToDiscard = null;
                    // HTML boundaries which are discarded
                    bounds = properties.GetProperty("ssplit.htmlBoundariesToDiscard");
                    if (bounds != null)
                    {
                        string[] elements = bounds.Split(",");
                        htmlElementsToDiscard = Generics.NewHashSet(Arrays.AsList(elements));
                    }
                    string nlsb = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);
                    this.countLineNumbers = false;
                    this.wts = new WordToSentenceProcessor <CoreLabel>(boundaryTokenRegex, boundaryFollowersRegex, boundariesToDiscard, htmlElementsToDiscard, WordToSentenceProcessor.StringToNewlineIsSentenceBreak(nlsb), (boundaryMultiTokenRegex != null) ? TokenSequencePattern
                                                                       .Compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard);
                }
            }
            Verbose = bool.Parse(properties.GetProperty("ssplit.verbose", "false"));
        }
 // Example #13
 /// <summary>
 /// Convenience constructor: delegates to the main constructor with a
 /// WordToSentenceProcessor built from the given boundary settings. The
 /// multi-token boundary pattern is compiled only when a regex is supplied;
 /// line numbers are not counted (second argument false).
 /// </summary>
 public WordsToSentencesAnnotator(bool verbose, string boundaryTokenRegex, ICollection <string> boundaryToDiscard, ICollection <string> htmlElementsToDiscard, string newlineIsSentenceBreak, string boundaryMultiTokenRegex, ICollection <string> tokenRegexesToDiscard
                                  )
     : this(verbose, false, new WordToSentenceProcessor <CoreLabel>(boundaryTokenRegex, null, boundaryToDiscard, htmlElementsToDiscard, WordToSentenceProcessor.StringToNewlineIsSentenceBreak(newlineIsSentenceBreak), (boundaryMultiTokenRegex != null
                                                                                                                                                                                                                         ) ? TokenSequencePattern.Compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard))
 {
 }