예제 #1
        /// <summary>
        /// Builds the lookup by registering every supplied field configuration.
        /// </summary>
        /// <param name="fieldTokenizationOptions">The per-field tokenization settings to register.</param>
        /// <param name="tokenizerFactory">Factory passed to <c>RegisterField</c> for each field.</param>
        /// <param name="defaultTokenizationOptions">Default options passed to <c>RegisterField</c> for each field.</param>
        /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
        internal IndexedFieldLookup(
            IEnumerable <IFieldTokenizationOptions> fieldTokenizationOptions,
            ITokenizerFactory tokenizerFactory,
            TokenizationOptions defaultTokenizationOptions)
        {
            // Validate in declaration order so the first missing argument is the one reported.
            _ = fieldTokenizationOptions ?? throw new ArgumentNullException(nameof(fieldTokenizationOptions));
            _ = tokenizerFactory ?? throw new ArgumentNullException(nameof(tokenizerFactory));
            _ = defaultTokenizationOptions ?? throw new ArgumentNullException(nameof(defaultTokenizationOptions));

            foreach (var fieldOptions in fieldTokenizationOptions)
            {
                this.RegisterField(fieldOptions, tokenizerFactory, defaultTokenizationOptions);
            }
        }
예제 #2
        /// <summary>
        /// Saves the results of applying the parser to the current text to
        /// the specified filename.
        /// </summary>
        /// <param name="filename">Name of the output file; tested for null/empty before anything else.</param>
        public virtual void SaveOutput(string filename)
            // Guard against a missing or empty file name.
            // NOTE(review): the guarded statement is not visible in this excerpt - confirm it exits early.
            if (filename == null || filename.Equals(string.Empty))
            // Feed the text pane's current contents to a document preprocessor.
            string                       text      = textPane.GetText();
            StringReader                 reader    = new StringReader(text);
            DocumentPreprocessor         processor = new DocumentPreprocessor(reader);
            ITokenizerFactory <IHasWord> tf        = tlp.GetTokenizerFactory();

            // Collects the sentences produced by the preprocessor.
            IList <IList <IHasWord> > sentences = new List <IList <IHasWord> >();

            // NOTE(review): the loop body is not visible here - presumably it adds each sentence to `sentences`.
            foreach (IList <IHasWord> sentence in processor)
            // Progress UI: a bar ranging from 0 to the sentence count, a cancel button and a modal dialog.
            JProgressBar progress = new JProgressBar(0, sentences.Count);
            JButton      cancel   = new JButton();
            JDialog      dialog   = new JDialog(new Frame(), "Parser Progress", true);

            dialog.SetSize(300, 150);
            dialog.Add(BorderLayout.North, new JLabel("Parsing " + sentences.Count + " sentences"));
            dialog.Add(BorderLayout.Center, progress);
            dialog.Add(BorderLayout.South, cancel);
            // The worker thread receives everything it needs: target file, progress widgets and the sentences.
            ParserPanel.SaveOutputThread thread = new ParserPanel.SaveOutputThread(this, filename, progress, dialog, cancel, sentences);
예제 #3
        /// <summary>
        /// Returns a factory for FrenchTokenizer that replicates the tokenization of
        /// Green, de Marneffe, and Manning (2011).
        /// </summary>
        /// <returns>A tokenizer factory producing <c>CoreLabel</c> tokens.</returns>
        public static ITokenizerFactory <CoreLabel> FtbFactory()
            // Start from a fresh FrenchTokenizerFactory.
            // NOTE(review): option configuration and the return statement are not visible in this excerpt.
            ITokenizerFactory <CoreLabel> tf = FrenchTokenizer.FrenchTokenizerFactory.NewTokenizerFactory();

예제 #4
        /// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Modern Standard French.
        /// Performs punctuation splitting and light tokenization by default.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It assumes that the input
        /// file is delimited by the system line separator. The output will be equivalently
        /// delimited.
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
            // Parse the command line into properties using this tool's option definitions.
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            // NOTE(review): the body of the "help" branch is not visible in this excerpt.
            if (options.Contains("help"))
            // Lexer options: "ftb" selects the FTB factory, otherwise the default factory is used.
            ITokenizerFactory <CoreLabel> tf = options.Contains("ftb") ? FrenchTokenizer.FtbFactory() : FrenchTokenizer.Factory();
            string orthoOptions = options.GetProperty("options", string.Empty);

            // When called from this main method, split on newline. No options for
            // more granular sentence splitting.
            orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
            // Other options: input encoding (default UTF-8) and optional lower-casing of tokens.
            string encoding = options.GetProperty("encoding", "UTF-8");
            bool   toLower  = PropertiesUtils.GetBool(options, "lowerCase", false);
            // Read the file from stdin, keeping counters and a start timestamp for the final statistics.
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

                // NOTE(review): the enclosing try and the counter updates are not visible in this excerpt.
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                // Tracks whether a separating space must precede the next token on the current line.
                bool printSpace = false;
                while (tokenizer.MoveNext())
                    string word = tokenizer.Current.Word();
                    // A newline token resets the spacing state; other tokens are space-separated.
                    if (word.Equals(FrenchLexer.NewlineToken))
                        printSpace = false;
                        if (printSpace)
                            System.Console.Out.Write(" ");
                        // Lower-case with the French locale when requested.
                        string outputToken = toLower ? word.ToLower(Locale.French) : word;
                        printSpace = true;
            catch (UnsupportedEncodingException e)
            // Report throughput in lines per second (elapsed time is in nanoseconds).
            long   elapsedTime = Runtime.NanoTime() - startTime;
            double linesPerSec = (double)nLines / (elapsedTime / 1e9);

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
예제 #5
        /// <summary>Creates an ArabicTokenizer.</summary>
        /// <remarks>
        /// Creates an ArabicTokenizer. The default tokenizer
        /// is ArabicTokenizer.atbFactory(), which produces the
        /// same orthographic normalization as Green and Manning (2010).
        /// </remarks>
        /// <returns>A TokenizerFactory that produces each Arabic token as a CoreLabel</returns>
        private ITokenizerFactory <CoreLabel> GetTokenizerFactory()
            // Stays null when the input is already tokenized (isTokenized is true).
            ITokenizerFactory <CoreLabel> tokFactory = null;

            if (!isTokenized)
                // No explicit options: use the ATB factory.
                // NOTE(review): the else branches separating the statements below are not visible in this excerpt.
                if (tokenizerOptions == null)
                    tokFactory = ArabicTokenizer.AtbFactory();
                    string atbVocOptions = "removeProMarker,removeMorphMarker,removeLengthening";
                    // The segmenter rejects the removeSegMarker option outright.
                    if (tokenizerOptions.Contains("removeSegMarker"))
                        throw new Exception("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
                    tokFactory = ArabicTokenizer.Factory();
                log.Info("Loaded ArabicTokenizer with options: " + tokenizerOptions);
예제 #6
        /// <summary>Tags every sentence of a file and writes the tagged sentences as UTF-8 text.</summary>
        /// <param name="args">Expected to hold exactly two entries: the model file and the file to tag.</param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
            // Print usage when the argument count is wrong.
            // NOTE(review): any early exit after the usage message is not visible in this excerpt.
            if (args.Length != 2)
                log.Info("usage: java TaggerDemo2 modelFile fileToTag");
            // args[0] is the tagger model, args[1] the input file (read as UTF-8).
            MaxentTagger tagger = new MaxentTagger(args[0]);
            ITokenizerFactory <CoreLabel> ptbTokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
            BufferedReader       r  = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
            PrintWriter          pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "utf-8"));
            DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);

            // Tag each sentence and print it on its own line.
            foreach (IList <IHasWord> sentence in documentPreprocessor)
                IList <TaggedWord> tSentence = tagger.TagSentence(sentence);
                pw.Println(SentenceUtils.ListToString(tSentence, false));
            // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
            IList <IHasWord>   sent       = SentenceUtils.ToWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
            IList <TaggedWord> taggedSent = tagger.TagSentence(sent);

            foreach (TaggedWord tw in taggedSent)
                // Tags starting with "JJ" are selected here; the statement run for them is not visible in this excerpt.
                if (tw.Tag().StartsWith("JJ"))
예제 #7
        // end static class SpanishTokenizerFactory
        /// <summary>Returns a tokenizer factory configured for Ancora tokenization.</summary>
        /// <returns>A tokenizer factory producing <c>CoreLabel</c> tokens.</returns>
        public static ITokenizerFactory <CoreLabel> AncoraFactory()
            // Start from a fresh CoreLabel-producing SpanishTokenizerFactory.
            // NOTE(review): option configuration and the return statement are not visible in this excerpt.
            ITokenizerFactory <CoreLabel> tf = SpanishTokenizer.SpanishTokenizerFactory.NewCoreLabelTokenizerFactory();

예제 #8
        /// <summary>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.  Output is handled with a
        /// TreePrint object.  Note that the options used when creating the
        /// TreePrint can determine what results to print out.  Once again,
        /// one can capture the output by passing a PrintWriter to
        /// TreePrint.printTree. This code is for English.
        /// </remarks>
        public static void DemoAPI(LexicalizedParser lp)
            // This option shows parsing a list of correctly tokenized words
            string[]          sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> rawWords = SentenceUtils.ToCoreLabelList(sent);
            Tree parse = lp.Apply(rawWords);

            // This option shows loading and using an explicit tokenizer
            string sent2 = "This is another sentence.";
            ITokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tok       = tokenizerFactory.GetTokenizer(new StringReader(sent2));
            IList <CoreLabel>             rawWords2 = tok.Tokenize();

            // The second parse replaces the first; only this tree is used for the dependency extraction below.
            parse = lp.Apply(rawWords2);
            ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
            // PennTreebankLanguagePack for English
            IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
            GrammaticalStructure         gs  = gsf.NewGrammaticalStructure(parse);
            IList <TypedDependency>      tdl = gs.TypedDependenciesCCprocessed();

            // You can also use a TreePrint object to print trees and dependencies
            // NOTE(review): the statements that print `tdl` and use `tp` are not visible in this excerpt.
            TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

예제 #9
        /// <summary>Tokenize the text using the parser's tokenizer</summary>
        /// <param name="sentence">The raw text to tokenize.</param>
        public virtual IList <IHasWord> Tokenize(string sentence)
            // The tokenizer factory comes from the treebank language pack.
            ITokenizerFactory <IHasWord> tf        = TreebankLanguagePack().GetTokenizerFactory();
            ITokenizer <IHasWord>        tokenizer = tf.GetTokenizer(new StringReader(sentence));
            // NOTE(review): the return statement is not visible in this excerpt - presumably `tokens` is returned.
            IList <IHasWord>             tokens    = tokenizer.Tokenize();

예제 #10
 /// <summary>
 /// Creates a renderer from its three collaborators and starts with an
 /// empty list of modificators.
 /// </summary>
 /// <param name="parser">Stored in <c>Parser</c>.</param>
 /// <param name="tokenizer">Stored in <c>Tokenizer</c>.</param>
 /// <param name="nodeRenderer">Stored in <c>NodeRenderer</c>.</param>
 public MarkdownToHtmlRenderer(
     MarkdownParser parser,
     ITokenizerFactory <IMdToken> tokenizer,
     INodeRenderer nodeRenderer)
 {
     Parser = parser;
     Tokenizer = tokenizer;
     NodeRenderer = nodeRenderer;

     // No visitors are registered at construction time.
     Modificators = new List <INodeVisitor>();
 }
예제 #11
 /// <summary>
 /// Constructs a new DocumentReader that will read text from the given
 /// Reader and tokenize it into words using the given Tokenizer.
 /// </summary>
 /// <remarks>
 /// Constructs a new DocumentReader that will read text from the given
 /// Reader and tokenize it into words using the given Tokenizer. The default
 /// implementation will internally buffer the reader if it is not already
 /// buffered, so there is no need to pre-wrap the reader with a BufferedReader.
 /// This class provides many <tt>getReader</tt> methods for conveniently
 /// reading from many input sources.
 /// </remarks>
 /// <param name="in">Source reader; a null value skips the reader setup guarded below.</param>
 /// <param name="tokenizerFactory">Factory for the tokenizer used to split the text into words.</param>
 /// <param name="keepOriginalText">Stored in the <c>keepOriginalText</c> field.</param>
 public DocumentReader(Reader @in, ITokenizerFactory <IHasWord> tokenizerFactory, bool keepOriginalText)
     // NOTE(review): the body of this null check is not visible in this excerpt.
     if (@in != null)
     this.keepOriginalText = keepOriginalText;
        /// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).
        /// Performs punctuation splitting and light tokenization by default.
        /// Orthographic normalization options are available, and can be enabled with
        /// command line options.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It normalizes non-printing
        /// line separators across platforms and prints the system default line splitter
        /// to the output.
        /// <p>
        /// The following normalization options are provided:
        /// <ul>
        /// <li>
        /// <c>useUTF8Ellipsis</c>
        /// : Replaces sequences of three or more full stops with \u2026</li>
        /// <li>
        /// <c>normArDigits</c>
        /// : Convert Arabic digits to ASCII equivalents</li>
        /// <li>
        /// <c>normArPunc</c>
        /// : Convert Arabic punctuation to ASCII equivalents</li>
        /// <li>
        /// <c>normAlif</c>
        /// : Change all alif forms to bare alif</li>
        /// <li>
        /// <c>normYa</c>
        /// : Map ya to alif maqsura</li>
        /// <li>
        /// <c>removeDiacritics</c>
        /// : Strip all diacritics</li>
        /// <li>
        /// <c>removeTatweel</c>
        /// : Strip tatweel elongation character</li>
        /// <li>
        /// <c>removeQuranChars</c>
        /// : Remove diacritics that appear in the Quran</li>
        /// <li>
        /// <c>removeProMarker</c>
        /// : Remove the ATB null pronoun marker</li>
        /// <li>
        /// <c>removeSegMarker</c>
        /// : Remove the ATB clitic segmentation marker</li>
        /// <li>
        /// <c>removeMorphMarker</c>
        /// : Remove the ATB morpheme boundary markers</li>
        /// <li>
        /// <c>removeLengthening</c>
        /// : Replace all sequences of three or more identical (non-period) characters with one copy</li>
        /// <li>
        /// <c>atbEscaping</c>
        /// : Replace left/right parentheses with ATB escape characters</li>
        /// </ul>
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
            // Print usage when the first argument mentions "help".
            // NOTE(review): further usage lines and any early exit are not visible in this excerpt.
            if (args.Length > 0 && args[0].Contains("help"))
                System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
                log.Info("   -help : Print this message. See javadocs for all normalization options.");
                log.Info("   -atb  : Tokenization for the parsing experiments in Green and Manning (2010)");
            // Process normalization options: "atb" selects the ATB factory, otherwise the default factory.
            Properties tokenizerOptions      = StringUtils.ArgsToProperties(args);
            ITokenizerFactory <CoreLabel> tf = tokenizerOptions.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory();

            // NOTE(review): the loop body is not visible in this excerpt.
            foreach (string option in tokenizerOptions.StringPropertyNames())
            // Replace line separators with a token so that we can
            // count lines
            // Read the file
            int nLines  = 0;
            int nTokens = 0;

                // Input is read from stdin as UTF-8.
                // NOTE(review): the enclosing try and the counter updates are not visible in this excerpt.
                string encoding = "UTF-8";
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                // Tracks whether a separating space must precede the next token on the current line.
                bool printSpace = false;
                while (tokenizer.MoveNext())
                    string word = tokenizer.Current.Word();
                    // A newline token resets the spacing state; other tokens are space-separated.
                    if (word.Equals(ArabicLexer.NewlineToken))
                        printSpace = false;
                        if (printSpace)
                            System.Console.Out.Write(" ");
                        printSpace = true;
            catch (UnsupportedEncodingException e)
            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
        /// <summary>Creates an Arabic tokenizer factory and walks the property names in <c>atbOptions</c>.</summary>
        /// <returns>A tokenizer factory producing <c>CoreLabel</c> tokens.</returns>
        public static ITokenizerFactory <CoreLabel> AtbFactory()
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.ArabicTokenizerFactory.NewTokenizerFactory();

            // NOTE(review): the loop body and the return statement are not visible in this excerpt -
            // presumably each option is applied to `tf` before it is returned.
            foreach (string option in atbOptions.StringPropertyNames())
        /// <summary>
        /// Creates an extractor that loads the whole MUC input file into memory
        /// and prepares a PTB tokenizer factory and a Stanford processor.
        /// </summary>
        /// <param name="dict">Passed to the base constructor.</param>
        /// <param name="props">Supplies the input file name (key <c>Constants.MucProp</c>) and the processor settings.</param>
        /// <param name="semantics">Passed to the base constructor.</param>
        /// <exception cref="System.Exception"/>
        public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
            : base(dict, semantics)
        {
            // Read the complete input file up front; reading starts at offset zero.
            fileContents = IOUtils.SlurpFile(props.GetProperty(Constants.MucProp));
            currentOffset = 0;

            tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(false), string.Empty);
            stanfordProcessor = LoadStanfordProcessor(props);
        }
예제 #15
 /// <summary>Creates a reader/writer with every option specified explicitly.</summary>
 /// <param name="hasSegMarkers">if true, input has segmentation markers</param>
 /// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
 /// <param name="hasDomainLabels">
 /// if true, input has a whitespace-terminated domain at the beginning
 /// of each line of text
 /// </param>
 /// <param name="domain">the input domain, stored in <c>inputDomain</c></param>
 /// <param name="stripRewrites">
 /// if true, erase orthographical rewrites from the gold labels (for
 /// comparison purposes)
 /// </param>
 /// <param name="tokFactory">a TokenizerFactory for the input</param>
 public ArabicDocumentReaderAndWriter(bool hasSegMarkers, bool hasTags, bool hasDomainLabels, string domain, bool stripRewrites, ITokenizerFactory <CoreLabel> tokFactory)
 {
     tf = tokFactory;
     inputHasTags = hasTags;
     inputHasDomainLabels = hasDomainLabels;
     inputDomain = domain;
     shouldStripRewrites = stripRewrites;

     // The segmentation marker is only set when the input actually carries markers.
     if (hasSegMarkers)
     {
         segMarker = DefaultSegMarker;
     }
     else
     {
         segMarker = null;
     }

     // Created last because the wrapped function object receives a reference to this instance.
     factory = LineIterator.GetFactory(new _ISerializableFunction_131(this));
 }
예제 #16
        /// <summary>
        /// Replaces the default <see cref="ITokenizerFactory"/> implementation.
        /// </summary>
        /// <param name="tokenizerFactory">The factory to use; must not be <c>null</c>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="tokenizerFactory"/> is <c>null</c>.</exception>
        public FullTextIndexBuilder <TKey> WithTokenizerFactory(ITokenizerFactory tokenizerFactory)
            if (tokenizerFactory is null)
                throw new ArgumentNullException(nameof(tokenizerFactory));

            // NOTE(review): the return statement is not visible in this excerpt - presumably the builder returns itself.
            this.tokenizerFactory = tokenizerFactory;

예제 #17
 /// <summary>Make an Arabic Segmenter.</summary>
 /// <param name="props">
 /// Options for how to tokenize. See the main method of
 /// <see cref="ArabicTokenizer{T}"/>
 /// for details
 /// </param>
 public ArabicSegmenter(Properties props)
     /* Serializable */
     // NOTE(review): the comments below look like field-level comments that were moved
     // into the constructor by an automated conversion - confirm against the field declarations.
     // SEGMENTER OPTIONS (can be set in the Properties object
     // passed to the constructor).
     // The input has already been tokenized. Do not run the Arabic tokenizer.
     // Tokenizer options
     // Mark segmented prefixes with this String
     // Mark segmented suffixes with this String
     // Number of decoding threads
     // Write TedEval files
     // Use a custom feature factory
     // Training and evaluation files have domain labels
     // Training and evaluation text are all in the same domain (default:atb)
     // Ignore rewrites (training only, produces a model that then can be used to do
     // no-rewrite segmentation)
     // Use the original feature set which doesn't contain start-and-end "wrapper" features
     // Flag options are read as key presence; valued options fall back to null, "atb" or the empty string.
     isTokenized      = props.Contains(optTokenized);
     tokenizerOptions = props.GetProperty(optTokenizer, null);
     tedEvalPrefix    = props.GetProperty(optTedEval, null);
     hasDomainLabels  = props.Contains(optWithDomains);
     domain           = props.GetProperty(optDomain, "atb");
     noRewrites       = props.Contains(optNoRewrites);
     // Runs after isTokenized and tokenizerOptions have been assigned above.
     tf           = GetTokenizerFactory();
     prefixMarker = props.GetProperty(optPrefix, string.Empty);
     suffixMarker = props.GetProperty(optSuffix, string.Empty);
     // localFeaturesOnly cannot be combined with a custom feature factory.
     if (props.Contains(optLocalFeaturesOnly))
         if (props.Contains(optFeatureFactory))
             // NOTE(review): this statement is cut off in this excerpt (closing ");" is missing).
             throw new Exception("Cannot use custom feature factory with localFeaturesOnly flag--" + "have your custom feature factory extend ArabicSegmenterFeatureFactory instead of " + "StartAndEndArabicSegmenterFeatureFactory and remove the localFeaturesOnly flag."
         props.SetProperty(optFeatureFactory, localOnlyFeatureFactory);
     // Fall back to the default feature factory when none was configured.
     if (!props.Contains(optFeatureFactory))
         props.SetProperty(optFeatureFactory, defaultFeatureFactory);
     // Remove all command-line properties that are specific to ArabicSegmenter
     // NOTE(review): the removal statements themselves are not visible in this excerpt.
     flags      = new SeqClassifierFlags(props);
     classifier = new CRFClassifier <CoreLabel>(flags);
 /// <summary>
 /// Tokenizes each test string with a tokenizer from <paramref name="factory"/> and
 /// asserts that the token count and every token's word match the expected results.
 /// </summary>
 /// <param name="factory">Factory that supplies a tokenizer per test string.</param>
 /// <param name="testStrings">Raw inputs to tokenize.</param>
 /// <param name="resultsStrings">Expected words; the row at index i belongs to <c>testStrings[i]</c>.</param>
 public virtual void RunTest <_T0>(ITokenizerFactory <_T0> factory, string[] testStrings, string[][] resultsStrings)
     where _T0 : IHasWord
 {
     for (int caseIndex = 0; caseIndex < testStrings.Length; ++caseIndex)
     {
         ITokenizer <IHasWord> tokenizer = factory.GetTokenizer(new StringReader(testStrings[caseIndex]));
         IList <IHasWord> actualTokens = tokenizer.Tokenize();

         // Fetch the expected row only after tokenizing, keeping the original evaluation order.
         string[] expectedWords = resultsStrings[caseIndex];
         NUnit.Framework.Assert.AreEqual(expectedWords.Length, actualTokens.Count);

         for (int wordIndex = 0; wordIndex < expectedWords.Length; ++wordIndex)
         {
             NUnit.Framework.Assert.AreEqual(expectedWords[wordIndex], actualTokens[wordIndex].Word());
         }
     }
 }
예제 #19
 /// <summary>
 /// Copy constructor: duplicates the configuration of <paramref name="other"/>,
 /// shares its flags and classifier, and builds a new tokenizer factory.
 /// </summary>
 /// <param name="other">The segmenter whose settings are copied.</param>
 public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
 {
     // Plain configuration values are copied field by field.
     isTokenized = other.isTokenized;
     tokenizerOptions = other.tokenizerOptions;
     prefixMarker = other.prefixMarker;
     suffixMarker = other.suffixMarker;
     tedEvalPrefix = other.tedEvalPrefix;
     hasDomainLabels = other.hasDomainLabels;
     domain = other.domain;
     noRewrites = other.noRewrites;
     flags = other.flags;

     // ArabicTokenizerFactory is *not* threadsafe, so this copy gets its own instance.
     // Must run after isTokenized and tokenizerOptions have been assigned.
     tf = GetTokenizerFactory();

     // CRFClassifier is threadsafe, so the same instance is shared.
     classifier = other.classifier;
 }
예제 #20
        /// <summary>
        /// Creates an index, substituting default implementations for any collaborator passed as <c>null</c>.
        /// </summary>
        /// <param name="options">Index configuration. NOTE(review): not used by the statements visible here.</param>
        /// <param name="indexNodeFactory">Node factory; defaults to a new <c>IndexNodeFactory</c>.</param>
        /// <param name="tokenizerFactory">Tokenizer factory; defaults to a new <c>TokenizerFactory</c>.</param>
        /// <param name="queryParser">Query parser; defaults to a new <c>QueryParser</c>.</param>
        public FullTextIndex(
            FullTextIndexConfiguration <TKey> options,
            IIndexNodeFactory indexNodeFactory = null,
            ITokenizerFactory tokenizerFactory = null,
            IQueryParser queryParser           = null)
            this.indexNodeFactory = indexNodeFactory ?? new IndexNodeFactory();
            this.tokenizerFactory = tokenizerFactory ?? new TokenizerFactory();
            this.queryParser      = queryParser ?? new QueryParser();


            // The root node is created by the (possibly defaulted) node factory assigned above.
            this.IdPool      = new IdPool <TKey>();
            this.FieldLookup = new IndexedFieldLookup();
            this.Root        = this.indexNodeFactory.CreateNode();
        /// <summary>
        /// Tokenizes every entry of <c>untokInputs</c> with the ATB tokenizer factory and
        /// checks the space-joined result against the matching entry of <c>tokReferences</c>.
        /// </summary>
        public virtual void TestArabicTokenizer()
        {
            // A real assertion rather than Debug.Assert, so a fixture mismatch is also reported in release builds.
            NUnit.Framework.Assert.AreEqual(untokInputs.Length, tokReferences.Length, "Test inputs and references differ in length");
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            for (int i = 0; i < untokInputs.Length; ++i)
            {
                string line = untokInputs[i];
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(line));
                IList <CoreLabel>      tokens    = tokenizer.Tokenize();
                string tokenizedLine             = SentenceUtils.ListToString(tokens);
                string reference = tokReferences[i];

                // NUnit takes (expected, actual, message); the JUnit-style message-first order
                // would compare the message text against the reference.
                NUnit.Framework.Assert.AreEqual(reference, tokenizedLine, "Tokenization deviates from reference");
            }
        }
        /// <summary>
        /// Tokenizes a fixed Arabic string with the ATB tokenizer factory and checks each
        /// token's begin and end character offsets against hard-coded reference values.
        /// </summary>
        public virtual void TestCharOffsets()
        {
            string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";

            int[] beginOffsets = new int[] { 0, 7, 11, 22 };
            int[] endOffsets   = new int[] { 6, 10, 21, 23 };
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
            IList <CoreLabel>      tokens    = tokenizer.Tokenize();

            // NUnit takes (expected, actual, message): the reference length is the expected value
            // and the message goes last, not first as in JUnit.
            NUnit.Framework.Assert.AreEqual(beginOffsets.Length, tokens.Count, "Number of tokens doesn't match reference");
            for (int i = 0; i < beginOffsets.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual(beginOffsets[i], tokens[i].BeginPosition(), "Char begin offset deviates from reference");
                NUnit.Framework.Assert.AreEqual(endOffsets[i], tokens[i].EndPosition(), "Char end offset deviates from reference");
            }
        }
예제 #23
        /// <summary>
        /// Registers one field: allocates the next id, builds its tokenizer and records
        /// the field in both the name-to-details and id-to-name lookups.
        /// </summary>
        /// <param name="fieldOptions">Supplies the field name and its optional tokenization options.</param>
        /// <param name="tokenizerFactory">Creates the tokenizer for the field.</param>
        /// <param name="defaultTokenizationOptions">Used when the field has no tokenization options of its own.</param>
        /// <exception cref="LiftiException">
        /// The field name is already registered, or the allocated id exceeds <see cref="byte.MaxValue"/>.
        /// </exception>
        private void RegisterField(IFieldTokenizationOptions fieldOptions, ITokenizerFactory tokenizerFactory, TokenizationOptions defaultTokenizationOptions)
        {
            var fieldName = fieldOptions.Name;
            if (this.fieldToDetailsLookup.ContainsKey(fieldName))
            {
                throw new LiftiException(ExceptionMessages.FieldNameAlreadyUsed, fieldName);
            }

            // The id is allocated atomically, then range-checked before being narrowed to a byte.
            var allocatedId = Interlocked.Increment(ref nextId);
            if (allocatedId > byte.MaxValue)
            {
                throw new LiftiException(ExceptionMessages.MaximumDistinctFieldsIndexReached);
            }

            var fieldId = (byte)allocatedId;
            var effectiveOptions = fieldOptions.TokenizationOptions ?? defaultTokenizationOptions;
            var tokenizer = tokenizerFactory.Create(effectiveOptions);

            this.fieldToDetailsLookup[fieldName] = new IndexedFieldDetails(fieldId, tokenizer);
            this.idToFieldLookup[fieldId] = fieldName;
        }
예제 #24
 /// <summary>
 /// Creates a tokenizer annotator, selecting a segmenter annotator when the
 /// "tokenize.language" property names a segmenter language.
 /// </summary>
 /// <param name="verbose">Default for the "tokenize.verbose" property.</param>
 /// <param name="props">Configuration; a null value is replaced by an empty property set.</param>
 /// <param name="options">Passed on to <c>InitFactory</c>.</param>
 public TokenizerAnnotator(bool verbose, Properties props, string options)
     if (props == null)
         props = new Properties();
     // check if segmenting must be done
     // NOTE(review): the else branches separating the alternatives below are not visible in this excerpt.
     if (props.GetProperty("tokenize.language") != null && LanguageInfo.IsSegmenterLanguage(props.GetProperty("tokenize.language")))
         useSegmenter = true;
         // Arabic and Chinese each have a dedicated segmenter annotator; any other language throws.
         if (LanguageInfo.GetLanguageFromString(props.GetProperty("tokenize.language")) == LanguageInfo.HumanLanguage.Arabic)
             segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
             if (LanguageInfo.GetLanguageFromString(props.GetProperty("tokenize.language")) == LanguageInfo.HumanLanguage.Chinese)
                 segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
                 segmenterAnnotator = null;
                 throw new Exception("No segmenter implemented for: " + LanguageInfo.GetLanguageFromString(props.GetProperty("tokenize.language")));
         useSegmenter       = false;
         segmenterAnnotator = null;
     // The "tokenize.verbose" property overrides the constructor argument when present.
     Verbose = PropertiesUtils.GetBool(props, "tokenize.verbose", verbose);
     TokenizerAnnotator.TokenizerType type = TokenizerAnnotator.TokenizerType.GetTokenizerType(props);
     factory = InitFactory(type, props, options);
예제 #25
        /// <summary>
        /// Creates an index from fully specified collaborators; every collaborator given a
        /// <c>?? throw</c> guard below is required.
        /// </summary>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="itemTokenizationOptions"/>, <paramref name="indexNodeFactory"/>,
        /// <paramref name="tokenizerFactory"/>, <paramref name="queryParser"/> or
        /// <paramref name="defaultTokenizationOptions"/> is <c>null</c>.
        /// </exception>
        internal FullTextIndex(
            IndexOptions indexOptions,
            ConfiguredItemTokenizationOptions <TKey> itemTokenizationOptions,
            IIndexNodeFactory indexNodeFactory,
            ITokenizerFactory tokenizerFactory,
            IQueryParser queryParser,
            TokenizationOptions defaultTokenizationOptions,
            Func <IIndexSnapshot <TKey>, Task>[]?indexModifiedActions)
            // indexOptions and indexModifiedActions are stored without a null check.
            this.indexOptions               = indexOptions;
            this.itemTokenizationOptions    = itemTokenizationOptions ?? throw new ArgumentNullException(nameof(itemTokenizationOptions));
            this.IndexNodeFactory           = indexNodeFactory ?? throw new ArgumentNullException(nameof(indexNodeFactory));
            this.tokenizerFactory           = tokenizerFactory ?? throw new ArgumentNullException(nameof(tokenizerFactory));
            this.queryParser                = queryParser ?? throw new ArgumentNullException(nameof(queryParser));
            this.defaultTokenizationOptions = defaultTokenizationOptions ?? throw new ArgumentNullException(nameof(defaultTokenizationOptions));
            this.indexModifiedActions       = indexModifiedActions;
            this.idPool      = new IdPool <TKey>();
            // NOTE(review): the argument list of this constructor call is cut off in this excerpt.
            this.FieldLookup = new IndexedFieldLookup(

            this.Root = this.IndexNodeFactory.CreateRootNode();
예제 #26
 /// <summary>
 /// Replaces the tokenizer factory used to split document text into words.
 /// </summary>
 /// <param name="tokenizerFactory">The factory to store; no validation is performed.</param>
 public virtual void SetTokenizerFactory <_T0>(ITokenizerFactory <_T0> tokenizerFactory)
     where _T0 : IHasWord
 {
     this.tokenizerFactory = tokenizerFactory;
 }
예제 #27
 /// <summary>
 /// Convenience constructor: delegates with domain labels disabled and the
 /// domain set to the literal "123".
 /// </summary>
 /// <param name="hasSegMarkers">if true, input has segmentation markers</param>
 /// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
 /// <param name="tokFactory">a TokenizerFactory for the input</param>
 public ArabicDocumentReaderAndWriter(bool hasSegMarkers, bool hasTags, ITokenizerFactory <CoreLabel> tokFactory)
     : this(hasSegMarkers, hasTags, false, "123", tokFactory)
 {
 }
예제 #28
 /// <summary>
 /// Convenience constructor: delegates to the full constructor with rewrite
 /// stripping disabled.
 /// </summary>
 /// <param name="hasSegMarkers">if true, input has segmentation markers</param>
 /// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
 /// <param name="hasDomainLabels">
 /// if true, input has a whitespace-terminated domain at the beginning
 /// of each line of text
 /// </param>
 /// <param name="domain">the input domain, forwarded unchanged</param>
 /// <param name="tokFactory">a TokenizerFactory for the input</param>
 public ArabicDocumentReaderAndWriter(bool hasSegMarkers, bool hasTags, bool hasDomainLabels, string domain, ITokenizerFactory <CoreLabel> tokFactory)
     : this(hasSegMarkers, hasTags, hasDomainLabels, domain, false, tokFactory)
 {
 }
예제 #29
        /// <summary>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// </summary>
        /// <remarks>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// <p>
        /// <i>Implementation note:</i> This method is largely cloned from
        /// LexicalizedParser's main method.  Should we try to have it be able
        /// to train segmenters to stop things going out of sync?
        /// </remarks>
        public static void Main(string[] args)
            bool     train = false;
            bool     saveToSerializedFile      = false;
            bool     saveToTextFile            = false;
            string   serializedInputFileOrUrl  = null;
            string   textInputFileOrUrl        = null;
            string   serializedOutputFileOrUrl = null;
            string   textOutputFileOrUrl       = null;
            string   treebankPath = null;
            Treebank testTreebank = null;
            // Treebank tuneTreebank = null;
            string      testPath    = null;
            IFileFilter testFilter  = null;
            IFileFilter trainFilter = null;
            string      encoding    = null;
            // variables needed to process the files to be parsed
            ITokenizerFactory <Word> tokenizerFactory = null;
            //    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
            bool tokenized = false;
            // whether or not the input file has already been tokenized
            IFunction <IList <IHasWord>, IList <IHasWord> > escaper = new ChineseEscaper();
            // int tagDelimiter = -1;
            // String sentenceDelimiter = "\n";
            // boolean fromXML = false;
            int argIndex = 0;

            if (args.Length < 1)
                log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
            Options op = new Options();

            op.tlpParams = new ChineseTreebankParserParams();
            // while loop through option arguments
            while (argIndex < args.Length && args[argIndex][0] == '-')
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                    train = true;
                    saveToSerializedFile = true;
                    int numSubArgs = NumSubArgs(args, argIndex);
                    if (numSubArgs > 1)
                        treebankPath = args[argIndex];
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    if (numSubArgs == 2)
                        trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
                        if (numSubArgs >= 3)
                                int low  = System.Convert.ToInt32(args[argIndex]);
                                int high = System.Convert.ToInt32(args[argIndex + 1]);
                                trainFilter = new NumberRangeFileFilter(low, high, true);
                                argIndex   += 2;
                            catch (NumberFormatException)
                                // maybe it's a ranges expression?
                                trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
                        // sets encoding for TreebankLangParserParams
                        encoding = args[argIndex + 1];
                        argIndex += 2;
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
                            // load the parser from a binary serialized file
                            // the next argument must be the path to the parser file
                            serializedInputFileOrUrl = args[argIndex + 1];
                            argIndex += 2;
                            // doesn't make sense to load from TextFile -pichuan
                            //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
                            //        // load the parser from declarative text file
                            //        // the next argument must be the path to the parser file
                            //        textInputFileOrUrl = args[argIndex + 1];
                            //        argIndex += 2;
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
                                saveToSerializedFile      = true;
                                serializedOutputFileOrUrl = args[argIndex + 1];
                                argIndex += 2;
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
                                    // save the parser to declarative text file
                                    saveToTextFile      = true;
                                    textOutputFileOrUrl = args[argIndex + 1];
                                    argIndex           += 2;
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                                        // the next argument is the treebank path and range for testing
                                        int numSubArgs = NumSubArgs(args, argIndex);
                                        if (numSubArgs == 1)
                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                            if (numSubArgs > 1)
                                                testPath = args[argIndex++];
                                                if (numSubArgs == 2)
                                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                    if (numSubArgs >= 3)
                                                            int low  = System.Convert.ToInt32(args[argIndex]);
                                                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                                                            testFilter = new NumberRangeFileFilter(low, high, true);
                                                            argIndex  += 2;
                                                        catch (NumberFormatException)
                                                            // maybe it's a ranges expression?
                                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                        int j = op.tlpParams.SetOptionFlag(args, argIndex);
                                        if (j == argIndex)
                                            log.Info("Unknown option ignored: " + args[argIndex]);
                                        argIndex = j;
            // end while loop through arguments
            ITreebankLangParserParams tlpParams = op.tlpParams;

            // all other arguments are order dependent and
            // are processed in order below
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
            if (!train && op.testOptions.verbose)
                System.Console.Out.WriteLine("Currently " + new DateTime());
                PrintArgs(args, System.Console.Out);
            if (train)
                PrintArgs(args, System.Console.Out);
                // so we train a parser using the treebank
                if (treebankPath == null)
                    // the next arg must be the treebank path, since it wasn't give earlier
                    treebankPath = args[argIndex];
                    if (args.Length > argIndex + 1)
                            // the next two args might be the range
                            int low  = System.Convert.ToInt32(args[argIndex]);
                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            argIndex   += 2;
                        catch (NumberFormatException)
                            // maybe it's a ranges expression?
                            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                Treebank        trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
                IIndex <string> wordIndex     = new HashIndex <string>();
                IIndex <string> tagIndex      = new HashIndex <string>();
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
                if (textInputFileOrUrl != null)
                    // so we load the segmenter from a text grammar file
                    // XXXXX fix later -pichuan
                    //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
                    // so we load a serialized segmenter
                    if (serializedInputFileOrUrl == null)
                        // the next argument must be the path to the serialized parser
                        serializedInputFileOrUrl = args[argIndex];
                        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
                    catch (ArgumentException)
                        log.Info("Error loading segmenter, exiting...");
            // the following has to go after reading parser to make sure
            // op and tlpParams are the same for train and test
            TreePrint treePrint = op.testOptions.TreePrint(tlpParams);

            if (testFilter != null)
                if (testPath == null)
                    if (treebankPath == null)
                        throw new Exception("No test treebank path specified...");
                        log.Info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                        testPath = treebankPath;
                testTreebank = tlpParams.TestMemoryTreebank();
                testTreebank.LoadPath(testPath, testFilter);
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
            // at this point we should be sure that op.tlpParams is
            // set appropriately (from command line, or from grammar file),
            // and will never change again.  We also set the tlpParams of the
            // LexicalizedParser instance to be the same object.  This is
            // redundancy that we probably should take out eventually.
            // -- Roger
            if (op.testOptions.verbose)
                log.Info("Lexicon is " + cs.GetType().FullName);
            PrintWriter pwOut = tlpParams.Pw();
            PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

            // Now what do we do with the parser we've made
            if (saveToTextFile)
                // save the parser to textGrammar format
                if (textOutputFileOrUrl != null)
                    SaveSegmenterDataToText(cs, textOutputFileOrUrl);
                    log.Info("Usage: must specify a text segmenter data output path");
            if (saveToSerializedFile)
                if (serializedOutputFileOrUrl == null && argIndex < args.Length)
                    // the next argument must be the path to serialize to
                    serializedOutputFileOrUrl = args[argIndex];
                if (serializedOutputFileOrUrl != null)
                    SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
                    if (textOutputFileOrUrl == null && testTreebank == null)
                        // no saving/parsing request has been specified
                        log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
            /* --------------------- Testing part!!!! ----------------------- */
            if (op.testOptions.verbose)
            //      printOptions(false, op);
            if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
                // test parser on treebank
                if (testTreebank == null)
                    // the next argument is the treebank path and range for testing
                    testTreebank = tlpParams.TestMemoryTreebank();
                    if (args.Length < argIndex + 4)
                        testTreebank.LoadPath(args[argIndex + 1]);
                        int testlow  = System.Convert.ToInt32(args[argIndex + 2]);
                        int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                        testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
예제 #30
 /// <summary>
 /// Creates a <c>TextProcessorFactory</c> that keeps the three supplied
 /// component factories in its same-named members.
 /// </summary>
 /// <param name="sdFact">Sentence detector factory; stored as given, with no null check here.</param>
 /// <param name="tokFact">Tokenizer factory; stored as given, with no null check here.</param>
 /// <param name="stemFact">Stemmer factory; stored as given, with no null check here.</param>
 public TextProcessorFactory(ISentenceDetectorFactory sdFact, ITokenizerFactory tokFact, IStemmerFactory stemFact)
     this.sdFact   = sdFact;
     this.tokFact  = tokFact;
     this.stemFact = stemFact;