/// <summary>
        /// Builds the lookup by registering every supplied field configuration.
        /// </summary>
        /// <param name="fieldTokenizationOptions">The fields to register, in enumeration order.</param>
        /// <param name="tokenizerFactory">Factory passed to <c>RegisterField</c> for each field.</param>
        /// <param name="defaultTokenizationOptions">Options passed to <c>RegisterField</c> as the fallback for each field.</param>
        /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
        internal IndexedFieldLookup(
            IEnumerable <IFieldTokenizationOptions> fieldTokenizationOptions,
            ITokenizerFactory tokenizerFactory,
            TokenizationOptions defaultTokenizationOptions)
        {
            // Guards run in parameter order so the first null argument is the one reported.
            _ = fieldTokenizationOptions ?? throw new ArgumentNullException(nameof(fieldTokenizationOptions));
            _ = tokenizerFactory ?? throw new ArgumentNullException(nameof(tokenizerFactory));
            _ = defaultTokenizationOptions ?? throw new ArgumentNullException(nameof(defaultTokenizationOptions));

            foreach (var fieldOptions in fieldTokenizationOptions)
            {
                this.RegisterField(fieldOptions, tokenizerFactory, defaultTokenizationOptions);
            }
        }
        /// <summary>
        /// Saves the results of applying the parser to the current text to
        /// the specified filename.
        /// </summary>
        /// <remarks>
        /// The text pane content is split into sentences up front, then a
        /// <c>SaveOutputThread</c> is started and a "Parser Progress" dialog with a
        /// progress bar and a Cancel button is shown.
        /// </remarks>
        /// <param name="filename">Target file name; nothing happens when it is null or empty.</param>
        public virtual void SaveOutput(string filename)
        {
            if (string.IsNullOrEmpty(filename))
            {
                return;
            }

            // Split the current text into sentences using the language pack's tokenizer.
            DocumentPreprocessor         preprocessor         = new DocumentPreprocessor(new StringReader(textPane.GetText()));
            ITokenizerFactory <IHasWord> wordTokenizerFactory = tlp.GetTokenizerFactory();

            preprocessor.SetTokenizerFactory(wordTokenizerFactory);

            // Materialize all sentences first so the progress bar knows its upper bound.
            IList <IList <IHasWord> > sentences = new List <IList <IHasWord> >();

            foreach (IList <IHasWord> words in preprocessor)
            {
                sentences.Add(words);
            }

            JProgressBar progress = new JProgressBar(0, sentences.Count);
            JButton      cancel   = new JButton();
            JDialog      dialog   = new JDialog(new Frame(), "Parser Progress", true);

            dialog.SetSize(300, 150);
            dialog.Add(BorderLayout.North, new JLabel("Parsing " + sentences.Count + " sentences"));
            dialog.Add(BorderLayout.Center, progress);
            dialog.Add(BorderLayout.South, cancel);

            ParserPanel.SaveOutputThread worker = new ParserPanel.SaveOutputThread(this, filename, progress, dialog, cancel, sentences);

            cancel.SetText("Cancel");
            cancel.SetToolTipText("Cancel");
            // NOTE(review): a null listener is registered here, so the Cancel button has no
            // handler attached by this method — confirm whether a listener was lost in conversion.
            cancel.AddActionListener(null);

            worker.Start();
            dialog.SetVisible(true);
        }
Exemple #3
0
        /// <summary>
        /// Creates a FrenchTokenizer factory configured with <c>FtbOptions</c>, intended to
        /// replicate the tokenization of Green, de Marneffe, and Manning (2011).
        /// </summary>
        /// <returns>A new tokenizer factory with the FTB options applied.</returns>
        public static ITokenizerFactory <CoreLabel> FtbFactory()
        {
            ITokenizerFactory <CoreLabel> factory = FrenchTokenizer.FrenchTokenizerFactory.NewTokenizerFactory();

            factory.SetOptions(FtbOptions);

            return factory;
        }
Exemple #4
0
        /// <summary>
        /// Command-line entry point: a fast, rule-based tokenizer for Modern Standard French.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Performs punctuation splitting and light tokenization by default. Input is read
        /// from standard input and the tokens are written to standard output, separated by
        /// single spaces.
        /// </para>
        /// <para>
        /// Currently, this tokenizer does not do line splitting. It assumes that the input
        /// is delimited by the system line separator; the output is delimited the same way.
        /// </para>
        /// </remarks>
        /// <param name="args">
        /// Command-line arguments. The options read here are <c>help</c>, <c>ftb</c>,
        /// <c>options</c>, <c>encoding</c> and <c>lowerCase</c>.
        /// </param>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }

            // Lexer options: pick the FTB-compatible factory when "-ftb" is given.
            ITokenizerFactory <CoreLabel> tf;
            if (options.Contains("ftb"))
            {
                tf = FrenchTokenizer.FtbFactory();
            }
            else
            {
                tf = FrenchTokenizer.Factory();
            }

            // When called from this main method, split on newline. No options for
            // more granular sentence splitting.
            string orthoOptions = options.GetProperty("options", string.Empty);
            if (orthoOptions.IsEmpty())
            {
                orthoOptions = "tokenizeNLs";
            }
            else
            {
                orthoOptions = orthoOptions + ",tokenizeNLs";
            }
            tf.SetOptions(orthoOptions);

            // Other options
            string encoding = options.GetProperty("encoding", "UTF-8");
            bool   toLower  = PropertiesUtils.GetBool(options, "lowerCase", false);

            // Counters for the summary line printed at the end.
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

            try
            {
                // Read the text from stdin.
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool needsSeparator = false;

                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();

                    if (!word.Equals(FrenchLexer.NewlineToken))
                    {
                        if (needsSeparator)
                        {
                            System.Console.Out.Write(" ");
                        }
                        string outputToken = toLower ? word.ToLower(Locale.French) : word;
                        System.Console.Out.Write(outputToken);
                        needsSeparator = true;
                        continue;
                    }

                    // A newline token ends the current output line.
                    ++nLines;
                    needsSeparator = false;
                    System.Console.Out.WriteLine();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                log.Error(e);
            }

            long   elapsedTime    = Runtime.NanoTime() - startTime;
            double elapsedSeconds = elapsedTime / 1e9;
            double linesPerSec    = nLines / elapsedSeconds;

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
        }
Exemple #5
0
        /// <summary>Creates the tokenizer factory used by this segmenter.</summary>
        /// <remarks>
        /// When no tokenizer options were supplied, the factory is
        /// <c>ArabicTokenizer.AtbFactory()</c> with three additional options applied
        /// (the original documentation says this yields the same orthographic
        /// normalization as Green and Manning (2010)). Otherwise a plain
        /// <c>ArabicTokenizer.Factory()</c> is configured with the supplied options.
        /// </remarks>
        /// <returns>
        /// A TokenizerFactory that produces each Arabic token as a CoreLabel, or
        /// <c>null</c> when <c>isTokenized</c> is set.
        /// </returns>
        /// <exception cref="System.Exception">
        /// The supplied tokenizer options contain <c>removeSegMarker</c>.
        /// </exception>
        private ITokenizerFactory <CoreLabel> GetTokenizerFactory()
        {
            if (isTokenized)
            {
                // Pre-tokenized input: no tokenizer is created.
                return(null);
            }

            ITokenizerFactory <CoreLabel> tokFactory;
            string appliedOptions;

            if (tokenizerOptions == null)
            {
                tokFactory     = ArabicTokenizer.AtbFactory();
                appliedOptions = "removeProMarker,removeMorphMarker,removeLengthening";
            }
            else
            {
                if (tokenizerOptions.Contains("removeSegMarker"))
                {
                    throw new Exception("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
                }
                tokFactory     = ArabicTokenizer.Factory();
                appliedOptions = tokenizerOptions;
            }
            tokFactory.SetOptions(appliedOptions);

            // Log the options that were actually applied; logging tokenizerOptions
            // directly printed nothing useful when the defaults were used.
            log.Info("Loaded ArabicTokenizer with options: " + appliedOptions);
            return(tokFactory);
        }
        /// <summary>
        /// Tags every sentence of a file with a MaxentTagger model and prints the tagged
        /// sentences, then prints the adjectives of one hard-coded example sentence.
        /// </summary>
        /// <param name="args">Exactly two entries: the model file and the file to tag.</param>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                log.Info("usage: java TaggerDemo2 modelFile fileToTag");
                return;
            }
            MaxentTagger tagger = new MaxentTagger(args[0]);
            ITokenizerFactory <CoreLabel> ptbTokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));

            try
            {
                PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "utf-8"));

                try
                {
                    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);

                    documentPreprocessor.SetTokenizerFactory(ptbTokenizerFactory);
                    foreach (IList <IHasWord> sentence in documentPreprocessor)
                    {
                        IList <TaggedWord> tSentence = tagger.TagSentence(sentence);
                        pw.Println(SentenceUtils.ListToString(tSentence, false));
                    }
                    // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
                    IList <IHasWord>   sent       = SentenceUtils.ToWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
                    IList <TaggedWord> taggedSent = tagger.TagSentence(sent);

                    foreach (TaggedWord tw in taggedSent)
                    {
                        if (tw.Tag().StartsWith("JJ"))
                        {
                            pw.Println(tw.Word());
                        }
                    }
                }
                finally
                {
                    // Close the writer even when tagging throws, so buffered output is not lost.
                    pw.Close();
                }
            }
            finally
            {
                // The input reader was previously never closed.
                r.Close();
            }
        }
Exemple #7
0
        // end static class SpanishTokenizerFactory
        /// <summary>
        /// Creates a Spanish CoreLabel tokenizer factory configured with <c>AncoraOptions</c>
        /// (Ancora tokenization).
        /// </summary>
        /// <returns>A new tokenizer factory with the Ancora options applied.</returns>
        public static ITokenizerFactory <CoreLabel> AncoraFactory()
        {
            ITokenizerFactory <CoreLabel> factory = SpanishTokenizer.SpanishTokenizerFactory.NewCoreLabelTokenizerFactory();

            factory.SetOptions(AncoraOptions);

            return factory;
        }
Exemple #8
0
        /// <summary>
        /// Demonstrates other ways of calling the parser: with already tokenized text,
        /// and with raw text that is tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// Output is handled with a TreePrint object. Note that the options used when
        /// creating the TreePrint can determine what results to print out. One can
        /// capture the output by passing a PrintWriter to TreePrint.printTree.
        /// This code is for English.
        /// </remarks>
        /// <param name="lp">The parser to demonstrate.</param>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // Option 1: parse a list of correctly tokenized words.
            string[]          words            = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> pretokenized     = SentenceUtils.ToCoreLabelList(words);
            Tree              pretokenizedTree = lp.Apply(pretokenized);

            pretokenizedTree.PennPrint();
            System.Console.Out.WriteLine();

            // Option 2: load and use an explicit tokenizer on raw text.
            string rawSentence = "This is another sentence.";
            ITokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tokenizer        = tokenizerFactory.GetTokenizer(new StringReader(rawSentence));
            IList <CoreLabel>             tokenized        = tokenizer.Tokenize();
            Tree                          tokenizedTree    = lp.Apply(tokenized);

            // PennTreebankLanguagePack for English
            ITreebankLanguagePack        languagePack     = lp.TreebankLanguagePack();
            IGrammaticalStructureFactory structureFactory = languagePack.GrammaticalStructureFactory();
            GrammaticalStructure         structure        = structureFactory.NewGrammaticalStructure(tokenizedTree);
            IList <TypedDependency>      dependencies     = structure.TypedDependenciesCCprocessed();

            System.Console.Out.WriteLine(dependencies);
            System.Console.Out.WriteLine();

            // You can also use a TreePrint object to print trees and dependencies
            TreePrint treePrinter = new TreePrint("penn,typedDependenciesCollapsed");

            treePrinter.PrintTree(tokenizedTree);
        }
Exemple #9
0
        /// <summary>Tokenizes the text using the parser's tokenizer.</summary>
        /// <param name="sentence">The raw text to tokenize.</param>
        /// <returns>The tokens produced by the language pack's tokenizer.</returns>
        public virtual IList <IHasWord> Tokenize(string sentence)
        {
            ITokenizerFactory <IHasWord> factory         = TreebankLanguagePack().GetTokenizerFactory();
            ITokenizer <IHasWord>        wordTokenizer   = factory.GetTokenizer(new StringReader(sentence));

            return wordTokenizer.Tokenize();
        }
Exemple #10
0
 /// <summary>
 /// Creates a renderer from its collaborators, starting with an empty
 /// <c>Modificators</c> list.
 /// </summary>
 /// <param name="parser">Stored in <c>Parser</c>.</param>
 /// <param name="tokenizer">Stored in <c>Tokenizer</c>.</param>
 /// <param name="nodeRenderer">Stored in <c>NodeRenderer</c>.</param>
 public MarkdownToHtmlRenderer(MarkdownParser parser, ITokenizerFactory <IMdToken> tokenizer,
                               INodeRenderer nodeRenderer)
 {
     // Arguments are stored as given; no validation is performed here.
     this.Parser       = parser;
     this.Tokenizer    = tokenizer;
     this.NodeRenderer = nodeRenderer;

     this.Modificators = new List <INodeVisitor>();
 }
 /// <summary>
 /// Constructs a new DocumentReader that will read text from the given
 /// Reader and tokenize it into words using the given Tokenizer.
 /// </summary>
 /// <remarks>
 /// According to the original documentation, the default implementation
 /// internally buffers the reader if it is not already buffered, so there is
 /// no need to pre-wrap the reader with a BufferedReader, and the class
 /// provides many <c>getReader</c> methods for conveniently reading from
 /// many input sources.
 /// </remarks>
 /// <param name="in">Source reader; when <c>null</c>, no reader is set.</param>
 /// <param name="tokenizerFactory">Passed to <c>SetTokenizerFactory</c>.</param>
 /// <param name="keepOriginalText">Stored in the <c>keepOriginalText</c> field.</param>
 public DocumentReader(Reader @in, ITokenizerFactory <IHasWord> tokenizerFactory, bool keepOriginalText)
 {
     bool readerSupplied = @in != null;

     if (readerSupplied)
     {
         SetReader(@in);
     }

     SetTokenizerFactory(tokenizerFactory);
     this.keepOriginalText = keepOriginalText;
 }
        /// <summary>
        /// Command-line entry point: a fast, rule-based tokenizer for Modern Standard
        /// Arabic (UTF-8 encoding).
        /// </summary>
        /// <remarks>
        /// <para>
        /// Performs punctuation splitting and light tokenization by default.
        /// Orthographic normalization options are available, and can be enabled with
        /// command line options.
        /// </para>
        /// <para>
        /// Currently, this tokenizer does not do line splitting. It normalizes non-printing
        /// line separators across platforms and prints the system default line splitter
        /// to the output.
        /// </para>
        /// <para>The following normalization options are provided:</para>
        /// <list type="bullet">
        /// <item><description><c>useUTF8Ellipsis</c>: Replaces sequences of three or more full stops with \u2026</description></item>
        /// <item><description><c>normArDigits</c>: Convert Arabic digits to ASCII equivalents</description></item>
        /// <item><description><c>normArPunc</c>: Convert Arabic punctuation to ASCII equivalents</description></item>
        /// <item><description><c>normAlif</c>: Change all alif forms to bare alif</description></item>
        /// <item><description><c>normYa</c>: Map ya to alif maqsura</description></item>
        /// <item><description><c>removeDiacritics</c>: Strip all diacritics</description></item>
        /// <item><description><c>removeTatweel</c>: Strip tatweel elongation character</description></item>
        /// <item><description><c>removeQuranChars</c>: Remove diacritics that appear in the Quran</description></item>
        /// <item><description><c>removeProMarker</c>: Remove the ATB null pronoun marker</description></item>
        /// <item><description><c>removeSegMarker</c>: Remove the ATB clitic segmentation marker</description></item>
        /// <item><description><c>removeMorphMarker</c>: Remove the ATB morpheme boundary markers</description></item>
        /// <item><description><c>removeLengthening</c>: Replace all sequences of three or more identical (non-period) characters with one copy</description></item>
        /// <item><description><c>atbEscaping</c>: Replace left/right parentheses with ATB escape characters</description></item>
        /// </list>
        /// </remarks>
        /// <param name="args">
        /// Command-line arguments; every property name parsed from them is passed to the
        /// tokenizer factory as an option.
        /// </param>
        public static void Main(string[] args)
        {
            bool helpRequested = args.Length > 0 && args[0].Contains("help");

            if (helpRequested)
            {
                System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
                System.Console.Error.Printf("%nOptions:%n");
                log.Info("   -help : Print this message. See javadocs for all normalization options.");
                log.Info("   -atb  : Tokenization for the parsing experiments in Green and Manning (2010)");
                System.Environment.Exit(-1);
            }

            // Process normalization options
            Properties tokenizerOptions = StringUtils.ArgsToProperties(args);
            ITokenizerFactory <CoreLabel> tf;

            if (tokenizerOptions.Contains("atb"))
            {
                tf = ArabicTokenizer.AtbFactory();
            }
            else
            {
                tf = ArabicTokenizer.Factory();
            }

            foreach (string optionName in tokenizerOptions.StringPropertyNames())
            {
                tf.SetOptions(optionName);
            }

            // Replace line separators with a token so that we can
            // count lines
            tf.SetOptions("tokenizeNLs");

            // Counters for the summary line printed at the end.
            int nLines  = 0;
            int nTokens = 0;

            try
            {
                // Read the text from stdin.
                string encoding = "UTF-8";
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool needsSeparator = false;

                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();

                    if (!word.Equals(ArabicLexer.NewlineToken))
                    {
                        if (needsSeparator)
                        {
                            System.Console.Out.Write(" ");
                        }
                        System.Console.Out.Write(word);
                        needsSeparator = true;
                        continue;
                    }

                    // A newline token ends the current output line.
                    ++nLines;
                    needsSeparator = false;
                    System.Console.Out.WriteLine();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
        }
        /// <summary>
        /// Creates an Arabic tokenizer factory and applies every option name found in
        /// <c>atbOptions</c> to it, one at a time.
        /// </summary>
        /// <returns>A new tokenizer factory with the ATB options applied.</returns>
        public static ITokenizerFactory <CoreLabel> AtbFactory()
        {
            ITokenizerFactory <CoreLabel> factory = ArabicTokenizer.ArabicTokenizerFactory.NewTokenizerFactory();

            foreach (string optionName in atbOptions.StringPropertyNames())
            {
                factory.SetOptions(optionName);
            }

            return factory;
        }
        /// <summary>
        /// Creates an extractor over the file named by the <c>Constants.MucProp</c> property:
        /// the file is read fully into memory, the read offset starts at zero, and a PTB
        /// tokenizer factory plus the Stanford processor are prepared.
        /// </summary>
        /// <param name="dict">Forwarded to the base constructor.</param>
        /// <param name="props">Supplies the input file name and configures the processor.</param>
        /// <param name="semantics">Forwarded to the base constructor.</param>
        /// <exception cref="System.Exception"/>
        public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
            : base(dict, semantics)
        {
            currentOffset = 0;
            fileContents  = IOUtils.SlurpFile(props.GetProperty(Constants.MucProp));

            tokenizerFactory  = PTBTokenizer.Factory(new CoreLabelTokenFactory(false), string.Empty);
            stanfordProcessor = LoadStanfordProcessor(props);
        }
Exemple #15
0
 /// <summary>Creates a reader/writer with every option given explicitly.</summary>
 /// <param name="hasSegMarkers">if true, input has segmentation markers</param>
 /// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
 /// <param name="hasDomainLabels">
 /// if true, input has a whitespace-terminated domain at the beginning
 /// of each line of text
 /// </param>
 /// <param name="domain">stored as the input domain</param>
 /// <param name="stripRewrites">
 /// if true, erase orthographical rewrites from the gold labels (for
 /// comparison purposes)
 /// </param>
 /// <param name="tokFactory">a TokenizerFactory for the input</param>
 public ArabicDocumentReaderAndWriter(bool hasSegMarkers, bool hasTags, bool hasDomainLabels, string domain, bool stripRewrites, ITokenizerFactory <CoreLabel> tokFactory)
 {
     // The segmentation marker is only set when the input is declared to contain markers.
     if (hasSegMarkers)
     {
         segMarker = DefaultSegMarker;
     }
     else
     {
         segMarker = null;
     }

     inputHasTags         = hasTags;
     inputHasDomainLabels = hasDomainLabels;
     inputDomain          = domain;
     shouldStripRewrites  = stripRewrites;
     tf                   = tokFactory;

     // Built last: the function object receives this instance.
     factory = LineIterator.GetFactory(new _ISerializableFunction_131(this));
 }
Exemple #16
0
        /// <summary>
        /// Replaces the default <see cref="ITokenizerFactory"/> implementation.
        /// </summary>
        /// <param name="tokenizerFactory">The factory to use from now on.</param>
        /// <returns>This builder, so calls can be chained.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="tokenizerFactory"/> is <c>null</c>.</exception>
        public FullTextIndexBuilder <TKey> WithTokenizerFactory(ITokenizerFactory tokenizerFactory)
        {
            this.tokenizerFactory = tokenizerFactory ?? throw new ArgumentNullException(nameof(tokenizerFactory));

            return this;
        }
Exemple #17
0
 /// <summary>Make an Arabic Segmenter.</summary>
 /// <remarks>
 /// The segmenter-specific entries are read from <paramref name="props"/> and then
 /// removed from it, so the caller's Properties object is modified. What is left is
 /// used to build the classifier flags.
 /// </remarks>
 /// <param name="props">
 /// Options for how to tokenize. See the main method of
 /// <see cref="ArabicTokenizer{T}"/>
 /// for details
 /// </param>
 /// <exception cref="System.Exception">
 /// Both the local-features-only flag and a custom feature factory are present.
 /// </exception>
 public ArabicSegmenter(Properties props)
 {
     // SEGMENTER OPTIONS (can be set in the Properties object passed to the constructor).
     // Notes carried over from the original source; they appear to describe the option
     // keys in declaration order:
     //   - The input already been tokenized. Do not run the Arabic tokenizer.
     //   - Tokenizer options
     //   - Mark segmented prefixes with this String
     //   - Mark segmented suffixes with this String
     //   - Number of decoding threads
     //   - Write TedEval files
     //   - Use a custom feature factory
     //   - Training and evaluation files have domain labels
     //   - Training and evaluation text are all in the same domain (default:atb)
     //   - Ignore rewrites (training only, produces a model that then can be used to do
     //     no-rewrite segmentation)
     //   - Use the original feature set which doesn't contain start-and-end "wrapper" features
     isTokenized      = props.Contains(optTokenized);
     tokenizerOptions = props.GetProperty(optTokenizer, null);
     tedEvalPrefix    = props.GetProperty(optTedEval, null);
     hasDomainLabels  = props.Contains(optWithDomains);
     domain           = props.GetProperty(optDomain, "atb");
     noRewrites       = props.Contains(optNoRewrites);

     // Must come after isTokenized and tokenizerOptions are set: the factory reads both.
     tf = GetTokenizerFactory();

     prefixMarker = props.GetProperty(optPrefix, string.Empty);
     suffixMarker = props.GetProperty(optSuffix, string.Empty);

     bool localFeaturesOnly = props.Contains(optLocalFeaturesOnly);

     if (localFeaturesOnly && props.Contains(optFeatureFactory))
     {
         throw new Exception("Cannot use custom feature factory with localFeaturesOnly flag--" + "have your custom feature factory extend ArabicSegmenterFeatureFactory instead of " + "StartAndEndArabicSegmenterFeatureFactory and remove the localFeaturesOnly flag."
                             );
     }
     if (localFeaturesOnly)
     {
         props.SetProperty(optFeatureFactory, localOnlyFeatureFactory);
     }
     if (!props.Contains(optFeatureFactory))
     {
         props.SetProperty(optFeatureFactory, defaultFeatureFactory);
     }

     // Remove all command-line properties that are specific to ArabicSegmenter
     string[] segmenterOnlyKeys = new string[]
     {
         optTokenizer, optTokenized, optPrefix, optSuffix, optThreads,
         optTedEval, optWithDomains, optDomain, optNoRewrites, optLocalFeaturesOnly
     };

     foreach (string key in segmenterOnlyKeys)
     {
         props.Remove(key);
     }

     flags      = new SeqClassifierFlags(props);
     classifier = new CRFClassifier <CoreLabel>(flags);
 }
 /// <summary>
 /// Tokenizes each test string with a tokenizer from <paramref name="factory"/> and
 /// asserts that the token count and each token's word match the expected values.
 /// </summary>
 /// <param name="factory">Produces one tokenizer per test string.</param>
 /// <param name="testStrings">The inputs to tokenize.</param>
 /// <param name="resultsStrings">Expected words; row <c>i</c> belongs to <c>testStrings[i]</c>.</param>
 public virtual void RunTest <_T0>(ITokenizerFactory <_T0> factory, string[] testStrings, string[][] resultsStrings)
     where _T0 : IHasWord
 {
     for (int caseIndex = 0; caseIndex < testStrings.Length; ++caseIndex)
     {
         string[]              expectedWords = resultsStrings[caseIndex];
         ITokenizer <IHasWord> tokenizer     = factory.GetTokenizer(new StringReader(testStrings[caseIndex]));
         IList <IHasWord>      actualTokens  = tokenizer.Tokenize();

         NUnit.Framework.Assert.AreEqual(expectedWords.Length, actualTokens.Count);

         for (int wordIndex = 0; wordIndex < expectedWords.Length; ++wordIndex)
         {
             NUnit.Framework.Assert.AreEqual(expectedWords[wordIndex], actualTokens[wordIndex].Word());
         }
     }
 }
Exemple #19
0
 /// <summary>Copy constructor.</summary>
 /// <remarks>
 /// All settings, the flags and the classifier are copied by reference from
 /// <paramref name="other"/>; only the tokenizer factory is created anew.
 /// </remarks>
 /// <param name="other">The segmenter to copy.</param>
 public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
 {
     // Tokenization settings first: the factory created below reads them.
     isTokenized      = other.isTokenized;
     tokenizerOptions = other.tokenizerOptions;

     // ArabicTokenizerFactory is *not* threadsafe. Make a new copy.
     tf = GetTokenizerFactory();

     prefixMarker    = other.prefixMarker;
     suffixMarker    = other.suffixMarker;
     tedEvalPrefix   = other.tedEvalPrefix;
     hasDomainLabels = other.hasDomainLabels;
     domain          = other.domain;
     noRewrites      = other.noRewrites;
     flags           = other.flags;

     // CRFClassifier is threadsafe, so return a reference.
     classifier = other.classifier;
 }
Exemple #20
0
        /// <summary>
        /// Creates an index, substituting default implementations for any collaborator
        /// that is not supplied.
        /// </summary>
        /// <param name="options">Configuration handed to the index node factory.</param>
        /// <param name="indexNodeFactory">Node factory; a new <c>IndexNodeFactory</c> when <c>null</c>.</param>
        /// <param name="tokenizerFactory">Tokenizer factory; a new <c>TokenizerFactory</c> when <c>null</c>.</param>
        /// <param name="queryParser">Query parser; a new <c>QueryParser</c> when <c>null</c>.</param>
        public FullTextIndex(
            FullTextIndexConfiguration <TKey> options,
            IIndexNodeFactory indexNodeFactory = null,
            ITokenizerFactory tokenizerFactory = null,
            IQueryParser queryParser           = null)
        {
            if (indexNodeFactory is null)
            {
                indexNodeFactory = new IndexNodeFactory();
            }
            this.indexNodeFactory = indexNodeFactory;

            if (tokenizerFactory is null)
            {
                tokenizerFactory = new TokenizerFactory();
            }
            this.tokenizerFactory = tokenizerFactory;

            if (queryParser is null)
            {
                queryParser = new QueryParser();
            }
            this.queryParser = queryParser;

            // The node factory is configured before it creates the root node.
            this.indexNodeFactory.Configure(options);

            this.IdPool      = new IdPool <TKey>();
            this.FieldLookup = new IndexedFieldLookup();
            this.Root        = this.indexNodeFactory.CreateNode();
        }
        /// <summary>
        /// Tokenizes each entry of <c>untokInputs</c> with the ATB factory (plus three
        /// marker-removal options) and compares the space-joined result with the matching
        /// entry of <c>tokReferences</c>.
        /// </summary>
        public virtual void TestArabicTokenizer()
        {
            // A real test assertion rather than Debug.Assert, which is compiled out of release builds.
            NUnit.Framework.Assert.AreEqual(untokInputs.Length, tokReferences.Length, "Input and reference arrays differ in length");
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            for (int i = 0; i < untokInputs.Length; ++i)
            {
                string line = untokInputs[i];
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(line));
                IList <CoreLabel>      tokens    = tokenizer.Tokenize();
                string tokenizedLine             = SentenceUtils.ListToString(tokens);
                string reference = tokReferences[i];
                // NUnit order is (expected, actual, message); the JUnit-style order used before
                // compared the message text against the reference.
                NUnit.Framework.Assert.AreEqual(reference, tokenizedLine, "Tokenization deviates from reference");
            }
        }
        /// <summary>
        /// Tokenizes one fixed input with the ATB factory (plus three marker-removal
        /// options) and checks each token's begin and end character position against
        /// hard-coded reference offsets.
        /// </summary>
        public virtual void TestCharOffsets()
        {
            string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";

            int[] beginOffsets = new int[] { 0, 7, 11, 22 };
            int[] endOffsets   = new int[] { 6, 10, 21, 23 };
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
            IList <CoreLabel>      tokens    = tokenizer.Tokenize();

            // NUnit order is (expected, actual, message); the message was previously passed
            // first (JUnit order), which does not compare the intended values.
            NUnit.Framework.Assert.AreEqual(beginOffsets.Length, tokens.Count, "Number of tokens doesn't match reference");
            for (int i = 0; i < beginOffsets.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual(beginOffsets[i], tokens[i].BeginPosition(), "Char begin offset deviates from reference");
                NUnit.Framework.Assert.AreEqual(endOffsets[i], tokens[i].EndPosition(), "Char end offset deviates from reference");
            }
        }
        /// <summary>
        /// Registers one field: rejects duplicate names, allocates the next id, and records
        /// the name-to-details and id-to-name mappings.
        /// </summary>
        /// <param name="fieldOptions">Supplies the field name and, optionally, its own tokenization options.</param>
        /// <param name="tokenizerFactory">Creates the tokenizer stored with the field.</param>
        /// <param name="defaultTokenizationOptions">Used when the field has no tokenization options of its own.</param>
        /// <exception cref="LiftiException">
        /// The field name is already registered, or the allocated id exceeds <see cref="byte.MaxValue"/>.
        /// </exception>
        private void RegisterField(IFieldTokenizationOptions fieldOptions, ITokenizerFactory tokenizerFactory, TokenizationOptions defaultTokenizationOptions)
        {
            var name = fieldOptions.Name;

            if (this.fieldToDetailsLookup.ContainsKey(name))
            {
                throw new LiftiException(ExceptionMessages.FieldNameAlreadyUsed, name);
            }

            // The id is only allocated once the name is known to be unused.
            var allocatedId = Interlocked.Increment(ref nextId);

            if (allocatedId > byte.MaxValue)
            {
                throw new LiftiException(ExceptionMessages.MaximumDistinctFieldsIndexReached);
            }

            var fieldId   = (byte)allocatedId;
            var tokenizer = tokenizerFactory.Create(fieldOptions.TokenizationOptions ?? defaultTokenizationOptions);

            this.fieldToDetailsLookup[name] = new IndexedFieldDetails(fieldId, tokenizer);
            this.idToFieldLookup[fieldId]   = name;
        }
Exemple #24
0
 /// <summary>
 /// Creates the annotator: decides from <c>tokenize.language</c> whether a segmenter
 /// annotator is needed, reads the verbosity setting and initializes the tokenizer factory.
 /// </summary>
 /// <param name="verbose">Default for <c>tokenize.verbose</c> when the property is absent.</param>
 /// <param name="props">Configuration; an empty Properties object is used when <c>null</c>.</param>
 /// <param name="options">Passed through to <c>InitFactory</c>.</param>
 /// <exception cref="System.Exception">
 /// The language is reported as a segmenter language but is neither Arabic nor Chinese.
 /// </exception>
 public TokenizerAnnotator(bool verbose, Properties props, string options)
 {
     props = props ?? new Properties();

     // check if segmenting must be done
     string language       = props.GetProperty("tokenize.language");
     bool   needsSegmenter = language != null && LanguageInfo.IsSegmenterLanguage(language);

     useSegmenter = needsSegmenter;
     if (!needsSegmenter)
     {
         segmenterAnnotator = null;
     }
     else
     {
         var humanLanguage = LanguageInfo.GetLanguageFromString(language);

         if (humanLanguage == LanguageInfo.HumanLanguage.Arabic)
         {
             segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
         }
         else if (humanLanguage == LanguageInfo.HumanLanguage.Chinese)
         {
             segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
         }
         else
         {
             segmenterAnnotator = null;
             throw new Exception("No segmenter implemented for: " + humanLanguage);
         }
     }

     Verbose = PropertiesUtils.GetBool(props, "tokenize.verbose", verbose);

     TokenizerAnnotator.TokenizerType type = TokenizerAnnotator.TokenizerType.GetTokenizerType(props);
     factory = InitFactory(type, props, options);
 }
Exemple #25
0
        /// <summary>
        /// Creates an index from fully specified collaborators, builds the field lookup from
        /// all configured fields and creates the root node.
        /// </summary>
        /// <param name="indexOptions">Stored as given; not validated here.</param>
        /// <param name="itemTokenizationOptions">Supplies the configured fields.</param>
        /// <param name="indexNodeFactory">Creates the root node.</param>
        /// <param name="tokenizerFactory">Passed to the field lookup.</param>
        /// <param name="queryParser">Stored for later use.</param>
        /// <param name="defaultTokenizationOptions">Passed to the field lookup as the fallback options.</param>
        /// <param name="indexModifiedActions">Optional; stored as given and may be <c>null</c>.</param>
        /// <exception cref="ArgumentNullException">
        /// Any of <paramref name="itemTokenizationOptions"/>, <paramref name="indexNodeFactory"/>,
        /// <paramref name="tokenizerFactory"/>, <paramref name="queryParser"/> or
        /// <paramref name="defaultTokenizationOptions"/> is <c>null</c>.
        /// </exception>
        internal FullTextIndex(
            IndexOptions indexOptions,
            ConfiguredItemTokenizationOptions <TKey> itemTokenizationOptions,
            IIndexNodeFactory indexNodeFactory,
            ITokenizerFactory tokenizerFactory,
            IQueryParser queryParser,
            TokenizationOptions defaultTokenizationOptions,
            Func <IIndexSnapshot <TKey>, Task>[]?indexModifiedActions)
        {
            // Guards are checked in the same order as before, so the first null argument is reported.
            if (itemTokenizationOptions is null)
            {
                throw new ArgumentNullException(nameof(itemTokenizationOptions));
            }

            if (indexNodeFactory is null)
            {
                throw new ArgumentNullException(nameof(indexNodeFactory));
            }

            if (tokenizerFactory is null)
            {
                throw new ArgumentNullException(nameof(tokenizerFactory));
            }

            if (queryParser is null)
            {
                throw new ArgumentNullException(nameof(queryParser));
            }

            if (defaultTokenizationOptions is null)
            {
                throw new ArgumentNullException(nameof(defaultTokenizationOptions));
            }

            this.indexOptions               = indexOptions;
            this.itemTokenizationOptions    = itemTokenizationOptions;
            this.IndexNodeFactory           = indexNodeFactory;
            this.tokenizerFactory           = tokenizerFactory;
            this.queryParser                = queryParser;
            this.defaultTokenizationOptions = defaultTokenizationOptions;
            this.indexModifiedActions       = indexModifiedActions;

            this.idPool = new IdPool <TKey>();

            var configuredFields = itemTokenizationOptions.GetAllConfiguredFields();
            this.FieldLookup = new IndexedFieldLookup(configuredFields, tokenizerFactory, defaultTokenizationOptions);

            this.Root = this.IndexNodeFactory.CreateRootNode();
        }
 /// <summary>Sets the tokenizer factory used to chop up text into words for the documents.</summary>
 /// <typeparam name="_T0">Token type produced by the factory.</typeparam>
 /// <param name="tokenizerFactory">The factory to store; it is not validated.</param>
 public virtual void SetTokenizerFactory <_T0>(ITokenizerFactory <_T0> tokenizerFactory)
     where _T0 : IHasWord
     => this.tokenizerFactory = tokenizerFactory;
Exemple #27
0
 /// <summary>
 /// Convenience constructor that delegates to the five-argument overload with
 /// domain labels disabled and the fixed domain string "123".
 /// </summary>
 /// <param name="hasSegMarkers">if true, input has segmentation markers</param>
 /// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
 /// <param name="tokFactory">a TokenizerFactory for the input</param>
 public ArabicDocumentReaderAndWriter(bool hasSegMarkers, bool hasTags, ITokenizerFactory<CoreLabel> tokFactory)
     : this(hasSegMarkers, hasTags, hasDomainLabels: false, domain: "123", tokFactory: tokFactory)
 {
 }
Exemple #28
0
 /// <summary>
 /// Convenience constructor that forwards all of its arguments to the
 /// six-argument overload, passing <c>false</c> for that overload's fifth
 /// parameter (its meaning is not visible from this constructor).
 /// </summary>
 /// <param name="hasSegMarkers">if true, input has segmentation markers</param>
 /// <param name="hasTags">if true, input has morphological analyses separated by tagDelimiter.</param>
 /// <param name="hasDomainLabels">
 /// if true, input has a whitespace-terminated domain at the beginning
 /// of each line of text
 /// </param>
 /// <param name="domain">
 /// domain string forwarded unchanged to the six-argument overload
 /// (presumably the domain used when the input carries no labels; confirm against that overload)
 /// </param>
 /// <param name="tokFactory">a TokenizerFactory for the input</param>
 public ArabicDocumentReaderAndWriter(bool hasSegMarkers, bool hasTags, bool hasDomainLabels, string domain, ITokenizerFactory <CoreLabel> tokFactory)
     : this(hasSegMarkers, hasTags, hasDomainLabels, domain, false, tokFactory)
 {
 }
Exemple #29
0
        /// <summary>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Leading arguments that start with <c>-</c> are treated as options:
        /// <c>-train</c>, <c>-encoding</c>, <c>-loadFromSerializedFile</c>,
        /// <c>-saveToSerializedFile</c>, <c>-saveToTextFile</c> and <c>-treebank</c>.
        /// Any other option is offered to the language-pack parameters via
        /// <c>SetOptionFlag</c>; if it is not consumed there it is logged and skipped.
        /// The remaining arguments are positional and are consumed in order.
        /// </para>
        /// <para>
        /// <i>Implementation note:</i> This method is largely cloned from
        /// LexicalizedParser's main method.  Should we try to have it be able
        /// to train segmenters to stop things going out of sync?
        /// </para>
        /// </remarks>
        /// <param name="args">The command-line arguments.</param>
        /// <exception cref="Exception">
        /// Thrown when <c>-train</c> is given without a treebank path, or when a test
        /// filter was given but neither a test path nor a training path is known.
        /// </exception>
        public static void Main(string[] args)
        {
            bool train = false;
            bool saveToSerializedFile = false;
            bool saveToTextFile = false;
            string serializedInputFileOrUrl = null;
            // Never assigned: loading from a text file is not supported (see below).
            string textInputFileOrUrl = null;
            string serializedOutputFileOrUrl = null;
            string textOutputFileOrUrl = null;
            string treebankPath = null;
            Treebank testTreebank = null;
            // Treebank tuneTreebank = null;
            string testPath = null;
            IFileFilter testFilter = null;
            IFileFilter trainFilter = null;
            string encoding = null;
            // Kept from the original even though it is not used further in this method.
            IFunction<IList<IHasWord>, IList<IHasWord>> escaper = new ChineseEscaper();
            int argIndex = 0;

            if (args.Length < 1)
            {
                log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
                return;
            }
            Options op = new Options();

            op.tlpParams = new ChineseTreebankParserParams();
            // while loop through option arguments; the length check keeps an
            // empty-string argument from throwing when its first character is read
            while (argIndex < args.Length && args[argIndex].Length > 0 && args[argIndex][0] == '-')
            {
                string option = args[argIndex];
                if (Sharpen.Runtime.EqualsIgnoreCase(option, "-train"))
                {
                    train = true;
                    saveToSerializedFile = true;
                    int numSubArgs = NumSubArgs(args, argIndex);
                    argIndex++;
                    if (numSubArgs > 1)
                    {
                        treebankPath = args[argIndex];
                        argIndex++;
                    }
                    else
                    {
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    }
                    if (numSubArgs == 2)
                    {
                        trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                    else if (numSubArgs >= 3)
                    {
                        trainFilter = ParseRangeOrRangesFilter(args, ref argIndex);
                    }
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(option, "-encoding"))
                {
                    // sets encoding for TreebankLangParserParams
                    encoding = args[argIndex + 1];
                    op.tlpParams.SetInputEncoding(encoding);
                    op.tlpParams.SetOutputEncoding(encoding);
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(option, "-loadFromSerializedFile"))
                {
                    // load the parser from a binary serialized file
                    // the next argument must be the path to the parser file
                    serializedInputFileOrUrl = args[argIndex + 1];
                    argIndex += 2;
                }
                // doesn't make sense to load from TextFile -pichuan
                //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
                //        // load the parser from declarative text file
                //        // the next argument must be the path to the parser file
                //        textInputFileOrUrl = args[argIndex + 1];
                //        argIndex += 2;
                else if (Sharpen.Runtime.EqualsIgnoreCase(option, "-saveToSerializedFile"))
                {
                    saveToSerializedFile = true;
                    serializedOutputFileOrUrl = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(option, "-saveToTextFile"))
                {
                    // save the parser to declarative text file
                    saveToTextFile = true;
                    textOutputFileOrUrl = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(option, "-treebank"))
                {
                    // the next argument is the treebank path and range for testing
                    int numSubArgs = NumSubArgs(args, argIndex);
                    argIndex++;
                    if (numSubArgs == 1)
                    {
                        testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                    else if (numSubArgs > 1)
                    {
                        testPath = args[argIndex++];
                        if (numSubArgs == 2)
                        {
                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                        }
                        else if (numSubArgs >= 3)
                        {
                            testFilter = ParseRangeOrRangesFilter(args, ref argIndex);
                        }
                    }
                }
                else
                {
                    // Let the language pack consume the flag; if it did not advance,
                    // the option is unknown and is skipped.
                    int j = op.tlpParams.SetOptionFlag(args, argIndex);
                    if (j == argIndex)
                    {
                        log.Info("Unknown option ignored: " + option);
                        j++;
                    }
                    argIndex = j;
                }
            }
            // end while loop through arguments
            ITreebankLangParserParams tlpParams = op.tlpParams;

            // all other arguments are order dependent and
            // are processed in order below
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
            if (!train && op.testOptions.verbose)
            {
                // DateTime.Now, not new DateTime(): the latter is always 0001-01-01.
                System.Console.Out.WriteLine("Currently " + DateTime.Now);
                PrintArgs(args, System.Console.Out);
            }
            if (train)
            {
                PrintArgs(args, System.Console.Out);
                // so we train a parser using the treebank
                if (treebankPath == null)
                {
                    // the next arg must be the treebank path, since it wasn't given earlier
                    treebankPath = args[argIndex];
                    argIndex++;
                    if (args.Length > argIndex + 1)
                    {
                        // the next two args might be the range, or one ranges expression
                        trainFilter = ParseRangeOrRangesFilter(args, ref argIndex);
                    }
                }
                Treebank trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
                IIndex<string> wordIndex = new HashIndex<string>();
                IIndex<string> tagIndex = new HashIndex<string>();
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
            }
            else if (textInputFileOrUrl == null)
            {
                // so we load the segmenter from a text grammar file
                // XXXXX fix later -pichuan
                //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
                // so we load a serialized segmenter
                if (serializedInputFileOrUrl == null)
                {
                    // the next argument must be the path to the serialized parser
                    serializedInputFileOrUrl = args[argIndex];
                    argIndex++;
                }
                try
                {
                    cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
                }
                catch (ArgumentException)
                {
                    log.Info("Error loading segmenter, exiting...");
                    System.Environment.Exit(0);
                }
            }
            // the following has to go after reading parser to make sure
            // op and tlpParams are the same for train and test
            TreePrint treePrint = op.testOptions.TreePrint(tlpParams);

            if (testFilter != null)
            {
                if (testPath == null)
                {
                    if (treebankPath == null)
                    {
                        throw new Exception("No test treebank path specified...");
                    }
                    log.Info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                    testPath = treebankPath;
                }
                testTreebank = tlpParams.TestMemoryTreebank();
                testTreebank.LoadPath(testPath, testFilter);
            }
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
            // at this point we should be sure that op.tlpParams is
            // set appropriately (from command line, or from grammar file),
            // and will never change again.  We also set the tlpParams of the
            // LexicalizedParser instance to be the same object.  This is
            // redundancy that we probably should take out eventually.
            //
            // -- Roger
            if (op.testOptions.verbose)
            {
                log.Info("Lexicon is " + cs.GetType().FullName);
            }
            PrintWriter pwOut = tlpParams.Pw();
            PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

            // Now what do we do with the parser we've made
            if (saveToTextFile)
            {
                // save the parser to textGrammar format
                if (textOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToText(cs, textOutputFileOrUrl);
                }
                else
                {
                    log.Info("Usage: must specify a text segmenter data output path");
                }
            }
            if (saveToSerializedFile)
            {
                if (serializedOutputFileOrUrl == null && argIndex < args.Length)
                {
                    // the next argument must be the path to serialize to
                    serializedOutputFileOrUrl = args[argIndex];
                    argIndex++;
                }
                if (serializedOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
                }
                else if (textOutputFileOrUrl == null && testTreebank == null)
                {
                    // no saving/parsing request has been specified
                    log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + " -train trainFilesPath [start stop] serializedParserFilename");
                }
            }
            /* --------------------- Testing part!!!! ----------------------- */
            //      printOptions(false, op);
            if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
            {
                // test parser on treebank
                if (testTreebank == null)
                {
                    // the next argument is the treebank path and range for testing
                    testTreebank = tlpParams.TestMemoryTreebank();
                    if (args.Length < argIndex + 4)
                    {
                        testTreebank.LoadPath(args[argIndex + 1]);
                    }
                    else
                    {
                        int testlow = System.Convert.ToInt32(args[argIndex + 2]);
                        int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                        testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
                    }
                }
            }
        }

        /// <summary>
        /// Builds a file filter from the arguments at <paramref name="argIndex"/>: a
        /// numeric low/high pair if the next two arguments are both integers,
        /// otherwise a single ranges expression.
        /// </summary>
        /// <param name="args">The command-line arguments.</param>
        /// <param name="argIndex">
        /// Index of the first candidate argument; advanced by 2 when a low/high pair
        /// is consumed and by 1 when a ranges expression is consumed.
        /// </param>
        /// <returns>The filter built from the consumed argument(s).</returns>
        private static IFileFilter ParseRangeOrRangesFilter(string[] args, ref int argIndex)
        {
            int low;
            int high;
            // TryParse instead of catching an exception: Convert.ToInt32 reports bad
            // input with FormatException/OverflowException, so the previous
            // NumberFormatException handler never ran and the fallback was dead.
            if (int.TryParse(args[argIndex], out low) && int.TryParse(args[argIndex + 1], out high))
            {
                IFileFilter range = new NumberRangeFileFilter(low, high, true);
                argIndex += 2;
                return range;
            }
            // maybe it's a ranges expression?
            IFileFilter ranges = new NumberRangesFileFilter(args[argIndex], true);
            argIndex++;
            return ranges;
        }
 /// <summary>
 /// Creates a factory that keeps the three supplied component factories for later use.
 /// </summary>
 /// <param name="sdFact">Sentence detector factory to store.</param>
 /// <param name="tokFact">Tokenizer factory to store.</param>
 /// <param name="stemFact">Stemmer factory to store.</param>
 public TextProcessorFactory(
     ISentenceDetectorFactory sdFact,
     ITokenizerFactory tokFact,
     IStemmerFactory stemFact)
 {
     // Arguments are stored as given; no validation is performed.
     this.sdFact = sdFact;
     this.tokFact = tokFact;
     this.stemFact = stemFact;
 }