        /// <summary>
        /// Reads a single tree in standard Penn Treebank format from the
        /// input stream.
        /// </summary>
        /// <remarks>
        /// Reads a single tree in standard Penn Treebank format from the
        /// input stream. The method supports additional parentheses around the
        /// tree (an unnamed ROOT node) so long as they are balanced. If the token stream
        /// ends before the current tree is complete, then the method will throw an
        /// <c>IOException</c>.
        /// <p>
        /// Note that the method will skip malformed trees and attempt to
        /// read additional trees from the input stream. It is possible, however,
        /// that a malformed tree will corrupt the token stream. In this case,
        /// an <c>IOException</c> will eventually be thrown.
        /// </remarks>
        /// <returns>A single tree, or <c>null</c> at end of token stream.</returns>
        /// <exception cref="System.IO.IOException">If the token stream ends before the current tree is complete.</exception>
        public virtual Tree ReadTree()
        {
            Tree t = null;

            while (tokenizer.MoveNext() && t == null)
            {
                // Set up the pushdown-automaton (PDA) state for the next tree
                this.currentTree = null;
                this.stack       = new List <Tree>();
                try
                {
                    t = GetTreeFromInputStream();
                }
                catch (NoSuchElementException)
                {
                    throw new IOException("End of token stream encountered before parsing could complete.");
                }
                if (t != null)
                {
                    // cdm 20100618: Don't do this!  This was never the historical behavior!!!
                    // Escape empty trees e.g. (())
                    // while(t != null && (t.value() == null || t.value().equals("")) && t.numChildren() <= 1)
                    //   t = t.firstChild();
                    if (treeNormalizer != null && treeFactory != null)
                    {
                        t = treeNormalizer.NormalizeWholeTree(t, treeFactory);
                    }
                    if (t != null)
                    {
                        t.IndexLeaves(true);
                    }
                }
            }
            return(t);
        }
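A minimal usage sketch for ReadTree. The factory, normalizer, and tokenizer classes named here (LabeledScoredTreeFactory, TreeNormalizer, PennTreebankTokenizer) are assumptions drawn from the Stanford NLP tree package; any implementations accepted by the PennTreeReader constructor (shown later on this page) would do.

        // Hypothetical usage: read every tree from a bracketed parse string.
        Reader r = new StringReader("((S (NP (DT The) (NN dog)) (VP (VBZ barks))))");
        var treeReader = new PennTreeReader(r, new LabeledScoredTreeFactory(),
                                            new TreeNormalizer(), new PennTreebankTokenizer(r));
        for (Tree t = treeReader.ReadTree(); t != null; t = treeReader.ReadTree())
        {
            System.Console.Out.WriteLine(t);   // ReadTree() returns null once the token stream is exhausted
        }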
Example 2
        /// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Modern Standard French.
        /// Performs punctuation splitting and light tokenization by default.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It assumes that the input
        /// file is delimited by the system line separator. The output will be equivalently
        /// delimited.
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Lexer options
            ITokenizerFactory <CoreLabel> tf = options.Contains("ftb") ? FrenchTokenizer.FtbFactory() : FrenchTokenizer.Factory();
            string orthoOptions = options.GetProperty("options", string.Empty);

            // When called from this main method, split on newline. No options for
            // more granular sentence splitting.
            orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
            tf.SetOptions(orthoOptions);
            // Other options
            string encoding = options.GetProperty("encoding", "UTF-8");
            bool   toLower  = PropertiesUtils.GetBool(options, "lowerCase", false);
            // Read the file from stdin
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

            try
            {
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(FrenchLexer.NewlineToken))
                    {
                        ++nLines;
                        printSpace = false;
                        System.Console.Out.WriteLine();
                    }
                    else
                    {
                        if (printSpace)
                        {
                            System.Console.Out.Write(" ");
                        }
                        string outputToken = toLower ? word.ToLower(Locale.French) : word;
                        System.Console.Out.Write(outputToken);
                        printSpace = true;
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                log.Error(e);
            }
            long   elapsedTime = Runtime.NanoTime() - startTime;
            double linesPerSec = (double)nLines / (elapsedTime / 1e9);

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
        }
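The factory API used by Main can also be driven programmatically. A minimal sketch, assuming only the calls already used above (the sample sentence is illustrative):

        // Sketch: tokenize an in-memory string instead of stdin.
        ITokenizerFactory <CoreLabel> tf = FrenchTokenizer.Factory();
        tf.SetOptions("tokenizeNLs");
        ITokenizer <CoreLabel> tok = tf.GetTokenizer(new StringReader("L'homme a mangé une pomme."));
        while (tok.MoveNext())
        {
            System.Console.Out.WriteLine(tok.Current.Word());   // e.g. the clitic "L'" is split from "homme"
        }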
        /// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).
        /// Performs punctuation splitting and light tokenization by default.
        /// Orthographic normalization options are available, and can be enabled with
        /// command line options.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It normalizes non-printing
        /// line separators across platforms and prints the system default line separator
        /// to the output.
        /// <p>
        /// The following normalization options are provided:
        /// <ul>
        /// <li>
        /// <c>useUTF8Ellipsis</c>
        /// : Replaces sequences of three or more full stops with \u2026</li>
        /// <li>
        /// <c>normArDigits</c>
        /// : Convert Arabic digits to ASCII equivalents</li>
        /// <li>
        /// <c>normArPunc</c>
        /// : Convert Arabic punctuation to ASCII equivalents</li>
        /// <li>
        /// <c>normAlif</c>
        /// : Change all alif forms to bare alif</li>
        /// <li>
        /// <c>normYa</c>
        /// : Map ya to alif maqsura</li>
        /// <li>
        /// <c>removeDiacritics</c>
        /// : Strip all diacritics</li>
        /// <li>
        /// <c>removeTatweel</c>
        /// : Strip tatweel elongation character</li>
        /// <li>
        /// <c>removeQuranChars</c>
        /// : Remove diacritics that appear in the Quran</li>
        /// <li>
        /// <c>removeProMarker</c>
        /// : Remove the ATB null pronoun marker</li>
        /// <li>
        /// <c>removeSegMarker</c>
        /// : Remove the ATB clitic segmentation marker</li>
        /// <li>
        /// <c>removeMorphMarker</c>
        /// : Remove the ATB morpheme boundary markers</li>
        /// <li>
        /// <c>removeLengthening</c>
        /// : Replace all sequences of three or more identical (non-period) characters with one copy</li>
        /// <li>
        /// <c>atbEscaping</c>
        /// : Replace left/right parentheses with ATB escape characters</li>
        /// </ul>
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length > 0 && args[0].Contains("help"))
            {
                System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
                System.Console.Error.Printf("%nOptions:%n");
                log.Info("   -help : Print this message. See javadocs for all normalization options.");
                log.Info("   -atb  : Tokenization for the parsing experiments in Green and Manning (2010)");
                System.Environment.Exit(-1);
            }
            // Process normalization options
            Properties tokenizerOptions      = StringUtils.ArgsToProperties(args);
            ITokenizerFactory <CoreLabel> tf = tokenizerOptions.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory();

            foreach (string option in tokenizerOptions.StringPropertyNames())
            {
                tf.SetOptions(option);
            }
            // Replace line separators with a token so that we can
            // count lines
            tf.SetOptions("tokenizeNLs");
            // Read the file
            int nLines  = 0;
            int nTokens = 0;

            try
            {
                string encoding = "UTF-8";
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(ArabicLexer.NewlineToken))
                    {
                        ++nLines;
                        printSpace = false;
                        System.Console.Out.WriteLine();
                    }
                    else
                    {
                        if (printSpace)
                        {
                            System.Console.Out.Write(" ");
                        }
                        System.Console.Out.Write(word);
                        printSpace = true;
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
        }
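The normalization options listed above are applied the same way Main applies them: one SetOptions call per option name. A minimal sketch, assuming only the calls shown in Main (the sample string is illustrative):

        // Sketch: apply two of the documented normalization options programmatically.
        ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();
        tf.SetOptions("normArDigits");       // Arabic-Indic digits -> ASCII equivalents
        tf.SetOptions("removeDiacritics");   // strip all diacritics
        string arabicText = "كتب ١٢٣";       // "he wrote" followed by Arabic-Indic digits 123
        ITokenizer <CoreLabel> tok = tf.GetTokenizer(new StringReader(arabicText));
        while (tok.MoveNext())
        {
            System.Console.Out.WriteLine(tok.Current.Word());
        }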
Example 4
        private YAnchor GetAnchorValueDependent(ITokenizer tokenizer)
        {
            switch (tokenizer.Current.Value.Kind)
            {
            // Check for a mapping entry of the form "name : &anchor value"
            case TokenKind.Indent when tokenizer.Current.Next?.Next?.Next?.Value.Kind == TokenKind.Anchor:
                tokenizer.MoveNext();
                tokenizer.MoveNext();
                tokenizer.MoveNext();
                var name = tokenizer.Current.Value.Value;
                tokenizer.MoveNext();
                var anchor = new YAnchor(this.GetNodeValue(tokenizer));
                tokenizer.Anchors[name] = anchor;
                return(anchor);

            default:
                return(null);
            }
        }
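For reference, the pattern that the Indent case recognizes corresponds to a standard YAML anchor on a mapping value; the anchor is registered in tokenizer.Anchors so that a later alias can resolve to it (see GetAliasValueDependent below). An illustrative input, built as a plain string:

        // Standard YAML anchor/alias pair (illustrative, not parser-specific):
        string yaml =
            "defaults: &base\n" +   // matched here; registers tokenizer.Anchors["base"]
            "  timeout: 30\n" +
            "mirror: *base\n";      // a later alias token resolves through the same dictionary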
Example 5
        private YScalar GetScalarValueDependent(ITokenizer tokenizer)
        {
            switch (tokenizer.Current.Value.Kind)
            {
            case TokenKind.StringDouble:
            case TokenKind.StringSingle:
            case TokenKind.StringFolding:
            case TokenKind.StringLiteral:
            {
                var kind  = tokenizer.Current.Value.Kind;
                var value = tokenizer.Current.Value.Value;

                if (tokenizer.Current.Value.Kind == TokenKind.StringDouble)
                {
                    value = YScalar.UnescapeString(value);
                }

                tokenizer.MoveNext();

                var style = kind == TokenKind.StringFolding || kind == TokenKind.StringLiteral ? YNodeStyle.Block : YNodeStyle.Flow;
                return(new YScalar(value, style));
            }

            case TokenKind.StringPlain:
            {
                var value = tokenizer.Current.Value.Value;

                tokenizer.MoveNext();

                if (string.IsNullOrEmpty(value))
                {
                    return(new YScalar(null));
                }

                return(value.Equals("null", StringComparison.OrdinalIgnoreCase) ? new YScalar(null) : new YScalar(value));
            }

            default:
                return(null);
            }
        }
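The cases above line up with the standard YAML scalar styles. An illustration of which inputs reach which branch (standard YAML; the TokenKind assignments are this tokenizer's):

        // Which scalar forms reach which case above:
        string scalars =
            "plain:   bare words\n" +        // StringPlain ("null" or empty becomes YScalar(null))
            "single:  'kept verbatim'\n" +   // StringSingle
            "double:  \"tab:\\there\"\n" +   // StringDouble (UnescapeString is applied)
            "literal: |\n  line kept\n" +    // StringLiteral -> YNodeStyle.Block
            "folded:  >\n  lines joined\n";  // StringFolding -> YNodeStyle.Block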
Example 6
        private INode ParseMultiplyDivide()
        {
            var left = ParseUnary();

            while (true)
            {
                Func <double, double, double> operation = _tokenizer.Token switch
                {
                    Token.Multiply => ((a, b) => a * b),
                    Token.Divide => ((a, b) => a / b),
                    _ => null
                };

                if (operation == null)
                {
                    return(left);
                }
                _tokenizer.MoveNext();
                var right = ParseUnary();
                left = new BinaryOperation(left, right, operation);
            }
        }
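This is one level of a classic recursive-descent precedence chain: ParseUnary binds tighter, and an addition/subtraction level would sit directly above it. A hypothetical sketch of that next level, mirroring the pattern above (Token.Add and Token.Subtract are assumed member names):

        private INode ParseAddSubtract()
        {
            var left = ParseMultiplyDivide();   // * and / bind tighter than + and -

            while (true)
            {
                Func <double, double, double> operation = _tokenizer.Token switch
                {
                    Token.Add => ((a, b) => a + b),
                    Token.Subtract => ((a, b) => a - b),
                    _ => null
                };

                if (operation == null)
                {
                    return(left);
                }
                _tokenizer.MoveNext();
                var right = ParseMultiplyDivide();
                left = new BinaryOperation(left, right, operation);
            }
        }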
        /// <summary>Read parse trees from a Reader.</summary>
        /// <param name="in">Reader</param>
        /// <param name="tf">TreeFactory -- factory to create some kind of Tree</param>
        /// <param name="tn">the method of normalizing trees</param>
        /// <param name="st">Tokenizer that divides up Reader</param>
        public PennTreeReader(Reader @in, ITreeFactory tf, TreeNormalizer tn, ITokenizer <string> st)
        {
            // misuse a list as a stack, since we want to avoid the synchronized and old Stack, but don't need the power and JDK 1.6 dependency of a Deque
            reader         = @in;
            treeFactory    = tf;
            treeNormalizer = tn;
            tokenizer      = st;
            // check for whacked out headers still present in Brown corpus in Treebank 3
            string first = (st.MoveNext() ? st.Peek() : null);

            if (first != null && first.StartsWith("*x*x*x"))
            {
                int foundCount = 0;
                while (foundCount < 4 && st.MoveNext())
                {
                    first = st.Current;
                    if (first != null && first.StartsWith("*x*x*x"))
                    {
                        foundCount++;
                    }
                }
            }
        }
Example 8
        private YKeyValuePair ParseMappingKey(ITokenizer tokenizer)
        {
            switch (tokenizer.Current.Value.Kind)
            {
            case TokenKind.MappingKey:
            {
                tokenizer.MoveNext();

                var key = this.GetNodeKey(tokenizer);

                if (tokenizer.Current.Value.Kind != TokenKind.MappingValue)
                {
                    return(new YKeyValuePair(key, new YScalar(null)));
                }

                tokenizer.MoveNext();
                var keyValuePair = new YKeyValuePair(key);
                var value        = this.GetNodeValue(tokenizer);
                keyValuePair.Value = value;

                return(keyValuePair);
            }

            default:
            {
                var key = this.GetNodeKey(tokenizer);
                tokenizer.MoveNext();

                var keyValuePair = new YKeyValuePair(key);
                var value        = this.GetNodeValue(tokenizer);
                keyValuePair.Value = value;

                return(keyValuePair);
            }
            }
        }
Example 9
        public ParserTree ParseStringToTree(string input)
        {
            tokenizer = new BTokenizer(input);

            while (tokenizer.MoveNext())
            {
                string currentToken = tokenizer.Current;

                _internalStack.DoPush(new TerminalNode(currentToken));

                while (Reduce()) { }
            }

            ParserTree tree = new ParserTree(_grammar, _internalStack);
            return tree;
        }
        private YDocument GetDocumentValueDependent(ITokenizer tokenizer)
        {
            if (tokenizer.Current.Value.Kind != TokenKind.Document)
            {
                return(null);
            }

            tokenizer.MoveNext();
            var items = new List <YNode>();

            while (tokenizer.Current.Value.Kind != TokenKind.Document && tokenizer.Current.Value.Kind != TokenKind.Eof)
            {
                items.Add(this.GetNodeValue(tokenizer));
            }

            return(new YDocument(YNodeStyle.Block, items.ToArray()));
        }
Example 11
        private YAlias GetAliasValueDependent(ITokenizer tokenizer)
        {
            if (tokenizer.Current.Value.Kind != TokenKind.Alias)
            {
                return(null);
            }

            var anchorName = tokenizer.Current.Value.Value;

            if (!tokenizer.Anchors.ContainsKey(anchorName))
            {
                throw ParseException.Tokenizer(tokenizer, $"Anchor not found: {anchorName}");
            }

            var anchorValue = tokenizer.Anchors[anchorName];

            tokenizer.MoveNext();
            return(new YAlias(anchorName, anchorValue));
        }
        private YSequence GetSequenceValueDependent(ITokenizer tokenizer)
        {
            switch (tokenizer.Current.Value.Kind)
            {
            case TokenKind.Indent when tokenizer.Current.Next?.Value.Kind == TokenKind.SequenceValue:
            {
                var sequenceNode = new YSequence(YNodeStyle.Block);
                var items        = new List <YNode>();
                tokenizer.MoveNext();

                while (tokenizer.Current.Value.Kind != TokenKind.Unindent && tokenizer.Current.Value.Kind != TokenKind.Eof)
                {
                    if (tokenizer.Current.Value.Kind != TokenKind.SequenceValue)
                    {
                        throw ParseException.UnexpectedToken(tokenizer, TokenKind.SequenceValue);
                    }

                    tokenizer.MoveNext();
                    items.Add(this.GetNodeValue(tokenizer));
                }

                if (tokenizer.Current.Value.Kind == TokenKind.Unindent)
                {
                    tokenizer.MoveNext();
                }

                sequenceNode.Add(items.ToArray());

                return(sequenceNode);
            }

            case TokenKind.SequenceBegin:
            {
                var sequenceNode = new YSequence(YNodeStyle.Flow);
                var items        = new List <YNode>();
                tokenizer.MoveNext();

                do
                {
                    if (tokenizer.Current.Value.Kind == TokenKind.SequenceEnd)
                    {
                        break;
                    }

                    items.Add(this.GetNodeValue(tokenizer));
                } while (tokenizer.Current.Value.Kind == TokenKind.ItemDelimiter && tokenizer.MoveNext());

                if (tokenizer.Current.Value.Kind != TokenKind.SequenceEnd)
                {
                    throw ParseException.UnexpectedToken(tokenizer, TokenKind.SequenceEnd);
                }

                tokenizer.MoveNext();
                sequenceNode.Add(items.ToArray());

                return(sequenceNode);
            }

            default:
                return(null);
            }
        }
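The two cases correspond to YAML's block and flow sequence syntaxes. An illustration of inputs that reach each branch (standard YAML):

        // Block style: Indent followed by SequenceValue tokens.
        string blockSeq =
            "items:\n" +
            "  - one\n" +
            "  - two\n";
        // Flow style: SequenceBegin, ItemDelimiter-separated values, SequenceEnd.
        string flowSeq = "items: [one, two]\n";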
Example 13
        /// <summary>
        /// usage: java ChineseDocumentToSentenceProcessor [-segmentIBM]
        /// -file filename [-encoding encoding]
        /// <p>
        /// The -segmentIBM option is for IBM GALE-specific splitting of an
        /// XML element into sentences.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            //String encoding = "GB18030";
            Properties props = StringUtils.ArgsToProperties(args);
            // log.info("Here are the properties:");
            // props.list(System.err);
            bool alwaysAddS = props.Contains("alwaysAddS");

            Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor cp;
            if (!props.Contains("file"))
            {
                log.Info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
                return;
            }
            cp = new Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor();
            if (props.Contains("encoding"))
            {
                log.Info("WARNING: for now the default encoding is " + cp.encoding + ". It's not changeable for now");
            }
            string input = IOUtils.SlurpFileNoExceptions(props.GetProperty("file"), cp.encoding);

            // String input = StringUtils.slurpGBURLNoExceptions(new URL(props.getProperty("file")));
            if (props.Contains("segmentIBM"))
            {
                ITokenizer <Word> tok         = WhitespaceTokenizer.NewWordWhitespaceTokenizer(new StringReader(input), true);
                string            parseInside = props.GetProperty("parseInside");
                if (parseInside == null)
                {
                    parseInside = string.Empty;
                }
                Pattern       p1;
                Pattern       p2;
                Pattern       p3;
                Pattern       p4;
                PrintWriter   pw       = new PrintWriter(new OutputStreamWriter(System.Console.Out, cp.encoding), true);
                StringBuilder buff     = new StringBuilder();
                StringBuilder sgmlbuff = new StringBuilder();
                string        lastSgml = string.Empty;
                p1 = Pattern.Compile("<.*>");
                p2 = Pattern.Compile("\uFEFF?<[\\p{Alpha}]+");
                p3 = Pattern.Compile("[A-Za-z0-9=\"]+>");
                p4 = Pattern.Compile("<(?:" + parseInside + ")[ >]");
                bool inSGML     = false;
                int  splitItems = 0;
                int  numAdded   = 0;
                while (tok.MoveNext())
                {
                    string s = tok.Current.Word();
                    // pw.println("The token is |" + s + "|");
                    if (p2.Matcher(s).Matches())
                    {
                        inSGML = true;
                        sgmlbuff.Append(s).Append(" ");
                    }
                    else
                    {
                        if (p1.Matcher(s).Matches() || inSGML && p3.Matcher(s).Matches() || "\n".Equals(s))
                        {
                            inSGML = false;
                            if (buff.ToString().Trim().Length > 0)
                            {
                                // pw.println("Dumping sentences");
                                // pw.println("Buff is " + buff);
                                bool processIt = false;
                                if (parseInside.Equals(string.Empty))
                                {
                                    processIt = true;
                                }
                                else
                                {
                                    if (p4.Matcher(lastSgml).Find())
                                    {
                                        processIt = true;
                                    }
                                }
                                if (processIt)
                                {
                                    IList <string> sents = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromPlainText(buff.ToString(), true);
                                    // pw.println("Sents is " + sents);
                                    // pw.println();
                                    if (alwaysAddS || sents.Count > 1)
                                    {
                                        int i = 1;
                                        foreach (string str in sents)
                                        {
                                            pw.Print("<s id=\"" + i + "\">");
                                            pw.Print(str);
                                            pw.Println("</s>");
                                            i++;
                                        }
                                        if (sents.Count > 1)
                                        {
                                            splitItems++;
                                            numAdded += sents.Count - 1;
                                        }
                                    }
                                    else
                                    {
                                        if (sents.Count == 1)
                                        {
                                            pw.Print(sents[0]);
                                        }
                                    }
                                }
                                else
                                {
                                    pw.Print(buff);
                                }
                                buff = new StringBuilder();
                            }
                            sgmlbuff.Append(s);
                            // pw.println("sgmlbuff is " + sgmlbuff);
                            pw.Print(sgmlbuff);
                            lastSgml = sgmlbuff.ToString();
                            sgmlbuff = new StringBuilder();
                        }
                        else
                        {
                            if (inSGML)
                            {
                                sgmlbuff.Append(s).Append(" ");
                            }
                            else
                            {
                                buff.Append(s).Append(" ");
                            }
                        }
                    }
                }
                // pw.println("Buff is now |" + buff + "|");
                // end while (tok.hasNext()) {
                // empty remaining buffers
                pw.Flush();
                pw.Close();
                log.Info("Split " + splitItems + " segments, adding " + numAdded + " sentences.");
            }
            else
            {
                IList <string> sent = Edu.Stanford.Nlp.Process.ChineseDocumentToSentenceProcessor.FromHTML(input);
                PrintWriter    pw   = new PrintWriter(new OutputStreamWriter(System.Console.Error, cp.encoding), true);
                foreach (string a in sent)
                {
                    pw.Println(a);
                }
            }
        }
Example 14
        private YMapping GetMappingValueDependent(ITokenizer tokenizer)
        {
            switch (tokenizer.Current.Value.Kind)
            {
            case TokenKind.Indent when tokenizer.Current.Next?.Value.Kind == TokenKind.MappingKey:
            {
                var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel);
                var items       = new YKeyValueList();
                tokenizer.MoveNext();
                while (tokenizer.Current.Value.Kind == TokenKind.MappingKey)
                {
                    var keyValueNode = this.ParseMappingKey(tokenizer);
                    items.AddNode(keyValueNode);
                }

                while (tokenizer.Current.Value.Kind == TokenKind.Unindent)
                {
                    tokenizer.MoveNext();
                }

                mappingNode.Add(items.ToNodes());

                return(mappingNode);
            }

            case TokenKind.MappingValue:
            {
                var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel);
                tokenizer.MoveNext();
                var value = this.GetNodeValue(tokenizer);
                mappingNode.Add(value);

                return(mappingNode);
            }

            case TokenKind.Indent when tokenizer.Current.Next?.Next?.Value.Kind == TokenKind.MappingValue:
            {
                var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel);
                var items       = new YKeyValueList();
                tokenizer.MoveNext();

                // Add the parsed key-value items to the list
                do
                {
                    var keyValueNode = this.ParseMappingKey(tokenizer);
                    items.AddNode(keyValueNode);
                } while (tokenizer.Current.Value.Kind != TokenKind.Unindent &&
                         tokenizer.Current.Value.Kind != TokenKind.Eof &&
                         tokenizer.Current.Value.Kind != TokenKind.Indent &&
                         tokenizer.Current.Value.IndentLevel >= mappingNode.IndentLevel);

                // Skip over any remaining unindent tokens
                while (tokenizer.Current.Value.Kind == TokenKind.Unindent)
                {
                    tokenizer.MoveNext();
                }

                // Check the nesting level
                if (tokenizer.Current.Value.IndentLevel != 0 && tokenizer.Current.Value.Kind == TokenKind.Indent)
                {
                    while (tokenizer.Current.Value.IndentLevel == mappingNode.IndentLevel &&
                           tokenizer.Current.Value.Kind != TokenKind.Eof)
                    {
                        if (tokenizer.Current.Value.Kind == TokenKind.Indent)
                        {
                            tokenizer.MoveNext();
                        }

                        var keyValueNode = this.ParseMappingKey(tokenizer);
                        items.AddNode(keyValueNode);
                    }
                }

                mappingNode.Add(items.ToNodes());

                return(mappingNode);
            }

            case TokenKind.MappingBegin:
            {
                var mappingNode = new YMapping(tokenizer.Current.Value.IndentLevel);
                var items       = new YKeyValueList();
                tokenizer.MoveNext();
                do
                {
                    if (tokenizer.Current.Value.Kind == TokenKind.MappingEnd)
                    {
                        break;
                    }

                    var keyValueNode = this.ParseMappingKey(tokenizer);
                    items.AddNode(keyValueNode);
                } while (tokenizer.Current.Value.Kind == TokenKind.ItemDelimiter && tokenizer.MoveNext());

                if (tokenizer.Current.Value.Kind != TokenKind.MappingEnd)
                {
                    throw ParseException.UnexpectedToken(tokenizer, TokenKind.MappingEnd);
                }

                tokenizer.MoveNext();
                mappingNode.Add(items.ToNodes());

                return(mappingNode);
            }

            default:
                return(null);
            }
        }
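The Indent cases handle YAML block mappings and the MappingBegin case handles flow mappings. An illustration of inputs for the two main shapes (standard YAML; the exact token kinds emitted depend on this tokenizer):

        // Block style: Indent followed by MappingKey/MappingValue tokens.
        string blockMap =
            "server:\n" +
            "  host: example.org\n" +
            "  port: 8080\n";
        // Flow style: MappingBegin, ItemDelimiter-separated pairs, MappingEnd.
        string flowMap = "server: {host: example.org, port: 8080}\n";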
        // TODO: give options for document splitting: a single line, the whole file, or sentence splitting as now
        public virtual IEnumerator <IList <In> > GetIterator(Reader r)
        {
            ITokenizer <In> tokenizer = tokenizerFactory.GetTokenizer(r);
            // PTBTokenizer.newPTBTokenizer(r, false, true);
            IList <In>    words    = new List <In>();
            In            previous = null;
            StringBuilder prepend  = new StringBuilder();

            /*
             * This changes SGML tags into whitespace -- it should maybe be moved elsewhere
             */
            while (tokenizer.MoveNext())
            {
                In      w    = tokenizer.Current;
                string  word = w.Get(typeof(CoreAnnotations.TextAnnotation));
                Matcher m    = sgml.Matcher(word);
                if (m.Matches())
                {
                    string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                    string after  = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation)));
                    prepend.Append(before).Append(word);
                    if (previous != null)
                    {
                        string previousTokenAfter = StringUtils.GetNotNullString(previous.Get(typeof(CoreAnnotations.AfterAnnotation)));
                        previous.Set(typeof(CoreAnnotations.AfterAnnotation), previousTokenAfter + word + after);
                    }
                }
                else
                {
                    // previous.appendAfter(w.word() + w.after());
                    string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                    if (prepend.Length > 0)
                    {
                        prepend.Append(before);
                        w.Set(typeof(CoreAnnotations.BeforeAnnotation), prepend.ToString());
                        prepend = new StringBuilder();
                    }
                    words.Add(w);
                    previous = w;
                }
            }
            IList <IList <In> > sentences = wts.Process(words);
            string after_1 = string.Empty;
            In     last    = null;

            foreach (IList <In> sentence in sentences)
            {
                int pos = 0;
                foreach (In w in sentence)
                {
                    w.Set(typeof(CoreAnnotations.PositionAnnotation), pos.ToString());
                    after_1 = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation)));
                    w.Remove(typeof(CoreAnnotations.AfterAnnotation));
                    last = w;
                }
            }
            if (last != null)
            {
                last.Set(typeof(CoreAnnotations.AfterAnnotation), after_1);
            }
            return(sentences.GetEnumerator());
        }
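The net effect of the SGML branch is that markup tokens never enter the word list; they are folded into the Before/After annotations of the neighboring real tokens. A worked illustration of the intended transformation (hand-traced, not program output):

        // For input "<doc> Hello world </doc>" the loop above yields:
        //   words kept:               Hello, world
        //   Hello's BeforeAnnotation: "<doc> "   (tag carried in via the prepend buffer)
        //   world's AfterAnnotation:  " </doc>"  (tag appended to the previous real token)
        // wts.Process(words) then groups the words into sentences; each token gets a
        // PositionAnnotation, and only the last token keeps its AfterAnnotation.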
Example 16
        /// <summary>A fast, rule-based tokenizer for Spanish based on AnCora.</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Spanish based on AnCora.
        /// Performs punctuation splitting and light tokenization by default.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It assumes that the input
        /// file is delimited by the system line separator. The output will be equivalently
        /// delimited.
        /// </p>
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Lexer options
            ITokenizerFactory <CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
            string orthoOptions = options.Contains("ancora") ? AncoraOptions : string.Empty;

            if (options.Contains("options"))
            {
                orthoOptions = orthoOptions.IsEmpty() ? options.GetProperty("options") : orthoOptions + ',' + options.GetProperty("options");
            }
            bool tokens = PropertiesUtils.GetBool(options, "tokens", false);

            if (!tokens)
            {
                orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
            }
            tf.SetOptions(orthoOptions);
            // Other options
            string encoding   = options.GetProperty("encoding", "UTF-8");
            bool   toLower    = PropertiesUtils.GetBool(options, "lowerCase", false);
            Locale es         = new Locale("es");
            bool   onePerLine = PropertiesUtils.GetBool(options, "onePerLine", false);
            // Read the file from stdin
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

            try
            {
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new BufferedReader(new InputStreamReader(Runtime.@in, encoding)));
                BufferedWriter         writer    = new BufferedWriter(new OutputStreamWriter(System.Console.Out, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(SpanishLexer.NewlineToken))
                    {
                        ++nLines;
                        if (!onePerLine)
                        {
                            writer.NewLine();
                            printSpace = false;
                        }
                    }
                    else
                    {
                        string outputToken = toLower ? word.ToLower(es) : word;
                        if (onePerLine)
                        {
                            writer.Write(outputToken);
                            writer.NewLine();
                        }
                        else
                        {
                            if (printSpace)
                            {
                                writer.Write(" ");
                            }
                            writer.Write(outputToken);
                            printSpace = true;
                        }
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                throw new RuntimeIOException("Bad character encoding", e);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            long   elapsedTime = Runtime.NanoTime() - startTime;
            double linesPerSec = (double)nLines / (elapsedTime / 1e9);

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
        }