Example #1
        /// <summary>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.  Output is handled with a
        /// TreePrint object.  Note that the options used when creating the
        /// TreePrint can determine what results to print out.  Once again,
        /// one can capture the output by passing a PrintWriter to
        /// TreePrint.printTree. This code is for English.
        /// </remarks>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // This option shows parsing a list of correctly tokenized words
            string[]          sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> rawWords = SentenceUtils.ToCoreLabelList(sent);
            Tree parse = lp.Apply(rawWords);

            parse.PennPrint();
            System.Console.Out.WriteLine();
            // This option shows loading and using an explicit tokenizer
            string sent2 = "This is another sentence.";
            ITokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tok       = tokenizerFactory.GetTokenizer(new StringReader(sent2));
            IList <CoreLabel>             rawWords2 = tok.Tokenize();

            parse = lp.Apply(rawWords2);
            ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
            // PennTreebankLanguagePack for English
            IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
            GrammaticalStructure         gs  = gsf.NewGrammaticalStructure(parse);
            IList <TypedDependency>      tdl = gs.TypedDependenciesCCprocessed();

            System.Console.Out.WriteLine(tdl);
            System.Console.Out.WriteLine();
            // You can also use a TreePrint object to print trees and dependencies
            TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.PrintTree(parse);
        }
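A minimal sketch of a caller for DemoAPI, assuming the standard englishPCFG model shipped with the Stanford parser distribution; the LoadModel entry point and the model path are assumptions, not taken from the example above.

        public static void Main(string[] args)
        {
            // Hypothetical driver: load the stock English model and run the demo.
            // Adjust the path to wherever englishPCFG.ser.gz lives in your setup.
            LexicalizedParser lp = LexicalizedParser.LoadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
            DemoAPI(lp);
        }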
Example #2
        /// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Modern Standard French.
        /// Performs punctuation splitting and light tokenization by default.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It assumes that the input
        /// file is delimited by the system line separator. The output will be equivalently
        /// delimited.
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Lexer options
            ITokenizerFactory <CoreLabel> tf = options.Contains("ftb") ? FrenchTokenizer.FtbFactory() : FrenchTokenizer.Factory();
            string orthoOptions = options.GetProperty("options", string.Empty);

            // When called from this main method, split on newline. No options for
            // more granular sentence splitting.
            orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
            tf.SetOptions(orthoOptions);
            // Other options
            string encoding = options.GetProperty("encoding", "UTF-8");
            bool   toLower  = PropertiesUtils.GetBool(options, "lowerCase", false);
            // Read the file from stdin
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

            try
            {
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(FrenchLexer.NewlineToken))
                    {
                        ++nLines;
                        printSpace = false;
                        System.Console.Out.WriteLine();
                    }
                    else
                    {
                        if (printSpace)
                        {
                            System.Console.Out.Write(" ");
                        }
                        string outputToken = toLower ? word.ToLower(Locale.French) : word;
                        System.Console.Out.Write(outputToken);
                        printSpace = true;
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                log.Error(e);
            }
            long   elapsedTime = Runtime.NanoTime() - startTime;
            double linesPerSec = (double)nLines / (elapsedTime / 1e9);

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
        }
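The same French tokenizer can also be driven in memory rather than from stdin. A short sketch using only the APIs shown above; the sample sentence is illustrative.

            ITokenizerFactory <CoreLabel> tf = FrenchTokenizer.Factory();
            tf.SetOptions(string.Empty);
            // Tokenize one sentence from a string instead of streaming stdin.
            IList <CoreLabel> tokens = tf.GetTokenizer(new StringReader("Voici une autre phrase.")).Tokenize();
            System.Console.Out.WriteLine(SentenceUtils.ListToString(tokens));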
Example #3
        /// <summary>Tokenize the text using the parser's tokenizer</summary>
        public virtual IList <IHasWord> Tokenize(string sentence)
        {
            ITokenizerFactory <IHasWord> tf        = TreebankLanguagePack().GetTokenizerFactory();
            ITokenizer <IHasWord>        tokenizer = tf.GetTokenizer(new StringReader(sentence));
            IList <IHasWord>             tokens    = tokenizer.Tokenize();

            return(tokens);
        }
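Assuming the method above is reachable from a loaded LexicalizedParser lp (a hypothetical variable), tokenization reduces to a single call:

            IList <IHasWord> tokens = lp.Tokenize("This is an easy sentence.");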
        /// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).
        /// Performs punctuation splitting and light tokenization by default.
        /// Orthographic normalization options are available, and can be enabled with
        /// command line options.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It normalizes non-printing
        /// line separators across platforms and writes the system default line separator
        /// to the output.
        /// <p>
        /// The following normalization options are provided:
        /// <ul>
        /// <li>
        /// <c>useUTF8Ellipsis</c>
        /// : Replaces sequences of three or more full stops with \u2026</li>
        /// <li>
        /// <c>normArDigits</c>
        /// : Convert Arabic digits to ASCII equivalents</li>
        /// <li>
        /// <c>normArPunc</c>
        /// : Convert Arabic punctuation to ASCII equivalents</li>
        /// <li>
        /// <c>normAlif</c>
        /// : Change all alif forms to bare alif</li>
        /// <li>
        /// <c>normYa</c>
        /// : Map ya to alif maqsura</li>
        /// <li>
        /// <c>removeDiacritics</c>
        /// : Strip all diacritics</li>
        /// <li>
        /// <c>removeTatweel</c>
        /// : Strip tatweel elongation character</li>
        /// <li>
        /// <c>removeQuranChars</c>
        /// : Remove diacritics that appear in the Quran</li>
        /// <li>
        /// <c>removeProMarker</c>
        /// : Remove the ATB null pronoun marker</li>
        /// <li>
        /// <c>removeSegMarker</c>
        /// : Remove the ATB clitic segmentation marker</li>
        /// <li>
        /// <c>removeMorphMarker</c>
        /// : Remove the ATB morpheme boundary markers</li>
        /// <li>
        /// <c>removeLengthening</c>
        /// : Replace all sequences of three or more identical (non-period) characters with one copy</li>
        /// <li>
        /// <c>atbEscaping</c>
        /// : Replace left/right parentheses with ATB escape characters</li>
        /// </ul>
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length > 0 && args[0].Contains("help"))
            {
                System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
                System.Console.Error.Printf("%nOptions:%n");
                log.Info("   -help : Print this message. See javadocs for all normalization options.");
                log.Info("   -atb  : Tokenization for the parsing experiments in Green and Manning (2010)");
                System.Environment.Exit(-1);
            }
            // Process normalization options
            Properties tokenizerOptions      = StringUtils.ArgsToProperties(args);
            ITokenizerFactory <CoreLabel> tf = tokenizerOptions.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory();

            foreach (string option in tokenizerOptions.StringPropertyNames())
            {
                tf.SetOptions(option);
            }
            // Replace line separators with a token so that we can
            // count lines
            tf.SetOptions("tokenizeNLs");
            // Read the file
            int nLines  = 0;
            int nTokens = 0;

            try
            {
                string encoding = "UTF-8";
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(ArabicLexer.NewlineToken))
                    {
                        ++nLines;
                        printSpace = false;
                        System.Console.Out.WriteLine();
                    }
                    else
                    {
                        if (printSpace)
                        {
                            System.Console.Out.Write(" ");
                        }
                        System.Console.Out.Write(word);
                        printSpace = true;
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
        }
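A hedged sketch combining a few of the normalization options listed above; options layer onto the factory one SetOptions call at a time, as the loop in Main demonstrates. The Arabic input string is illustrative only.

            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.Factory();
            tf.SetOptions("normArDigits");
            tf.SetOptions("removeDiacritics");
            tf.SetOptions("removeTatweel");
            // Digits become ASCII and diacritics/tatweel are stripped from the output tokens.
            IList <CoreLabel> toks = tf.GetTokenizer(new StringReader("مَرْحَبًا بِالْعَالَمِ")).Tokenize();
            System.Console.Out.WriteLine(SentenceUtils.ListToString(toks));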
 public virtual void RunTest <_T0>(ITokenizerFactory <_T0> factory, string[] testStrings, string[][] resultsStrings)
     where _T0 : IHasWord
 {
     for (int i = 0; i < testStrings.Length; ++i)
     {
         ITokenizer <IHasWord> tokenizer = factory.GetTokenizer(new StringReader(testStrings[i]));
         IList <IHasWord>      tokens    = tokenizer.Tokenize();
         NUnit.Framework.Assert.AreEqual(resultsStrings[i].Length, tokens.Count);
         for (int j = 0; j < resultsStrings[i].Length; ++j)
         {
             NUnit.Framework.Assert.AreEqual(resultsStrings[i][j], tokens[j].Word());
         }
     }
 }
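 A hypothetical invocation of RunTest with a one-line fixture; the expected split follows standard PTB conventions but should be verified against the factory options in use.

     string[]   tests = new string[] { "It's here." };
     string[][] gold  = new string[][] { new string[] { "It", "'s", "here", "." } };
     // PTBTokenizer.Factory mirrors the usage in Example #1.
     RunTest(PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty), tests, gold);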
Example #6
        private IList <CoreLabel> SegmentStringToIOB(string line)
        {
            IList <CoreLabel> tokenList;

            if (tf == null)
            {
                // Whitespace tokenization.
                tokenList = IOBUtils.StringToIOB(line);
            }
            else
            {
                IList <CoreLabel> tokens = tf.GetTokenizer(new StringReader(line)).Tokenize();
                tokenList = IOBUtils.StringToIOB(tokens, null, false, tf, line);
            }
            IOBUtils.LabelDomain(tokenList, domain);
            tokenList = classifier.Classify(tokenList);
            return(tokenList);
        }
        public virtual void TestArabicTokenizer()
        {
            System.Diagnostics.Debug.Assert((untokInputs.Length == tokReferences.Length));
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            for (int i = 0; i < untokInputs.Length; ++i)
            {
                string line = untokInputs[i];
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(line));
                IList <CoreLabel>      tokens    = tokenizer.Tokenize();
                string tokenizedLine             = SentenceUtils.ListToString(tokens);
                string reference = tokReferences[i];
                NUnit.Framework.Assert.AreEqual("Tokenization deviates from reference", reference, tokenizedLine);
            }
        }
        public virtual void TestCharOffsets()
        {
            string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";

            int[] beginOffsets = new int[] { 0, 7, 11, 22 };
            int[] endOffsets   = new int[] { 6, 10, 21, 23 };
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
            IList <CoreLabel>      tokens    = tokenizer.Tokenize();

            NUnit.Framework.Assert.AreEqual("Number of tokens doesn't match reference", tokens.Count, beginOffsets.Length);
            for (int i = 0; i < beginOffsets.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual("Char begin offset deviates from reference", beginOffsets[i], tokens[i].BeginPosition());
                NUnit.Framework.Assert.AreEqual("Char end offset deviates from reference", endOffsets[i], tokens[i].EndPosition());
            }
        }
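BeginPosition and EndPosition are character offsets into the untokenized input, so a quick sanity check (a sketch, assuming a tokens list like the one above) is to echo each token with its span:

            foreach (CoreLabel token in tokens)
            {
                // Prints e.g. "word [0,6)" -- begin offset inclusive, end offset exclusive.
                System.Console.Out.Printf("%s [%d,%d)%n", token.Word(), token.BeginPosition(), token.EndPosition());
            }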
Example #9
        /// <summary>For debugging.</summary>
        /// <param name="args"/>
        /// <exception cref="System.IO.IOException"></exception>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file > output%n", typeof(ArabicDocumentReaderAndWriter).FullName);
                System.Environment.Exit(-1);
            }
            string fileName = args[0];
            ITokenizerFactory <CoreLabel> tokFactory = ArabicTokenizer.AtbFactory();
            string atbVocOptions = "removeProMarker,removeMorphMarker";

            tokFactory.SetOptions(atbVocOptions);
            BufferedReader reader = IOUtils.ReaderFromString(fileName);

            for (string line; (line = reader.ReadLine()) != null;)
            {
                string[] toks    = line.Split("\\s+");
                string   delim   = Pattern.Quote(tagDelimiter);
                bool     isStart = true;
                foreach (string wordTag in toks)
                {
                    string[] wordTagPair = wordTag.Split(delim);
                    System.Diagnostics.Debug.Assert(wordTagPair.Length == 2);
                    string word = wordTagPair[0];
                    if (tokFactory != null)
                    {
                        IList <CoreLabel> lexList = tokFactory.GetTokenizer(new StringReader(word)).Tokenize();
                        if (lexList.Count == 0)
                        {
                            continue;
                        }
                        else
                        {
                            if (lexList.Count == 1)
                            {
                                word = lexList[0].Value();
                            }
                            else
                            {
                                if (lexList.Count > 1)
                                {
                                    string secondWord = lexList[1].Value();
                                    if (secondWord.Equals(DefaultSegMarker.ToString()))
                                    {
                                        // Special case for the null marker in the vocalized section
                                        word = lexList[0].Value() + DefaultSegMarker.ToString();
                                    }
                                    else
                                    {
                                        System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", typeof(ArabicDocumentReaderAndWriter).FullName, word);
                                        word = lexList[0].Value();
                                    }
                                }
                            }
                        }
                    }
                    if (!isStart)
                    {
                        System.Console.Out.Write(" ");
                    }
                    System.Console.Out.Write(word);
                    isStart = false;
                }
                System.Console.Out.WriteLine();
            }
        }
        /// <summary>
        /// Tokenizes the given text to populate the list of words this Document
        /// represents.
        /// </summary>
        /// <remarks>
        /// Tokenizes the given text to populate the list of words this Document
        /// represents. The default implementation uses the current tokenizer and tokenizes
        /// the entirety of the text into words. Subclasses should override this method
        /// to parse documents in non-standard formats, and/or to pull the title of the
        /// document from the text. The given text may be empty ("") but will never
        /// be null. Subclasses may want to do additional processing and then just
        /// call super.parse.
        /// </remarks>
        /// <seealso cref="BasicDocument{L}.SetTokenizerFactory(Edu.Stanford.Nlp.Process.ITokenizerFactory{T})"/>
        protected internal virtual void Parse(string text)
        {
            ITokenizer <Word> toke = tokenizerFactory.GetTokenizer(new StringReader(text));

            Sharpen.Collections.AddAll(this, toke.Tokenize());
        }
Example #11
 /// <summary>Returns a thread-safe tokenizer</summary>
 public virtual ITokenizer <CoreLabel> GetTokenizer(Reader r)
 {
     return(factory.GetTokenizer(r));
 }
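 Because each call constructs a fresh ITokenizer over its own Reader, concurrent callers never share lexer state. A sketch of parallel use, assuming a wrapper instance exposing the method above and .NET's System.Threading.Tasks (both assumptions):

     Parallel.ForEach(documents, doc =>
     {
         // documents and Consume are hypothetical; each thread gets its own tokenizer.
         ITokenizer <CoreLabel> t = wrapper.GetTokenizer(new StringReader(doc));
         Consume(t.Tokenize());
     });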
Example #12
 public EnglishTextProcessor(ISentenceDetectorFactory sdFact, ITokenizerFactory tokenizerFact, IStemmerFactory stemmerFact) : base(Language.English)
 {
     sd        = sdFact.GetSentenceDetector(Language);
     tokenizer = tokenizerFact.GetTokenizer(Language);
     stemmer   = stemmerFact.GetStemmer(Language);
 }
        /// <exception cref="System.Exception"/>
        public override Document NextDoc()
        {
            IList <IList <CoreLabel> > allWords      = new List <IList <CoreLabel> >();
            IList <Tree>             allTrees        = new List <Tree>();
            IList <IList <Mention> > allGoldMentions = new List <IList <Mention> >();
            IList <IList <Mention> > allPredictedMentions;
            IList <ICoreMap>         allSentences = new List <ICoreMap>();
            Annotation docAnno         = new Annotation(string.Empty);
            Pattern    docPattern      = Pattern.Compile("<DOC>(.*?)</DOC>", Pattern.Dotall + Pattern.CaseInsensitive);
            Pattern    sentencePattern = Pattern.Compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.Dotall + Pattern.CaseInsensitive);
            Matcher    docMatcher      = docPattern.Matcher(fileContents);

            if (!docMatcher.Find(currentOffset))
            {
                return(null);
            }
            currentOffset = docMatcher.End();
            string  doc             = docMatcher.Group(1);
            Matcher sentenceMatcher = sentencePattern.Matcher(doc);
            string  ner             = null;
            //Maintain current document ID.
            Pattern docIDPattern = Pattern.Compile("<DOCNO>(.*?)</DOCNO>", Pattern.Dotall + Pattern.CaseInsensitive);
            Matcher docIDMatcher = docIDPattern.Matcher(doc);

            if (docIDMatcher.Find())
            {
                currentDocumentID = docIDMatcher.Group(1);
            }
            else
            {
                currentDocumentID = "documentAfter " + currentDocumentID;
            }
            while (sentenceMatcher.Find())
            {
                string            sentenceString = sentenceMatcher.Group(2);
                IList <CoreLabel> words          = tokenizerFactory.GetTokenizer(new StringReader(sentenceString)).Tokenize();
                // FIXING TOKENIZATION PROBLEMS
                for (int i = 0; i < words.Count; i++)
                {
                    CoreLabel w = words[i];
                    if (i > 0 && w.Word().Equals("$"))
                    {
                        if (!words[i - 1].Word().EndsWith("PRP") && !words[i - 1].Word().EndsWith("WP"))
                        {
                            continue;
                        }
                        words[i - 1].Set(typeof(CoreAnnotations.TextAnnotation), words[i - 1].Word() + "$");
                        words.RemoveAt(i);
                        i--;
                    }
                    else
                    {
                        if (w.Word().Equals("\\/"))
                        {
                            if (words[i - 1].Word().Equals("</COREF>"))
                            {
                                continue;
                            }
                            w.Set(typeof(CoreAnnotations.TextAnnotation), words[i - 1].Word() + "\\/" + words[i + 1].Word());
                            words.RemoveAt(i + 1);
                            words.RemoveAt(i - 1);
                        }
                    }
                }
                // END FIXING TOKENIZATION PROBLEMS
                IList <CoreLabel> sentence = new List <CoreLabel>();
                // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
                Stack <Mention> stack    = new Stack <Mention>();
                IList <Mention> mentions = new List <Mention>();
                allWords.Add(sentence);
                allGoldMentions.Add(mentions);
                foreach (CoreLabel word in words)
                {
                    string w = word.Get(typeof(CoreAnnotations.TextAnnotation));
                    // found regular token: WORD/POS
                    if (!w.StartsWith("<") && w.Contains("\\/") && w.LastIndexOf("\\/") != w.Length - 2)
                    {
                        int    i_1 = w.LastIndexOf("\\/");
                        string w1  = Sharpen.Runtime.Substring(w, 0, i_1);
                        // we do NOT set POS info here. We take the POS tags from the parser!
                        word.Set(typeof(CoreAnnotations.TextAnnotation), w1);
                        word.Remove(typeof(CoreAnnotations.OriginalTextAnnotation));
                        sentence.Add(word);
                    }
                    else
                    {
                        // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
                        if (w.StartsWith("<") && !w.StartsWith("<COREF") && !w.StartsWith("</"))
                        {
                            Pattern nerPattern = Pattern.Compile("<(.*?)>");
                            Matcher m          = nerPattern.Matcher(w);
                            m.Find();
                            ner = m.Group(1);
                        }
                        else
                        {
                            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
                            if (w.StartsWith("</") && !w.StartsWith("</COREF"))
                            {
                                Pattern nerPattern = Pattern.Compile("</(.*?)>");
                                Matcher m          = nerPattern.Matcher(w);
                                m.Find();
                                string ner1 = m.Group(1);
                                if (ner != null && !ner.Equals(ner1))
                                {
                                    throw new Exception("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                                }
                                ner = null;
                            }
                            else
                            {
                                // found the start SGML tag for a coref mention
                                if (w.StartsWith("<COREF"))
                                {
                                    Mention mention = new Mention();
                                    // position of this mention in the sentence
                                    mention.startIndex = sentence.Count;
                                    // extract GOLD info about this coref chain. needed for eval
                                    Pattern idPattern  = Pattern.Compile("ID=\"(.*?)\"");
                                    Pattern refPattern = Pattern.Compile("REF=\"(.*?)\"");
                                    Matcher m          = idPattern.Matcher(w);
                                    m.Find();
                                    mention.mentionID = System.Convert.ToInt32(m.Group(1));
                                    m = refPattern.Matcher(w);
                                    if (m.Find())
                                    {
                                        mention.originalRef = System.Convert.ToInt32(m.Group(1));
                                    }
                                    // open mention. keep track of all open mentions using the stack
                                    stack.Push(mention);
                                }
                                else
                                {
                                    // found the end SGML tag for a coref mention
                                    if (w.Equals("</COREF>"))
                                    {
                                        Mention mention = stack.Pop();
                                        mention.endIndex = sentence.Count;
                                        // this is a closed mention. add it to the final list of mentions
                                        // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                                        mentions.Add(mention);
                                    }
                                    else
                                    {
                                        word.Remove(typeof(CoreAnnotations.OriginalTextAnnotation));
                                        sentence.Add(word);
                                    }
                                }
                            }
                        }
                    }
                }
                StringBuilder textContent = new StringBuilder();
                for (int i_2 = 0; i_2 < sentence.Count; i_2++)
                {
                    CoreLabel w = sentence[i_2];
                    w.Set(typeof(CoreAnnotations.IndexAnnotation), i_2 + 1);
                    w.Set(typeof(CoreAnnotations.UtteranceAnnotation), 0);
                    if (i_2 > 0)
                    {
                        textContent.Append(" ");
                    }
                    textContent.Append(w.GetString <CoreAnnotations.TextAnnotation>());
                }
                ICoreMap sentCoreMap = new Annotation(textContent.ToString());
                allSentences.Add(sentCoreMap);
                sentCoreMap.Set(typeof(CoreAnnotations.TokensAnnotation), sentence);
            }
            // assign goldCorefClusterID
            IDictionary <int, Mention> idMention = Generics.NewHashMap();

            // temporary use
            foreach (IList <Mention> goldMentions in allGoldMentions)
            {
                foreach (Mention m in goldMentions)
                {
                    idMention[m.mentionID] = m;
                }
            }
            foreach (IList <Mention> goldMentions_1 in allGoldMentions)
            {
                foreach (Mention m in goldMentions_1)
                {
                    if (m.goldCorefClusterID == -1)
                    {
                        if (m.originalRef == -1)
                        {
                            m.goldCorefClusterID = m.mentionID;
                        }
                        else
                        {
                            int @ref = m.originalRef;
                            while (true)
                            {
                                Mention m2 = idMention[@ref];
                                if (m2.goldCorefClusterID != -1)
                                {
                                    m.goldCorefClusterID = m2.goldCorefClusterID;
                                    break;
                                }
                                else
                                {
                                    if (m2.originalRef == -1)
                                    {
                                        m2.goldCorefClusterID = m2.mentionID;
                                        m.goldCorefClusterID  = m2.goldCorefClusterID;
                                        break;
                                    }
                                    else
                                    {
                                        @ref = m2.originalRef;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            docAnno.Set(typeof(CoreAnnotations.SentencesAnnotation), allSentences);
            stanfordProcessor.Annotate(docAnno);
            if (allSentences.Count != allWords.Count)
            {
                throw new InvalidOperationException("allSentences != allWords");
            }
            for (int i_3 = 0; i_3 < allSentences.Count; i_3++)
            {
                IList <CoreLabel> annotatedSent   = allSentences[i_3].Get(typeof(CoreAnnotations.TokensAnnotation));
                IList <CoreLabel> unannotatedSent = allWords[i_3];
                IList <Mention>   mentionInSent   = allGoldMentions[i_3];
                foreach (Mention m in mentionInSent)
                {
                    m.dependency = allSentences[i_3].Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
                }
                if (annotatedSent.Count != unannotatedSent.Count)
                {
                    throw new InvalidOperationException("annotatedSent != unannotatedSent");
                }
                for (int j = 0; j < annotatedSent.Count; j++)
                {
                    CoreLabel annotatedWord   = annotatedSent[j];
                    CoreLabel unannotatedWord = unannotatedSent[j];
                    if (!annotatedWord.Get(typeof(CoreAnnotations.TextAnnotation)).Equals(unannotatedWord.Get(typeof(CoreAnnotations.TextAnnotation))))
                    {
                        throw new InvalidOperationException("annotatedWord != unannotatedWord");
                    }
                }
                allWords[i_3] = annotatedSent;
                allTrees.Add(allSentences[i_3].Get(typeof(TreeCoreAnnotations.TreeAnnotation)));
            }
            // extract predicted mentions
            allPredictedMentions = mentionFinder.ExtractPredictedMentions(docAnno, maxID, dictionaries);
            // add the relevant fields to mentions and order them for coref
            return(Arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true));
        }
        // todo: give options for document splitting. A line or the whole file or sentence splitting as now
        public virtual IEnumerator <IList <In> > GetIterator(Reader r)
        {
            ITokenizer <In> tokenizer = tokenizerFactory.GetTokenizer(r);
            // PTBTokenizer.newPTBTokenizer(r, false, true);
            IList <In>    words    = new List <In>();
            In            previous = null;
            StringBuilder prepend  = new StringBuilder();

            /*
             * This changes SGML tags into whitespace -- it should maybe be moved elsewhere
             */
            while (tokenizer.MoveNext())
            {
                In      w    = tokenizer.Current;
                string  word = w.Get(typeof(CoreAnnotations.TextAnnotation));
                Matcher m    = sgml.Matcher(word);
                if (m.Matches())
                {
                    string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                    string after  = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation)));
                    prepend.Append(before).Append(word);
                    if (previous != null)
                    {
                        string previousTokenAfter = StringUtils.GetNotNullString(previous.Get(typeof(CoreAnnotations.AfterAnnotation)));
                        previous.Set(typeof(CoreAnnotations.AfterAnnotation), previousTokenAfter + word + after);
                    }
                }
                else
                {
                    // previous.appendAfter(w.word() + w.after());
                    string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                    if (prepend.Length > 0)
                    {
                        prepend.Append(before);
                        w.Set(typeof(CoreAnnotations.BeforeAnnotation), prepend.ToString());
                        prepend = new StringBuilder();
                    }
                    words.Add(w);
                    previous = w;
                }
            }
            IList <IList <In> > sentences = wts.Process(words);
            string after_1 = string.Empty;
            In     last    = null;

            foreach (IList <In> sentence in sentences)
            {
                int pos = 0;
                foreach (In w in sentence)
                {
                    w.Set(typeof(CoreAnnotations.PositionAnnotation), pos.ToString());
                    after_1 = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation)));
                    w.Remove(typeof(CoreAnnotations.AfterAnnotation));
                    last = w;
                }
            }
            if (last != null)
            {
                last.Set(typeof(CoreAnnotations.AfterAnnotation), after_1);
            }
            return(sentences.GetEnumerator());
        }
Example #15
        /// <summary>A fast, rule-based tokenizer for Spanish based on AnCora.</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Spanish based on AnCora.
        /// Performs punctuation splitting and light tokenization by default.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It assumes that the input
        /// file is delimited by the system line separator. The output will be equivalently
        /// delimited.
        /// </p>
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Lexer options
            ITokenizerFactory <CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
            string orthoOptions = options.Contains("ancora") ? AncoraOptions : string.Empty;

            if (options.Contains("options"))
            {
                orthoOptions = orthoOptions.IsEmpty() ? options.GetProperty("options") : orthoOptions + ',' + options;
            }
            bool tokens = PropertiesUtils.GetBool(options, "tokens", false);

            if (!tokens)
            {
                orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
            }
            tf.SetOptions(orthoOptions);
            // Other options
            string encoding   = options.GetProperty("encoding", "UTF-8");
            bool   toLower    = PropertiesUtils.GetBool(options, "lowerCase", false);
            Locale es         = new Locale("es");
            bool   onePerLine = PropertiesUtils.GetBool(options, "onePerLine", false);
            // Read the file from stdin
            int  nLines    = 0;
            int  nTokens   = 0;
            long startTime = Runtime.NanoTime();

            try
            {
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new BufferedReader(new InputStreamReader(Runtime.@in, encoding)));
                BufferedWriter         writer    = new BufferedWriter(new OutputStreamWriter(System.Console.Out, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(SpanishLexer.NewlineToken))
                    {
                        ++nLines;
                        if (!onePerLine)
                        {
                            writer.NewLine();
                            printSpace = false;
                        }
                    }
                    else
                    {
                        string outputToken = toLower ? word.ToLower(es) : word;
                        if (onePerLine)
                        {
                            writer.Write(outputToken);
                            writer.NewLine();
                        }
                        else
                        {
                            if (printSpace)
                            {
                                writer.Write(" ");
                            }
                            writer.Write(outputToken);
                            printSpace = true;
                        }
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                throw new RuntimeIOException("Bad character encoding", e);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
            long   elapsedTime = Runtime.NanoTime() - startTime;
            double linesPerSec = (double)nLines / (elapsedTime / 1e9);

            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
        }
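The Spanish factory can also be used programmatically. A minimal sketch, assuming the AncoraOptions constant referenced above and an illustrative sentence:

            ITokenizerFactory <CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
            tf.SetOptions(AncoraOptions);
            IList <CoreLabel> toks = tf.GetTokenizer(new StringReader("Esta es otra oración.")).Tokenize();
            System.Console.Out.WriteLine(SentenceUtils.ListToString(toks));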
Example #16
        /// <summary>
        /// arg[0] := tokenizer options
        /// args[1] := file to tokenize
        /// </summary>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
                System.Environment.Exit(-1);
            }
            string tokOptions = args[0];
            File   path       = new File(args[1]);

            log.Info("Reading from: " + path.GetPath());
            try
            {
                BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
                ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.Factory();
                tf.SetOptions(tokOptions);
                IMapper lexMapper = new DefaultLexicalMapper();
                lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
                int lineId = 0;
                for (string line; (line = br.ReadLine()) != null; lineId++)
                {
                    line = line.Trim();
                    // Tokenize with the tokenizer
                    IList <CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
                    System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
                    // Tokenize with the mapper
                    StringBuilder sb   = new StringBuilder();
                    string[]      toks = line.Split("\\s+");
                    foreach (string tok in toks)
                    {
                        string mappedTok = lexMapper.Map(null, tok);
                        sb.Append(mappedTok).Append(" ");
                    }
                    IList <string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
                    // Evaluate the output
                    if (mappedToks.Count != tokenizedLine.Count)
                    {
                        System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                    }
                    else
                    {
                        bool printLines = false;
                        for (int i = 0; i < mappedToks.Count; ++i)
                        {
                            string mappedTok    = mappedToks[i];
                            string tokenizedTok = tokenizedLine[i].Word();
                            if (!mappedTok.Equals(tokenizedTok))
                            {
                                System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                                printLines = true;
                            }
                        }
                        if (printLines)
                        {
                            System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                        }
                    }
                }
                System.Console.Error.Printf("Read %d lines.%n", lineId);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Example #17
        private static bool IsDeletedCharacter(char ch, ITokenizerFactory <CoreLabel> tf)
        {
            IList <CoreLabel> tokens = tf.GetTokenizer(new StringReader(char.ToString(ch))).Tokenize();

            return(tokens.IsEmpty());
        }
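A hypothetical use of IsDeletedCharacter: checking whether a factory configured with removeTatweel deletes the tatweel character.

            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.Factory();
            tf.SetOptions("removeTatweel");
            bool deleted = IsDeletedCharacter('\u0640', tf);  // U+0640 ARABIC TATWEEL; presumably true under this option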