/// <summary>
        /// Builds a <c>CoreLabel</c> for a piece of XML token text, recording the
        /// untouched text as the original text and the (possibly rewritten) text
        /// as both word and value, together with the supplied character offsets.
        /// </summary>
        /// <param name="tokenText">Raw text of the token.</param>
        /// <param name="doNormalization">
        /// When true (and the <c>normalizeSpace</c> setting is on), spaces inside
        /// the token are replaced by non-breaking spaces.
        /// </param>
        /// <param name="charOffsetBegin">Value stored as the begin character offset.</param>
        /// <param name="charOffsetEnd">Value stored as the end character offset.</param>
        /// <returns>The newly created token.</returns>
        private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
        {
            CoreLabel xmlToken = new CoreLabel();

            // Keep the raw text before any rewriting happens below.
            xmlToken.SetOriginalText(tokenText);

            string surface;
            if (separatorPattern.Matcher(tokenText).Matches())
            {
                // Map to CoreNLP newline token
                surface = AbstractTokenizer.NewlineToken;
            }
            else if (doNormalization && normalizeSpace)
            {
                // change space to non-breaking space
                surface = tokenText.Replace(' ', '\u00A0');
            }
            else
            {
                surface = tokenText;
            }

            xmlToken.SetWord(surface);
            xmlToken.SetValue(surface);
            xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
            xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);

            if (Verbose)
            {
                log.Info("Adding token " + xmlToken.ToShorterString());
            }
            return xmlToken;
        }
        /// <summary>
        /// Creates a label whose word and original text are both
        /// <paramref name="word"/> and whose NER value is <paramref name="ner"/>.
        /// </summary>
        /// <param name="word">Text used for the word and the original text.</param>
        /// <param name="ner">Value passed to <c>SetNER</c>.</param>
        /// <returns>The newly created label.</returns>
        private CoreLabel MkLabel(string word, string ner)
        {
            CoreLabel result = new CoreLabel();
            result.SetWord(word);
            result.SetOriginalText(word);
            result.SetNER(ner);
            return result;
        }
Exemple #3
0
        /// <summary>
        /// Segments <paramref name="line"/> via <c>SegmentStringToIOB</c> and
        /// produces one token per span reported by <c>IOBUtils.TokenSpansForIOB</c>.
        /// Each token carries the segment text (as word, value and text
        /// annotation), the matching substring of <paramref name="line"/> as its
        /// original text, and begin/end character offsets.
        /// </summary>
        /// <param name="line">The string to segment.</param>
        /// <returns>The tokens, in the order the spans are enumerated.</returns>
        public virtual IList <CoreLabel> SegmentStringToTokenList(string line)
        {
            IList <CoreLabel> segments  = CollectionUtils.MakeList();
            IList <CoreLabel> iobLabels = SegmentStringToIOB(line);

            foreach (IntPair span in IOBUtils.TokenSpansForIOB(iobLabels))
            {
                // The span target is used as an exclusive bound: the last label of the span is at target - 1.
                int spanBegin = span.GetSource();
                int spanEnd   = span.GetTarget();

                string segmentText = IOBUtils.IOBToString(iobLabels, prefixMarker, suffixMarker, spanBegin, spanEnd);

                CoreLabel segment = new CoreLabel();
                segment.SetWord(segmentText);
                segment.SetValue(segmentText);
                segment.Set(typeof(CoreAnnotations.TextAnnotation), segmentText);
                segment.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");

                // Character offsets come from the first and last labels covered by the span.
                int charBegin = iobLabels[spanBegin].BeginPosition();
                int charEnd   = iobLabels[spanEnd - 1].EndPosition();
                segment.SetOriginalText(Sharpen.Runtime.Substring(line, charBegin, charEnd));
                segment.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charBegin);
                segment.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charEnd);

                segments.Add(segment);
            }
            return segments;
        }
Exemple #4
0
        /// <summary>
        /// Splits a verb carrying attached pronoun suffixes (as marked by the
        /// lexer) into a stem label and one label per pronoun:
        /// Escribamosela =&gt; Escribamo + se + la =&gt; escribamos + se + la
        /// Sentaos =&gt; senta + os =&gt; sentad + os
        /// Damelo =&gt; da + me + lo
        /// The pronoun labels are appended to <c>compoundBuffer</c>; the stem
        /// label is returned.
        /// </summary>
        /// <param name="cl">
        /// The verb label. Its <c>ParentAnnotation</c> is removed in every case.
        /// </param>
        /// <returns>
        /// <paramref name="cl"/> itself when the verb stripper returns null,
        /// otherwise a new label for the stem whose original text is the
        /// original (unnormalized) stem.
        /// </returns>
        private CoreLabel ProcessVerb(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

            SpanishVerbStripper.StrippedVerb parts = verbStripper.SeparatePronouns(cl.Word());
            if (parts == null)
            {
                return cl;
            }

            // The stem occupies [begin, stemEnd); each pronoun starts where the
            // previous piece ended, so a single running offset is enough.
            int stemEnd      = cl.BeginPosition() + parts.GetOriginalStem().Length;
            int pronounBegin = stemEnd;

            foreach (string pronoun in parts.GetPronouns())
            {
                compoundBuffer.Add(CopyCoreLabel(cl, pronoun, pronounBegin));
                pronounBegin += pronoun.Length;
            }

            CoreLabel stemLabel = CopyCoreLabel(cl, parts.GetStem(), cl.BeginPosition(), stemEnd);
            stemLabel.SetOriginalText(parts.GetOriginalStem());
            return stemLabel;
        }
 /// <summary>
 /// Normalizes a whole tree: prunes nodes matched by <c>emptyFilter</c>, splices
 /// out nodes matched by <c>aOverAFilter</c>, then visits every node to
 /// (a) move morphological analyses from leaf values into the leaf's original
 /// text, (b) copy pre-terminal values into the label's tag, and (c) insert a
 /// "DUMMYTAG" pre-terminal above any leaf that hangs directly off a phrasal
 /// node. Afterwards it applies the optional PRD-verb and NP-subject
 /// re-labelling, wraps or warns about bare tagged words, strips empty-valued
 /// unary wrappers and finally ensures the root carries <c>rootLabel</c>.
 /// </summary>
 /// <param name="tree">The tree to normalize.</param>
 /// <param name="tf">Factory used for pruning/splicing and for creating new nodes.</param>
 /// <returns>
 /// The normalized tree, or null when stripping empty-valued wrappers leaves
 /// nothing (e.g. the input consisted only of enclosing brackets).
 /// </returns>
 public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
 {
     tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);
     foreach (Tree t in tree)
     {
         if (t.IsLeaf())
         {
             //Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
             //specified by HasContext.
             if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
             {
                 string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                 if (toks.Length != 2)
                 {
                     // string.Format uses {n} placeholders; Java-style "%s" would be logged literally.
                     log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", this.GetType().FullName, t.Value()));
                 }
                 else
                 {
                     if (t.Label() is CoreLabel)
                     {
                         CoreLabel cl = (CoreLabel)t.Label();
                         // toks[0] is the surface word, toks[1] the morphological analysis.
                         string word = string.Intern(toks[0].Trim());
                         cl.SetValue(word);
                         cl.SetWord(word);
                         Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                         string lemma         = lemmaMorph.First();
                         string morphAnalysis = lemmaMorph.Second();
                         if (lemma.Equals(toks[0]))
                         {
                             // Lemma equals the word itself: store the raw analysis string.
                             cl.SetOriginalText(string.Intern(toks[1].Trim()));
                         }
                         else
                         {
                             // TODO(spenceg): Does this help?
                             string newLemma = lexMapper.Map(null, lemma);
                             if (newLemma == null || newLemma.Trim().IsEmpty())
                             {
                                 // Mapping produced nothing usable: fall back to the unmapped lemma.
                                 newLemma = lemma;
                             }
                             string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                             cl.SetOriginalText(string.Intern(newMorphAnalysis));
                         }
                     }
                     else
                     {
                         log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", this.GetType().FullName, t.Label().GetType().FullName));
                     }
                 }
             }
         }
         else
         {
             if (t.IsPreTerminal())
             {
                 if (t.Value() == null || t.Value().IsEmpty())
                 {
                     log.Warn(string.Format("{0}: missing tag for {1}", this.GetType().FullName, t.PennString()));
                 }
                 else
                 {
                     if (t.Label() is IHasTag)
                     {
                         ((IHasTag)t.Label()).SetTag(t.Value());
                     }
                 }
             }
             else
             {
                 //Phrasal nodes
                 // there are some nodes "/" missing preterminals.  We'll splice in a tag for these.
                 int          nk      = t.NumChildren();
                 IList <Tree> newKids = new List <Tree>(nk);
                 for (int j = 0; j < nk; j++)
                 {
                     Tree child = t.GetChild(j);
                     if (child.IsLeaf())
                     {
                         log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", this.GetType().FullName, t.ToString()));
                         newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                     }
                     else
                     {
                         newKids.Add(child);
                     }
                 }
                 t.SetChildren(newKids);
             }
         }
     }
     //Every node in the tree has now been processed
     //
     // Additional processing for specific phrasal annotations
     //
     // special global coding for moving PRD annotation from constituent to verb tag.
     if (markPRDverb)
     {
         TregexMatcher m     = prdVerbPattern.Matcher(tree);
         Tree          match = null;
         while (m.Find())
         {
             // Skip repeated matches on the same node so the suffix is appended only once.
             if (m.GetMatch() != match)
             {
                 match = m.GetMatch();
                 match.Label().SetValue(match.Label().Value() + "-PRDverb");
                 Tree prd = m.GetNode("prd");
                 prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
             }
         }
     }
     //Mark *only* subjects in verb-initial clauses
     if (retainNPSbj)
     {
         TregexMatcher m = npSbjPattern.Matcher(tree);
         while (m.Find())
         {
             Tree match = m.GetMatch();
             match.Label().SetValue("NP");
         }
     }
     if (tree.IsPreTerminal())
     {
         // The whole tree is a bare tag: bad!
         string val = tree.Label().Value();
         if (val.Equals("CC") || val.StartsWith("PUNC") || val.Equals("CONJ"))
         {
             log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", this.GetType().FullName, tree.PennString()));
             tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
         }
         else
         {
             log.Warn(string.Format("{0}: Bare tagged word {1}", this.GetType().FullName, tree.PennString()));
         }
     }
     //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
     //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
     //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
     while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
     {
         tree = tree.FirstChild();
     }
     if (tree != null && !tree.Value().Equals(rootLabel))
     {
         tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
     }
     return(tree);
 }