/// <summary>Extracts the text of a token-like object.</summary>
/// <param name="o">The token: an <c>IHasWord</c>, a <c>string</c>, or an <c>ICoreMap</c>.</param>
/// <returns>
/// <c>Word()</c> for an <c>IHasWord</c>, the value itself for a <c>string</c>, or the
/// value stored under <c>CoreAnnotations.TextAnnotation</c> for an <c>ICoreMap</c>.
/// </returns>
/// <exception cref="System.Exception">
/// Thrown when <paramref name="o"/> is none of the supported kinds (including <c>null</c>).
/// </exception>
private static string GetString(object o)
 {
     // The checks run in priority order: IHasWord first, then string, then ICoreMap.
     IHasWord wordHolder = o as IHasWord;
     if (wordHolder != null)
     {
         return wordHolder.Word();
     }

     string plainText = o as string;
     if (plainText != null)
     {
         return plainText;
     }

     ICoreMap annotated = o as ICoreMap;
     if (annotated != null)
     {
         return annotated.Get(typeof(CoreAnnotations.TextAnnotation));
     }

     throw new Exception("Expected token to be either Word or String.");
 }
Ejemplo n.º 2
0
        /// <summary>Builds a parser <c>State</c> from a sentence whose words already carry tags.</summary>
        /// <remarks>
        /// Every word becomes a small subtree: a tag node whose only child is the word node.
        /// A word that is already a <c>CoreLabel</c> is reused as the word label (its index is
        /// overwritten); any other word is copied into a fresh <c>CoreLabel</c>.
        /// </remarks>
        /// <param name="words">The tagged words, in sentence order.</param>
        /// <returns>A <c>State</c> created from the list of tag nodes.</returns>
        /// <exception cref="System.ArgumentException">
        /// Thrown when a word is neither a <c>CoreLabel</c> nor an <c>IHasTag</c>, or when its tag is null.
        /// </exception>
        public static State InitialStateFromTaggedSentence <_T0>(IList <_T0> words)
            where _T0 : IHasWord
        {
            IList <Tree> tagNodes = Generics.NewArrayList();

            for (int i = 0; i < words.Count; i++)
            {
                IHasWord token = words[i];
                string   posTag;

                CoreLabel wordLabel = token as CoreLabel;
                if (wordLabel != null)
                {
                    posTag = wordLabel.Tag();
                }
                else
                {
                    // Not a CoreLabel: copy the word into a new label and pull the tag from IHasTag.
                    wordLabel = new CoreLabel();
                    wordLabel.SetValue(token.Word());
                    wordLabel.SetWord(token.Word());
                    IHasTag tagged = token as IHasTag;
                    if (tagged == null)
                    {
                        throw new ArgumentException("Expected tagged words");
                    }
                    posTag = tagged.Tag();
                    wordLabel.SetTag(posTag);
                }

                if (posTag == null)
                {
                    throw new ArgumentException("Input word not tagged");
                }

                CoreLabel tagLabel = new CoreLabel();
                tagLabel.SetValue(posTag);

                // Indices start at 1 because downstream tools expect that. The parser itself
                // relies on the index, so an index already present on the label is overwritten.
                int oneBased = i + 1;
                wordLabel.SetIndex(oneBased);
                tagLabel.SetIndex(oneBased);

                LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
                LabeledScoredTreeNode tagNode  = new LabeledScoredTreeNode(tagLabel);
                tagNode.AddChild(wordNode);

                // Both labels point at the same head word / head tag labels.
                // TODO: can we get away with not setting these on the wordLabel?
                wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
                wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
                tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
                tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);

                tagNodes.Add(tagNode);
            }

            return new State(tagNodes);
        }
        /// <summary>Appends a sentence-final punctuation mark to a sentence that lacks one.</summary>
        /// <remarks>
        /// Only the last three positions are inspected (to allow for closing parentheses, etc.).
        /// For each of them a non-empty tag, when available, is tested with
        /// <c>IsSentenceFinalPunctuationTag</c>; otherwise the word is tested with
        /// <c>IsPunctuationWord</c>. When nothing matches, the first entry of the language
        /// pack's sentence-final punctuation words (if there is one) is appended as a <c>Word</c>.
        /// </remarks>
        /// <param name="sentence">The sentence to check; may be extended by one token.</param>
        /// <param name="length">The length of the sentence (just to avoid recomputation).</param>
        /// <returns>
        /// <c>false</c> when punctuation was found in the inspected window; <c>true</c> otherwise,
        /// even if the language pack offered no punctuation word to append.
        /// </returns>
        private bool AddSentenceFinalPunctIfNeeded(IList <IHasWord> sentence, int length)
        {
            int windowStart = length - 3;
            windowStart = (windowStart < 0) ? 0 : windowStart;

            ITreebankLanguagePack langPack = op.tlpParams.TreebankLanguagePack();

            int pos = length - 1;
            while (pos >= windowStart)
            {
                IHasWord token = sentence[pos];

                // An object (e.g., CoreLabel) can implement IHasTag without actually storing a
                // tag, so the tag is only trusted when it is non-empty. When it is present it is
                // used exclusively, since word tokens can be ambiguous.
                IHasTag tagged = token as IHasTag;
                string  posTag = (tagged != null) ? tagged.Tag() : null;
                bool    hasTag = posTag != null && !posTag.IsEmpty();

                bool alreadyPunctuated = hasTag
                    ? langPack.IsSentenceFinalPunctuationTag(posTag)
                    : langPack.IsPunctuationWord(token.Word());
                if (alreadyPunctuated)
                {
                    return false;
                }
                pos--;
            }

            // Nothing found in the window, so add a mark.
            if (op.testOptions.verbose)
            {
                log.Info("Adding missing final punctuation to sentence.");
            }
            string[] finalMarks = langPack.SentenceFinalPunctuationWords();
            if (finalMarks.Length > 0)
            {
                sentence.Add(new Word(finalMarks[0]));
            }
            return true;
        }
Ejemplo n.º 4
0
 /// <summary>Writes a table with one line per document position and one column per sample.</summary>
 /// <remarks>
 /// Each line starts with the word at that position (or the text "null" when the entry is
 /// null) passed through <c>StringUtils.PadOrTrim</c> with width 10. It is followed, for
 /// every sample, by a space and that sample's value at the same position passed through
 /// <c>StringUtils.PadLeft</c> with width 2.
 /// </remarks>
 /// <param name="samples">The samples; every element is cast to <c>int[]</c>.</param>
 /// <param name="out">The writer that receives the table.</param>
 public virtual void PrintSamples(IList samples, TextWriter @out)
 {
     int row = 0;
     while (row < document.Count)
     {
         IHasWord token = (IHasWord)document[row];
         string   shown = (token != null) ? token.Word() : "null";
         @out.Write(StringUtils.PadOrTrim(shown, 10));

         foreach (object entry in samples)
         {
             int[] labels = (int[])entry;
             @out.Write(" " + StringUtils.PadLeft(labels[row], 2));
         }

         @out.WriteLine();
         row++;
     }
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Splits the text of <paramref name="w"/> at the last occurrence of <c>splitChar</c>
        /// into a word part and a tag part.
        /// </summary>
        /// <param name="w">The token whose text may have the form word + splitChar + tag.</param>
        /// <returns>
        /// A new <c>TaggedWord</c> built from the two parts, or <paramref name="w"/> itself when
        /// <c>splitChar</c> is 0, the character does not occur, or it occurs only at position 0.
        /// </returns>
        private IHasWord SplitTag(IHasWord w)
        {
            if (splitChar != 0)
            {
                string text = w.Word();
                int    cut  = text.LastIndexOf(splitChar);

                // A cut at position 0 would produce an empty word, which is not allowed.
                if (cut > 0)
                {
                    string wordPart = Sharpen.Runtime.Substring(text, 0, cut);
                    string tagPart  = Sharpen.Runtime.Substring(text, cut + 1, text.Length);
                    return new TaggedWord(wordPart, tagPart);
                }
            }

            return w;
        }
        /// <summary>
        /// Tokenizes <paramref name="arg"/> and writes the tokens to <paramref name="outStream"/>
        /// as a single space-separated, newline-terminated line.
        /// </summary>
        /// <remarks>
        /// Does nothing when <paramref name="arg"/> is null. The writer is flushed but not
        /// closed here.
        /// </remarks>
        /// <param name="arg">The text to tokenize.</param>
        /// <param name="outStream">The stream that receives the output (written through a "utf-8" writer).</param>
        /// <exception cref="System.IO.IOException"/>
        public virtual void HandleTokenize(string arg, OutputStream outStream)
        {
            if (arg == null)
            {
                return;
            }

            IList <IHasWord>   tokenList = parser.Tokenize(arg);
            OutputStreamWriter writer    = new OutputStreamWriter(outStream, "utf-8");

            bool first = true;
            foreach (IHasWord token in tokenList)
            {
                // A single space goes between tokens, never before the first one.
                if (!first)
                {
                    writer.Write(" ");
                }
                first = false;
                writer.Write(token.ToString());
            }

            writer.Write("\n");
            writer.Flush();
        }
Ejemplo n.º 7
0
        /// <summary>Rewrites plain double-quote tokens as alternating opening and closing quotes.</summary>
        /// <remarks>
        /// When the last token is a double quote, tokens are processed from the end and the
        /// first quote met (the last one in the sentence) becomes a closing quote. Otherwise
        /// tokens are processed from the beginning and the first quote becomes an opening
        /// quote. The tokens of <paramref name="input"/> are modified in place through
        /// <c>SetWord</c>; the returned list holds those same tokens in their original order.
        /// </remarks>
        /// <param name="input">The tokens to fix.</param>
        /// <returns>A new list with the same tokens; empty when <paramref name="input"/> is empty.</returns>
        private static IList <IHasWord> FixQuotes(IList <IHasWord> input)
        {
            int count = input.Count;
            LinkedList <IHasWord> fixedTokens = new LinkedList <IHasWord>();

            if (count == 0)
            {
                return fixedTokens;
            }

            bool endsWithQuote = input[count - 1].Word().Equals("\"");
            if (endsWithQuote)
            {
                // Walk backwards: the final quote closes, the one before it opens, and so on.
                bool openNext = false;
                for (int pos = count - 1; pos >= 0; pos--)
                {
                    IHasWord token = input[pos];
                    if (token.Word().Equals("\""))
                    {
                        token.SetWord(openNext ? "``" : "\'\'");
                        openNext = !openNext;
                    }
                    // Prepending restores the original order.
                    fixedTokens.AddFirst(token);
                }
            }
            else
            {
                // Walk forwards: the first quote opens, the next closes, and so on.
                bool openNext = true;
                for (int pos = 0; pos < count; pos++)
                {
                    IHasWord token = input[pos];
                    if (token.Word().Equals("\""))
                    {
                        token.SetWord(openNext ? "``" : "\'\'");
                        openNext = !openNext;
                    }
                    fixedTokens.AddLast(token);
                }
            }

            return fixedTokens;
        }
        /// <summary>
        /// Creates the initial chart items for a sentence: one item per (word position, tag)
        /// pair offered by the lexicon.
        /// </summary>
        /// <remarks>
        /// As a side effect this resets the <c>words</c>, <c>taggedWordList</c> and
        /// <c>originalLabels</c> fields for the new sentence and repeatedly overwrites the
        /// shared <c>tempEdge</c> scratch object.
        /// </remarks>
        /// <param name="wordList">The words of the sentence, in order.</param>
        /// <returns>The list of initial items, in word order and then lexicon iteration order.</returns>
        protected internal virtual IList <Item> MakeInitialItems <_T0>(IList <_T0> wordList)
            where _T0 : IHasWord
        {
            IList <Item> itemList = new List <Item>();
            int          length   = wordList.Count;
            int          numTags  = tagIndex.Size();

            // Per-sentence state kept on the instance.
            words          = new int[length];
            taggedWordList = new IList[length];
            int terminalCount = 0;

            originalLabels = new CoreLabel[wordList.Count];
            for (int i = 0; i < length; i++)
            {
                taggedWordList[i] = new List <IntTaggedWord>(numTags);
                IHasWord wordObject = wordList[i];
                // Remember the caller's CoreLabel (if any); other entries stay null.
                if (wordObject is CoreLabel)
                {
                    originalLabels[i] = (CoreLabel)wordObject;
                }
                string wordStr = wordObject.Word();
                //Word context (e.g., morphosyntactic info)
                string wordContextStr = null;
                if (wordObject is IHasContext)
                {
                    wordContextStr = ((IHasContext)wordObject).OriginalText();
                    // An empty context is treated the same as no context.
                    if (string.Empty.Equals(wordContextStr))
                    {
                        wordContextStr = null;
                    }
                }
                // Words missing from the word index are mapped to the unknown-word symbol.
                if (!wordIndex.Contains(wordStr))
                {
                    wordStr = LexiconConstants.UnknownWord;
                }
                int word = wordIndex.IndexOf(wordStr);
                words[i] = word;
                // One initial item for every tagging the lexicon proposes for this word at this position.
                for (IEnumerator <IntTaggedWord> tagI = lex.RuleIteratorByWord(word, i, wordContextStr); tagI.MoveNext();)
                {
                    IntTaggedWord tagging = tagI.Current;
                    int           tag     = tagging.tag;
                    //String curTagStr = tagIndex.get(tag);
                    //if (!tagStr.equals("") && !tagStr.equals(curTagStr))
                    //  continue;
                    // The state is looked up by the tag's string form.
                    int state = stateIndex.IndexOf(tagIndex.Get(tag));
                    //itemList.add(makeInitialItem(i,tag,state,1.0*tagging.score));
                    // THIS WILL CAUSE BUGS!!!  Don't use with another A* scorer
                    // The shared tempEdge is filled in as a one-word span [i, i + 1) headed at i,
                    // then handed to the scorer.
                    tempEdge.state = state;
                    tempEdge.head  = i;
                    tempEdge.start = i;
                    tempEdge.end   = i + 1;
                    tempEdge.tag   = tag;
                    itemList.Add(MakeInitialItem(i, tag, state, scorer.IScore(tempEdge)));
                    terminalCount++;
                    taggedWordList[i].Add(new IntTaggedWord(word, tag));
                }
            }
            if (op.testOptions.verbose)
            {
                log.Info("Terminals (# of tag edges in chart): " + terminalCount);
            }
            return(itemList);
        }
Ejemplo n.º 9
0
        /// <summary>Reads tokens until one complete bracketed tree has been assembled.</summary>
        /// <remarks>
        /// A left parenthesis opens a new node whose label is the following token (or null when
        /// another left parenthesis follows immediately). A right parenthesis closes the current
        /// node. Any other token becomes a leaf of the current node. Leaves are numbered from 1
        /// in reading order; where the leaf label supports it, its word is set to its own value
        /// and its tag to the value of the parent's label. Partial progress is kept in the
        /// <c>currentTree</c> and <c>stack</c> fields.
        /// </remarks>
        /// <returns>
        /// The completed tree, or <c>null</c> when reading stops before a tree is completed:
        /// the tokens run out, an unmatched right parenthesis is met, or a token appears
        /// outside any tree.
        /// </returns>
        /// <exception cref="Java.Util.NoSuchElementException"/>
        private Tree GetTreeFromInputStream()
        {
            int wordIndex = 1;

            // FSA
            while (tokenizer.MoveNext())
            {
                string token = tokenizer.Current;
                switch (token)
                {
                case leftParen:
                {
                    // cdm 20100225: This next line used to have "" instead of null, but the traditional and current tree normalizers depend on the label being null not "" when there is no label on a tree (like the outermost English PTB level)
                    string label = null;
                    if (!tokenizer.Peek().Equals(leftParen))
                    {
                        // The label is the token AFTER the parenthesis, so the tokenizer has to
                        // be advanced first. Reading Current without advancing would hand back
                        // the parenthesis itself and leave the real label to be misread as a
                        // terminal on the next iteration.
                        if (!tokenizer.MoveNext())
                        {
                            // Input ended right after the parenthesis: treat as incomplete.
                            goto label_break;
                        }
                        label = tokenizer.Current;
                    }
                    if (rightParen.Equals(label))
                    {
                        //Skip past empty trees
                        continue;
                    }
                    else
                    {
                        if (treeNormalizer != null)
                        {
                            label = treeNormalizer.NormalizeNonterminal(label);
                        }
                    }
                    if (label != null)
                    {
                        // Whatever StarPattern / SlashPattern match is replaced by a plain "*" / "/".
                        label = StarPattern.Matcher(label).ReplaceAll("*");
                        label = SlashPattern.Matcher(label).ReplaceAll("/");
                    }
                    Tree newTree = treeFactory.NewTreeNode(label, null);
                    // dtrs are added below
                    if (currentTree == null)
                    {
                        stack.Add(newTree);
                    }
                    else
                    {
                        currentTree.AddChild(newTree);
                        stack.Add(currentTree);
                    }
                    currentTree = newTree;
                    break;
                }

                case rightParen:
                {
                    if (stack.IsEmpty())
                    {
                        // Warn that file has too many right parentheses.
                        // Note: control leaves the reading loop here and falls into the reject path.
                        log.Info("PennTreeReader: warning: file has extra non-matching right parenthesis [ignored]");
                        goto label_break;
                    }
                    //Accept
                    currentTree = stack.Remove(stack.Count - 1);
                    // i.e., stack.pop()
                    if (stack.IsEmpty())
                    {
                        return(currentTree);
                    }
                    break;
                }

                default:
                {
                    if (currentTree == null)
                    {
                        // A careful Reader should warn here, but it's kind of useful to
                        // suppress this because then the TreeReader doesn't print a ton of
                        // messages if there is a README file in a directory of Trees.
                        // log.info("PennTreeReader: warning: file has extra token not in a s-expression tree: " + token + " [ignored]");
                        goto label_break;
                    }
                    string terminal = (treeNormalizer == null) ? token : treeNormalizer.NormalizeTerminal(token);
                    terminal = StarPattern.Matcher(terminal).ReplaceAll("*");
                    terminal = SlashPattern.Matcher(terminal).ReplaceAll("/");
                    Tree leaf = treeFactory.NewLeaf(terminal);
                    if (leaf.Label() is IHasIndex)
                    {
                        IHasIndex hi = (IHasIndex)leaf.Label();
                        hi.SetIndex(wordIndex);
                    }
                    if (leaf.Label() is IHasWord)
                    {
                        IHasWord hw = (IHasWord)leaf.Label();
                        hw.SetWord(leaf.Label().Value());
                    }
                    if (leaf.Label() is IHasTag)
                    {
                        // The enclosing node's label serves as the leaf's tag.
                        IHasTag ht = (IHasTag)leaf.Label();
                        ht.SetTag(currentTree.Label().Value());
                    }
                    wordIndex++;
                    currentTree.AddChild(leaf);
                    // cdm: Note: this implementation just isn't as efficient as the old recursive descent parser (see 2008 code), where all the daughters are gathered before the tree is made....
                    break;
                }
                }
            }
            label_break :;
            //Reject
            if (currentTree != null)
            {
                log.Info("PennTreeReader: warning: incomplete tree (extra left parentheses in input): " + currentTree);
            }
            return(null);
        }