Example #1
        /// <summary>Trains this unknown-word model on a single tagged word from the training data.</summary>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
            IntTaggedWord iT  = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, iTW.tag);
            IntTaggedWord iW  = new IntTaggedWord(iTW.word, UnknownWordModelTrainerConstants.nullTag);

            seenCounter.IncrementCount(iW, weight);
            IntTaggedWord i = UnknownWordModelTrainerConstants.NullItw;

            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once some way through trees;
                // treesRead is 1 based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    // it's an entirely unknown word
                    int           s   = model.GetSignatureIndex(iTW.word, loc, wordIndex.Get(iTW.word));
                    IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
                    IntTaggedWord iS  = new IntTaggedWord(s, UnknownWordModelTrainerConstants.nullTag);
                    unSeenCounter.IncrementCount(iTS, weight);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(iS, weight);
                    unSeenCounter.IncrementCount(i, weight);
                }
            }
        }
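A minimal usage sketch (not part of the library): a trainer like this is normally driven once per word of a training sentence, with loc giving the word's position, much as TrainUnannotated does in Example #7 below; trainer, taggedSentence and the weight of 1.0 are placeholders.

        int loc = 0;
        foreach (TaggedWord tw in taggedSentence)
        {
            trainer.Train(tw, loc, 1.0);
            loc++;
        }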
        /// <summary>
        /// Join verb parts as in the Dadegan corpus.
        /// Input:
        ///     دیده/ADJ_INO
        ///     شد/V_PA
        /// Output:
        ///     دیده شد/V_PA
        /// </summary>
        /// <param name="sentence">List of TaggedWord object </param>
        /// <returns>List of TaggedWord</returns>
        public static List<TaggedWord> JoinVerbParts(List<TaggedWord> sentence)
        {
            sentence.Reverse();
            var result = new List<TaggedWord>();
            var beforeTaggedWord = new TaggedWord("", "");
            foreach (var taggedWord in sentence)
            {
                if (PeykareReader.tokenizer.BeforeVerbs.Contains(taggedWord.word()) ||
                    (PeykareReader.tokenizer.AfterVerbs.Contains(beforeTaggedWord.word()) &&
                     PeykareReader.tokenizer.Verbs.Contains(taggedWord.word())))
                {
                    beforeTaggedWord.setWord(taggedWord.word() + " " + beforeTaggedWord.word());
                    if (result.Count == 0)
                        result.Add(beforeTaggedWord);
                }
                else
                {
                    result.Add(taggedWord);
                    beforeTaggedWord = taggedWord;
                }
            }

            result.Reverse();
            return result;
        }
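Usage sketch for the case in the summary above (assumes PeykareReader.tokenizer has its BeforeVerbs/AfterVerbs/Verbs word lists loaded; whether the two parts actually merge depends on those lists):

        var sentence = new List<TaggedWord>
        {
            new TaggedWord("دیده", "ADJ_INO"),
            new TaggedWord("شد", "V_PA")
        };
        List<TaggedWord> joined = JoinVerbParts(sentence);
        // per the summary, the expected result is a single TaggedWord
        // whose word() is "دیده شد" and whose tag is "V_PA"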
Example #3
        public virtual void Train(TaggedWord tw, int loc, double weight)
        {
            uwModelTrainer.Train(tw, loc, weight);
            IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);

            seenCounter.IncrementCount(iTW, weight);
            IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);

            seenCounter.IncrementCount(iT, weight);
            IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);

            seenCounter.IncrementCount(iW, weight);
            IntTaggedWord i = new IntTaggedWord(nullWord, nullTag);

            seenCounter.IncrementCount(i, weight);
            // rules.add(iTW);
            tags.Add(iT);
            words.Add(iW);
            string            tag     = tw.Tag();
            string            baseTag = op.Langpack().BasicCategory(tag);
            ICounter <string> counts  = baseTagCounts[baseTag];

            if (counts == null)
            {
                counts = new ClassicCounter <string>();
                baseTagCounts[baseTag] = counts;
            }
            counts.IncrementCount(tag, weight);
        }
Example #4
        private List <TaggedWord> JoinVerbParts(List <TaggedWord> sentence)
        {
            sentence.Reverse();
            var result           = new List <TaggedWord>();
            var beforeTaggedWord = new TaggedWord("", "");

            foreach (var taggedWord in sentence)
            {
                if (this.tokenizer.BeforeVerbs.Contains(taggedWord.word()) ||
                    (this.tokenizer.AfterVerbs.Contains(beforeTaggedWord.word()) &&
                     this.tokenizer.Verbs.Contains(taggedWord.word())))
                {
                    beforeTaggedWord.setWord(taggedWord.word() + " " + beforeTaggedWord.word());
                    if (result.Count == 0)
                    {
                        result.Add(beforeTaggedWord);
                    }
                }
                else
                {
                    result.Add(taggedWord);
                    beforeTaggedWord = taggedWord;
                }
            }

            result.Reverse();
            return(result);
        }
Example #5
 internal virtual Tree BinarizeLocalTree(Tree t, int headNum, TaggedWord head)
 {
     //System.out.println("Working on: "+headNum+" -- "+t.label());
     if (markovFactor)
     {
         string topCat   = t.Label().Value();
         ILabel newLabel = new CategoryWordTag(topCat, head.Word(), head.Tag());
         t.SetLabel(newLabel);
         Tree t2;
         if (insideFactor)
         {
             t2 = MarkovInsideBinarizeLocalTreeNew(t, headNum, 0, t.NumChildren() - 1, true);
         }
         else
         {
             //          t2 = markovInsideBinarizeLocalTree(t, head, headNum, topCat, false);
             t2 = MarkovOutsideBinarizeLocalTree(t, head, headNum, topCat, new LinkedList <Tree>(), false);
         }
         return(t2);
     }
     if (insideFactor)
     {
         return(InsideBinarizeLocalTree(t, headNum, head, 0, 0));
     }
     return(OutsideBinarizeLocalTree(t, t.Label().Value(), t.Label().Value(), headNum, head, 0, string.Empty, 0, string.Empty));
 }
        //The specific types have stricter constructors than the parent type.
        //And SO says that that is okay.
        //https://stackoverflow.com/questions/5490824/should-constructors-comply-with-the-liskov-substitution-principle
        public Token(string word, bool iAmAPrepositionChain = false)
        {
            if (word == null)
            {
                throw new ArgumentNullException("word");
            }

            if (word.EndCheck(" ") && word != " ")
            {
                throw new InvalidOperationException("Untrimmed word.");
            }
            if (!iAmAPrepositionChain)
            {
                if (String.IsNullOrWhiteSpace(word))
                {
                    throw new ArgumentNullException("word", "Token cannot be null or white space");
                }
                if (word.ContainsCheck(" ") && !TaggedWord.CheckIsTagWord(word))
                {
                    throw new ArgumentNullException("word",
                                                    "Token can't contain white space-- must use *, - or other punctuation to join words into a single token : " +
                                                    word);
                }
            }
            if (word.ContainsCheck("\n"))
            {
                throw new ArgumentNullException("word", "Token cannot contain white space-- must use *, - or other punctuation to join words into a single token : " + word);
            }
            if (word.ContainsCheck("\t"))
            {
                throw new ArgumentNullException("word", "Token cannot contain white space-- must use *, - or other punctuation to join words into a single token : " + word);
            }
            this.word = word;
        }
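A quick sketch of the constructor's contract (EndCheck, ContainsCheck and CheckIsTagWord are assumed to be this project's own helpers, as used above):

        var plain = new Token("pona");            // single trimmed word: accepted
        var chain = new Token("jan pona", true);  // spaces allowed when iAmAPrepositionChain is true
        // new Token("pona ");                    // throws InvalidOperationException: untrimmed word
        // new Token("jan pona");                 // throws unless TaggedWord.CheckIsTagWord accepts it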
Example #7
        public void TrainUnannotated(IList <TaggedWord> sentence, double weight)
        {
            uwModelTrainer.IncrementTreesRead(weight);
            int loc = 0;

            foreach (TaggedWord tw in sentence)
            {
                string            baseTag = op.Langpack().BasicCategory(tw.Tag());
                ICounter <string> counts  = baseTagCounts[baseTag];
                if (counts == null)
                {
                    ++loc;
                    continue;
                }
                double totalCount = counts.TotalCount();
                if (totalCount == 0)
                {
                    ++loc;
                    continue;
                }
                foreach (string tag in counts.KeySet())
                {
                    TaggedWord newTW = new TaggedWord(tw.Word(), tag);
                    Train(newTW, loc, weight * counts.GetCount(tag) / totalCount);
                }
                ++loc;
            }
        }
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }
            // scan data
            string word      = tw.Word();
            string subString = model.GetSignature(word, loc);
            ILabel tag       = new Tag(tw.Tag());

            if (!c.Contains(tag))
            {
                c[tag] = new ClassicCounter <string>();
            }
            c[tag].IncrementCount(subString, weight);
            tc.IncrementCount(tag, weight);
            seenEnd.Add(subString);
            string        tagStr = tw.Tag();
            IntTaggedWord iW     = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);

            seenCounter.IncrementCount(iW, weight);
            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once some way through trees;
                // treesRead is 1 based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tagStr, wordIndex, tagIndex);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
                }
            }
        }
Example #9
        private Tree InsideBinarizeLocalTree(Tree t, int headNum, TaggedWord head, int leftProcessed, int rightProcessed)
        {
            string       word        = head.Word();
            string       tag         = head.Tag();
            IList <Tree> newChildren = new List <Tree>(2);

            // check done
            if (t.NumChildren() <= leftProcessed + rightProcessed + 2)
            {
                Tree leftChild = t.GetChild(leftProcessed);
                newChildren.Add(leftChild);
                if (t.NumChildren() == leftProcessed + rightProcessed + 1)
                {
                    // unary ... so top level
                    string finalCat = t.Label().Value();
                    return(tf.NewTreeNode(new CategoryWordTag(finalCat, word, tag), newChildren));
                }
                // binary
                Tree rightChild = t.GetChild(leftProcessed + 1);
                newChildren.Add(rightChild);
                string labelStr = t.Label().Value();
                if (leftProcessed != 0 || rightProcessed != 0)
                {
                    labelStr = ("@ " + leftChild.Label().Value() + ' ' + rightChild.Label().Value());
                }
                return(tf.NewTreeNode(new CategoryWordTag(labelStr, word, tag), newChildren));
            }
            if (headNum > leftProcessed)
            {
                // eat left word
                Tree leftChild  = t.GetChild(leftProcessed);
                Tree rightChild = InsideBinarizeLocalTree(t, headNum, head, leftProcessed + 1, rightProcessed);
                newChildren.Add(leftChild);
                newChildren.Add(rightChild);
                string labelStr = ("@ " + leftChild.Label().Value() + ' ' + Sharpen.Runtime.Substring(rightChild.Label().Value(), 2));
                if (leftProcessed == 0 && rightProcessed == 0)
                {
                    labelStr = t.Label().Value();
                }
                return(tf.NewTreeNode(new CategoryWordTag(labelStr, word, tag), newChildren));
            }
            else
            {
                // eat right word
                Tree leftChild  = InsideBinarizeLocalTree(t, headNum, head, leftProcessed, rightProcessed + 1);
                Tree rightChild = t.GetChild(t.NumChildren() - rightProcessed - 1);
                newChildren.Add(leftChild);
                newChildren.Add(rightChild);
                string labelStr = ("@ " + Sharpen.Runtime.Substring(leftChild.Label().Value(), 2) + ' ' + rightChild.Label().Value());
                if (leftProcessed == 0 && rightProcessed == 0)
                {
                    labelStr = t.Label().Value();
                }
                return(tf.NewTreeNode(new CategoryWordTag(labelStr, word, tag), newChildren));
            }
        }
        /// <summary>
        /// Gets formatted chunk information for a specified sentence.
        /// </summary>
        /// <param name="tokens">
        /// string array of tokens in the sentence
        /// </param>
        /// <param name="tags">
        /// string array of POS tags for the tokens in the sentence
        /// </param>
        /// <returns>
        /// A list of SentenceChunk objects representing the chunked sentence
        /// </returns>
        public List <SentenceChunk> GetChunks(string[] tokens, string[] tags)
        {
            var results = new List <SentenceChunk>();

            string[]      chunks = Chunk(tokens, tags);
            SentenceChunk currentSentenceChunk = null;

            for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
            {
                if (chunks[currentChunk].StartsWith("B-") || chunks[currentChunk] == "O")
                {
                    if (currentSentenceChunk != null)
                    {
                        results.Add(currentSentenceChunk);
                    }

                    var index = results.Count;
                    if (chunks[currentChunk].Length > 2)
                    {
                        var tag = chunks[currentChunk].Substring(2);
                        currentSentenceChunk = new SentenceChunk(tag, index);
                    }
                    else
                    {
                        currentSentenceChunk = new SentenceChunk(index);
                    }
                }

                var word = tokens[currentChunk];
                var wTag = tags[currentChunk];

                // Filter out specific words from chunking results
                if (wTag[0] == 'N' || wTag[0] == 'J')
                {
                    if (FrequentWordsList.ContainsKey(word.ToLower()))
                    {
                        continue;
                    }
                }

                if (currentSentenceChunk == null)
                {
                    currentSentenceChunk = new SentenceChunk(0);
                }

                var wIndex     = currentSentenceChunk.TaggedWords.Count;
                var taggedWord = new TaggedWord(word, wTag, wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
            }
            // add last chunk
            results.Add(currentSentenceChunk);

            return(results);
        }
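Usage sketch (tokens and tags are parallel arrays; the chunk labels and the resulting grouping depend on the model behind Chunk(), so the output here is illustrative only):

        string[] tokens = { "The", "red", "fox", "runs" };
        string[] tags   = { "DT", "JJ", "NN", "VBZ" };
        List<SentenceChunk> chunked = GetChunks(tokens, tags);
        foreach (SentenceChunk sc in chunked)
        {
            System.Console.WriteLine(sc.TaggedWords.Count); // words grouped into this chunk
        }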
        public virtual void Train(TaggedWord tw, double weight)
        {
            tokens = tokens + weight;
            string word = tw.Word();
            string tag  = tw.Tag();
            // TaggedWord has crummy equality conditions
            Pair <string, string> wt = new Pair <string, string>(word, tag);

            wtCount.IncrementCount(wt, weight);
            tagCount.IncrementCount(tag, weight);
            seenWords.Add(word);
        }
Example #12
        // pcfgPE.printGoodBad();
        private static IList <TaggedWord> CleanTags(IList <TaggedWord> twList, ITreebankLanguagePack tlp)
        {
            int sz = twList.Count;
            IList <TaggedWord> l = new List <TaggedWord>(sz);

            foreach (TaggedWord tw in twList)
            {
                TaggedWord tw2 = new TaggedWord(tw.Word(), tlp.BasicCategory(tw.Tag()));
                l.Add(tw2);
            }
            return(l);
        }
        public PosSentence(List taggedSentence)
        {
            Words = new List <PosTaggedWord>();

            var i = taggedSentence.iterator();

            while (i.hasNext())
            {
                TaggedWord x = (TaggedWord)i.next();

                PosTaggedWord word = new PosTaggedWord(x.word(), x.tag());
                Words.Add(word);
            }
        }
Example #14
        /// <summary>
        /// Gets formatted chunk information for a specified sentence.
        /// </summary>
        /// <param name="tokens">
        /// string array of tokens in the sentence
        /// </param>
        /// <param name="tags">
        /// string array of POS tags for the tokens in the sentence
        /// </param>
        /// <param name="chunks">
        /// already chunked
        /// </param>
        /// <returns>
        /// A list of SentenceChunk objects representing the chunked sentence
        /// </returns>
        public List <SentenceChunk> GetChunks(string[] tokens, string[] tags, string[] chunks)
        {
            var results = new List <SentenceChunk>();

            SentenceChunk currentSentenceChunk = null;

            for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
            {
                if (
                    // Per https://opennlp.apache.org/docs/1.5.3/manual/opennlp.html
                    // it seems like B- is expected when it's the first chunk.
                    // But in practice, with "Awesome!" it returns "I-NP" as the first chunk.
                    (currentChunk == 0 && chunks[currentChunk].StartsWith("I-")) ||
                    chunks[currentChunk].StartsWith("B-") ||
                    chunks[currentChunk] == "O")
                {
                    if (currentSentenceChunk != null)
                    {
                        results.Add(currentSentenceChunk);
                    }

                    var index = results.Count;
                    if (chunks[currentChunk].Length > 2)
                    {
                        var tag = chunks[currentChunk].Substring(2);
                        currentSentenceChunk = new SentenceChunk(tag, index);
                    }
                    else
                    {
                        currentSentenceChunk = new SentenceChunk(index);
                    }
                }

                // in all cases add the tagged word
                var word       = tokens[currentChunk];
                var wTag       = tags[currentChunk];
                var wIndex     = currentSentenceChunk.TaggedWords.Count;
                var taggedWord = new TaggedWord(word, wTag, wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
            }
            // add last chunk
            results.Add(currentSentenceChunk);

            return(results);
        }
        /// <summary>Trains the first-character based unknown word model.</summary>
        /// <param name="tw">The word we are currently training on</param>
        /// <param name="loc">The position of that word</param>
        /// <param name="weight">The weight to give this word in terms of training</param>
        public override void Train(TaggedWord tw, int loc, double weight)
        {
            if (useGT)
            {
                unknownGTTrainer.Train(tw, weight);
            }
            string word  = tw.Word();
            ILabel tagL  = new Tag(tw.Tag());
            string first = Sharpen.Runtime.Substring(word, 0, 1);

            if (useUnicodeType)
            {
                char ch   = word[0];
                int  type = char.GetType(ch);
                if (type != char.OtherLetter)
                {
                    // standard Chinese characters are of type "OTHER_LETTER"!!
                    first = int.ToString(type);
                }
            }
            string tag = tw.Tag();

            if (!c.Contains(tagL))
            {
                c[tagL] = new ClassicCounter <string>();
            }
            c[tagL].IncrementCount(first, weight);
            tc.IncrementCount(tagL, weight);
            seenFirst.Add(first);
            IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);

            seenCounter.IncrementCount(iW, weight);
            if (treesRead > indexToStartUnkCounting)
            {
                // start doing this once some way through trees;
                // treesRead is 1 based counting
                if (seenCounter.GetCount(iW) < 2)
                {
                    IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tag, wordIndex, tagIndex);
                    unSeenCounter.IncrementCount(iT, weight);
                    unSeenCounter.IncrementCount(iTotal, weight);
                }
            }
        }
Example #16
        public List <TaggedWord> BatchTag(List <string> sentence)
        {
            string[] sen = new string[sentence.Count];
            for (int i = 0; i < sentence.Count; i++)
            {
                sen[i] = sentence[i].Replace(" ", "_");
            }
            List      newSent        = Sentence.toWordList(sen);
            ArrayList taggedSentence = this._tagger.tagSentence(newSent);

            var taggedSen = new List <TaggedWord>();

            for (int i = 0; i < taggedSentence.size(); i++)
            {
                TaggedWord tw = (TaggedWord)taggedSentence.get(i);
                tw.setWord(sentence[i]);
                taggedSen.Add(tw);
            }
            return(taggedSen);
        }
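Usage sketch (the underlying _tagger is a Java MaxentTagger reached through IKVM and must already be loaded with a Persian model; multi-word tokens are joined with "_" only while tagging, as the method shows):

        var words = new List<string> { "او", "به خانه", "رفت" };
        List<TaggedWord> tagged = BatchTag(words);
        foreach (TaggedWord tw in tagged)
        {
            System.Console.WriteLine(tw.word() + "/" + tw.tag());
        }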
Example #17
        /// <summary>Turns a sentence into a flat phrasal tree.</summary>
        /// <remarks>
        /// Turns a sentence into a flat phrasal tree.
        /// The structure is S -&gt; tag*.  And then each tag goes to a word.
        /// The tag is either found from the label or made "WD".
        /// The tag and phrasal node have a StringLabel.
        /// </remarks>
        /// <param name="s">The Sentence to make the Tree from</param>
        /// <param name="lf">The LabelFactory with which to create the new Tree labels</param>
        /// <returns>The one phrasal level Tree</returns>
        public static Tree ToFlatTree <_T0>(IList <_T0> s, ILabelFactory lf)
            where _T0 : IHasWord
        {
            IList <Tree> daughters = new List <Tree>(s.Count);

            foreach (IHasWord word in s)
            {
                Tree wordNode = new LabeledScoredTreeNode(lf.NewLabel(word.Word()));
                if (word is TaggedWord)
                {
                    TaggedWord taggedWord = (TaggedWord)word;
                    wordNode = new LabeledScoredTreeNode(new StringLabel(taggedWord.Tag()), Java.Util.Collections.SingletonList(wordNode));
                }
                else
                {
                    wordNode = new LabeledScoredTreeNode(lf.NewLabel("WD"), Java.Util.Collections.SingletonList(wordNode));
                }
                daughters.Add(wordNode);
            }
            return(new LabeledScoredTreeNode(new StringLabel("S"), daughters));
        }
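Usage sketch (StringLabelFactory is assumed here as the ILabelFactory; any label factory from the same package would do):

        IList<TaggedWord> sent = new List<TaggedWord>
        {
            new TaggedWord("Dogs", "NNS"),
            new TaggedWord("bark", "VBP")
        };
        Tree flat = ToFlatTree(sent, new StringLabelFactory());
        // flat is the one-level tree (S (NNS Dogs) (VBP bark))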
Example #18
        internal virtual List <TaggedWord> GetTaggedSentence()
        {
            bool hasOffset;

            hasOffset = origWords != null && origWords.Count > 0 && (origWords[0] is IHasOffset);
            List <TaggedWord> taggedSentence = new List <TaggedWord>();

            for (int j = 0; j < size - 1; j++)
            {
                string     tag = finalTags[j];
                TaggedWord w   = new TaggedWord(sent[j], tag);
                if (hasOffset)
                {
                    IHasOffset offset = (IHasOffset)origWords[j];
                    w.SetBeginPosition(offset.BeginPosition());
                    w.SetEndPosition(offset.EndPosition());
                }
                taggedSentence.Add(w);
            }
            return(taggedSentence);
        }
Example #19
        /// <summary>
        /// Gets formatted chunk information for a specified sentence.
        /// </summary>
        /// <param name="tokens">
        /// string array of tokens in the sentence
        /// </param>
        /// <param name="tags">
        /// string array of POS tags for the tokens in the sentence
        /// </param>
        /// <returns>
        /// A list of SentenceChunk objects representing the chunked sentence
        /// </returns>
        public List <SentenceChunk> GetChunks(string[] tokens, string[] tags)
        {
            var results = new List <SentenceChunk>();

            string[]      chunks = Chunk(tokens, tags);
            SentenceChunk currentSentenceChunk = null;

            for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
            {
                if (chunks[currentChunk].StartsWith("B-") || chunks[currentChunk] == "O")
                {
                    if (currentSentenceChunk != null)
                    {
                        results.Add(currentSentenceChunk);
                    }

                    var index = results.Count;
                    if (chunks[currentChunk].Length > 2)
                    {
                        var tag = chunks[currentChunk].Substring(2);
                        currentSentenceChunk = new SentenceChunk(tag, index);
                    }
                    else
                    {
                        currentSentenceChunk = new SentenceChunk(index);
                    }
                }

                // in all cases add the tagged word
                var word       = tokens[currentChunk];
                var wTag       = tags[currentChunk];
                var wIndex     = currentSentenceChunk.TaggedWords.Count;
                var taggedWord = new TaggedWord(word, wTag, wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
            }
            // add last chunk
            results.Add(currentSentenceChunk);

            return(results);
        }
Example #20
        /// <summary>
        /// Gets formatted chunk information for a specified sentence.
        /// </summary>
        /// <param name="tokens">
        /// string array of tokens in the sentence
        /// </param>
        /// <param name="tags">
        /// string array of POS tags for the tokens in the sentence
        /// </param>
        /// <returns>
        /// An array of SentenceChunk objects representing the chunked sentence
        /// </returns>
        public SentenceChunk[] GetChunks(string[] tokens, string[] tags)
        {
            var results = new List <SentenceChunk>();

            string[]      chunks = Chunk(tokens, tags);
            SentenceChunk currentSentenceChunk = null;

            for (int i = 0; i < chunks.Length; i++)
            {
                if (i > 0 &&
                    !chunks[i].StartsWith("I-") &&
                    chunks[i - 1] != "O")
                {
                    currentSentenceChunk = null;
                }

                if (chunks[i].StartsWith("B-"))
                {
                    currentSentenceChunk = new SentenceChunk(chunks[i].Substring(2), i);
                    results.Add(currentSentenceChunk);
                }

                if (currentSentenceChunk == null)
                {
                    currentSentenceChunk = new SentenceChunk(results.Count);
                    results.Add(currentSentenceChunk);
                }

                // in all cases add the tagged word
                var word       = tokens[i];
                var wTag       = tags[i];
                var wIndex     = currentSentenceChunk.TaggedWords.Count;
                var taggedWord = new TaggedWord(word, wTag, wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
            }

            return(results.ToArray());
        }
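Usage sketch showing how B-/I-/O labels drive the grouping in this overload (the chunk labels below are hypothetical; real ones come from Chunk()):

        string[] tokens = { "She", "sells", "sea", "shells" };
        string[] tags   = { "PRP", "VBZ", "NN", "NNS" };
        // suppose Chunk(tokens, tags) returned { "B-NP", "B-VP", "B-NP", "I-NP" }
        SentenceChunk[] chunked = GetChunks(tokens, tags);
        // then chunked holds three chunks: [She], [sells], [sea shells]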
Example #21
        private Tree MarkovOutsideBinarizeLocalTree(Tree t, TaggedWord head, int headLoc, string topCat, LinkedList <Tree> ll, bool doneLeft)
        {
            string       word        = head.Word();
            string       tag         = head.Tag();
            IList <Tree> newChildren = new List <Tree>(2);

            // call with t, headNum, head, topCat, false
            if (headLoc == 0)
            {
                if (!doneLeft)
                {
                    // insert a unary to separate the sides
                    if (tlp.IsStartSymbol(topCat))
                    {
                        return(MarkovOutsideBinarizeLocalTree(t, head, headLoc, topCat, new LinkedList <Tree>(), true));
                    }
                    string subLabelStr;
                    if (simpleLabels)
                    {
                        subLabelStr = '@' + topCat;
                    }
                    else
                    {
                        string headStr = t.GetChild(headLoc).Label().Value();
                        subLabelStr = '@' + topCat + ": " + headStr + " ]";
                    }
                    ILabel subLabel = new CategoryWordTag(subLabelStr, word, tag);
                    Tree   subTree  = tf.NewTreeNode(subLabel, t.GetChildrenAsList());
                    newChildren.Add(MarkovOutsideBinarizeLocalTree(subTree, head, headLoc, topCat, new LinkedList <Tree>(), true));
                    return(tf.NewTreeNode(t.Label(), newChildren));
                }
                int len = t.NumChildren();
                // len = 1
                if (len == 1)
                {
                    return(tf.NewTreeNode(t.Label(), Java.Util.Collections.SingletonList(t.GetChild(0))));
                }
                ll.AddFirst(t.GetChild(len - 1));
                if (ll.Count > markovOrder)
                {
                    ll.RemoveLast();
                }
                // generate a right
                string subLabelStr_1;
                if (simpleLabels)
                {
                    subLabelStr_1 = '@' + topCat;
                }
                else
                {
                    string headStr  = t.GetChild(headLoc).Label().Value();
                    string rightStr = (len > markovOrder - 1 ? "... " : string.Empty) + Join(ll);
                    subLabelStr_1 = '@' + topCat + ": " + headStr + ' ' + rightStr;
                }
                ILabel subLabel_1 = new CategoryWordTag(subLabelStr_1, word, tag);
                Tree   subTree_1  = tf.NewTreeNode(subLabel_1, t.GetChildrenAsList().SubList(0, len - 1));
                newChildren.Add(MarkovOutsideBinarizeLocalTree(subTree_1, head, headLoc, topCat, ll, true));
                newChildren.Add(t.GetChild(len - 1));
                return(tf.NewTreeNode(t.Label(), newChildren));
            }
            if (headLoc > 0)
            {
                ll.AddLast(t.GetChild(0));
                if (ll.Count > markovOrder)
                {
                    ll.RemoveFirst();
                }
                // generate a left
                string subLabelStr;
                if (simpleLabels)
                {
                    subLabelStr = '@' + topCat;
                }
                else
                {
                    string headStr = t.GetChild(headLoc).Label().Value();
                    string leftStr = Join(ll) + (headLoc > markovOrder - 1 ? " ..." : string.Empty);
                    subLabelStr = '@' + topCat + ": " + leftStr + ' ' + headStr + " ]";
                }
                ILabel subLabel = new CategoryWordTag(subLabelStr, word, tag);
                Tree   subTree  = tf.NewTreeNode(subLabel, t.GetChildrenAsList().SubList(1, t.NumChildren()));
                newChildren.Add(t.GetChild(0));
                newChildren.Add(MarkovOutsideBinarizeLocalTree(subTree, head, headLoc - 1, topCat, ll, false));
                return(tf.NewTreeNode(t.Label(), newChildren));
            }
            return(t);
        }
Example #22
 public virtual void Train(TaggedWord tw, int loc, double weight)
 {
     throw new NotSupportedException();
 }
Example #23
        public static void Main(string[] args)
        {
            System.Console.Out.WriteLine("Testing unknown matching");
            string s = "\u5218\u00b7\u9769\u547d";

            if (s.Matches(properNameMatch))
            {
                System.Console.Out.WriteLine("hooray names!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh names!");
            }
            string s1 = "\uff13\uff10\uff10\uff10";

            if (s1.Matches(numberMatch))
            {
                System.Console.Out.WriteLine("hooray numbers!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh numbers!");
            }
            string s11 = "\u767e\u5206\u4e4b\u56db\u5341\u4e09\u70b9\u4e8c";

            if (s11.Matches(numberMatch))
            {
                System.Console.Out.WriteLine("hooray numbers!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh numbers!");
            }
            string s12 = "\u767e\u5206\u4e4b\u4e09\u5341\u516b\u70b9\u516d";

            if (s12.Matches(numberMatch))
            {
                System.Console.Out.WriteLine("hooray numbers!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh numbers!");
            }
            string s2 = "\u4e09\u6708";

            if (s2.Matches(dateMatch))
            {
                System.Console.Out.WriteLine("hooray dates!");
            }
            else
            {
                System.Console.Out.WriteLine("Uh-oh dates!");
            }
            System.Console.Out.WriteLine("Testing tagged word");
            ClassicCounter <TaggedWord> c = new ClassicCounter <TaggedWord>();
            TaggedWord tw1 = new TaggedWord("w", "t");

            c.IncrementCount(tw1);
            TaggedWord tw2 = new TaggedWord("w", "t2");

            System.Console.Out.WriteLine(c.ContainsKey(tw2));
            System.Console.Out.WriteLine(tw1.Equals(tw2));
            WordTag wt1 = ToWordTag(tw1);
            WordTag wt2 = ToWordTag(tw2);
            WordTag wt3 = new WordTag("w", "t2");

            System.Console.Out.WriteLine(wt1.Equals(wt2));
            System.Console.Out.WriteLine(wt2.Equals(wt3));
        }
Example #24
 private static WordTag ToWordTag(TaggedWord tw)
 {
     return(new WordTag(tw.Word(), tw.Tag()));
 }
Example #25
        public List <Word> TurnThisWordsIntoWordsWithTaggedWords(Word[] tail)
        {
            List <Word> mergedTail = new List <Word>();

            //jan pona kin.
            if (tail.Length > 1)
            {
                //mergedTail.Add(currentWord);
                int resumeAt = -1;
                for (int i = 0; i < tail.Length; i++)
                {
                    Word currentWord = tail[i];
                    if (resumeAt != -1)
                    {
                        if (i < resumeAt)
                        {
                            continue;
                        }
                    }

                    bool       stopLookAhead = false;
                    TaggedWord possible      = null;
                    for (int j = tail.Length - 1; j > i; j--)
                    {
                        if (stopLookAhead)
                        {
                            continue;                //PERF prob here.
                        }
                        try
                        {
#if VS2013
                            possible = new TaggedWord(currentWord, new WordList(new ArraySegment <Word>(tail, i + 1, j - i)));
#else
                            possible = new TaggedWord(currentWord, new WordList(tail.Skip(i + 1).Take(j - i)));
#endif

                            resumeAt      = j + 1;
                            stopLookAhead = true; //we found the largest possible. now stop.
                        }
                        catch (TpSyntaxException)
                        {
                            //resumeAt = j + 1;
                        }
                    }

                    //Okay, we looked everywhere.
                    if (possible == null)
                    {
                        //This isn't a suitable head for a TaggedWord.
                        mergedTail.Add(currentWord);
                    }
                    else
                    {
                        mergedTail.Add(possible);
                    }
                }
            }
            else
            {
                mergedTail.Add(tail[0]);
            }
            return(mergedTail);
        }
Example #26
        private Tree OutsideBinarizeLocalTree(Tree t, string labelStr, string finalCat, int headNum, TaggedWord head, int leftProcessed, string leftStr, int rightProcessed, string rightStr)
        {
            IList <Tree> newChildren = new List <Tree>(2);
            ILabel       label       = new CategoryWordTag(labelStr, head.Word(), head.Tag());

            // check if there are <=2 children already
            if (t.NumChildren() - leftProcessed - rightProcessed <= 2)
            {
                // done, return
                newChildren.Add(t.GetChild(leftProcessed));
                if (t.NumChildren() - leftProcessed - rightProcessed == 2)
                {
                    newChildren.Add(t.GetChild(leftProcessed + 1));
                }
                return(tf.NewTreeNode(label, newChildren));
            }
            if (headNum > leftProcessed)
            {
                // eat a left word
                Tree   leftChild    = t.GetChild(leftProcessed);
                string childLeftStr = leftStr + ' ' + leftChild.Label().Value();
                string childLabelStr;
                if (simpleLabels)
                {
                    childLabelStr = '@' + finalCat;
                }
                else
                {
                    childLabelStr = '@' + finalCat + " :" + childLeftStr + " ..." + rightStr;
                }
                Tree rightChild = OutsideBinarizeLocalTree(t, childLabelStr, finalCat, headNum, head, leftProcessed + 1, childLeftStr, rightProcessed, rightStr);
                newChildren.Add(leftChild);
                newChildren.Add(rightChild);
                return(tf.NewTreeNode(label, newChildren));
            }
            else
            {
                // eat a right word
                Tree   rightChild    = t.GetChild(t.NumChildren() - rightProcessed - 1);
                string childRightStr = ' ' + rightChild.Label().Value() + rightStr;
                string childLabelStr;
                if (simpleLabels)
                {
                    childLabelStr = '@' + finalCat;
                }
                else
                {
                    childLabelStr = '@' + finalCat + " :" + leftStr + " ..." + childRightStr;
                }
                Tree leftChild = OutsideBinarizeLocalTree(t, childLabelStr, finalCat, headNum, head, leftProcessed, leftStr, rightProcessed + 1, childRightStr);
                newChildren.Add(leftChild);
                newChildren.Add(rightChild);
                return(tf.NewTreeNode(label, newChildren));
            }
        }
Example #27
        private void LoadFile(ITaggedFileReader reader, IDictionary <string, IntCounter <string> > wordTagCounts)
        {
            log.Info("Loading tagged words from " + reader.Filename());
            List <string> words        = new List <string>();
            List <string> tags         = new List <string>();
            int           numSentences = 0;
            int           numWords     = 0;
            int           maxLen       = int.MinValue;
            int           minLen       = int.MaxValue;

            foreach (IList <TaggedWord> sentence in reader)
            {
                if (maxentTagger.wordFunction != null)
                {
                    IList <TaggedWord> newSentence = new List <TaggedWord>(sentence.Count);
                    foreach (TaggedWord word in sentence)
                    {
                        TaggedWord newWord = new TaggedWord(maxentTagger.wordFunction.Apply(word.Word()), word.Tag());
                        newSentence.Add(newWord);
                    }
                    sentence = newSentence;
                }
                foreach (TaggedWord tw in sentence)
                {
                    if (tw != null)
                    {
                        words.Add(tw.Word());
                        tags.Add(tw.Tag());
                        if (!maxentTagger.tagTokens.Contains(tw.Tag()))
                        {
                            maxentTagger.tagTokens[tw.Tag()] = Generics.NewHashSet <string>();
                        }
                        maxentTagger.tagTokens[tw.Tag()].Add(tw.Word());
                    }
                }
                maxLen = (sentence.Count > maxLen ? sentence.Count : maxLen);
                minLen = (sentence.Count < minLen ? sentence.Count : minLen);
                words.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);
                tags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
                numElements = numElements + sentence.Count + 1;
                // iterate over the words in the sentence
                for (int i = 0; i < sentence.Count + 1; i++)
                {
                    History h    = new History(totalWords + totalSentences, totalWords + totalSentences + sentence.Count, totalWords + totalSentences + i, pairs, maxentTagger.extractors);
                    string  tag  = tags[i];
                    string  word = words[i];
                    pairs.Add(new WordTag(word, tag));
                    int         y   = maxentTagger.AddTag(tag);
                    DataWordTag dat = new DataWordTag(h, y, tag);
                    v.Add(dat);
                    IntCounter <string> tagCounts = wordTagCounts[word];
                    if (tagCounts == null)
                    {
                        tagCounts           = new IntCounter <string>();
                        wordTagCounts[word] = tagCounts;
                    }
                    tagCounts.IncrementCount(tag, 1);
                }
                totalSentences++;
                totalWords += sentence.Count;
                numSentences++;
                numWords += sentence.Count;
                words.Clear();
                tags.Clear();
                if ((numSentences % 100000) == 0)
                {
                    log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
                }
            }
            log.Info("Read " + numWords + " words from " + reader.Filename() + " [done].");
            log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
        }
Example #28
        private void BgWorkerClean_DoWork(object sender, DoWorkEventArgs e)
        {
            BackgroundWorkerData BGData = (BackgroundWorkerData)e.Argument;


            //report what we're working on
            FilenameLabel.Invoke((MethodInvoker) delegate
            {
                FilenameLabel.Text = "Loading model...";
            });


            //set up our sentence boundary detection
            Regex SentenceSplitter = new Regex(@"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", RegexOptions.Compiled);

            //selects the text encoding based on user selection
            Encoding SelectedEncoding = null;

            this.Invoke((MethodInvoker) delegate()
            {
                SelectedEncoding = Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString());
            });



            //get the list of files
            var SearchDepth = SearchOption.TopDirectoryOnly;

            if (ScanSubfolderCheckbox.Checked)
            {
                SearchDepth = SearchOption.AllDirectories;
            }
            var files = Directory.EnumerateFiles(BGData.TextFileFolder, "*.txt", SearchDepth);



            try {
                var tagger = new MaxentTagger(modelsDirectory + @"/" + BGData.SelectedModel);

                int NumberOfTagsInModel = tagger.numTags();

                List <string> tags_list_header = new List <string>();
                List <string> tags_list        = new List <string>();


                for (int i = 0; i < NumberOfTagsInModel; i++)
                {
                    tags_list_header.Add("\"" + tagger.getTag(i) + "\"");
                    tags_list.Add(tagger.getTag(i));
                }

                tags_list_header.Sort();
                tags_list.Sort();

                string[] tags_array = tags_list.ToArray();



                //open up the output file
                using (StreamWriter outputFile = new StreamWriter(new FileStream(BGData.OutputFileLocation, FileMode.Create), SelectedEncoding))
                {
                    //write the header row to the output file
                    StringBuilder HeaderString = new StringBuilder();
                    HeaderString.Append("\"Filename\",\"Segment\",\"TokenCount\",\"SentenceCount\"," + string.Join(",", tags_list_header.ToArray()));

                    if (BGData.OutputTaggedText)
                    {
                        HeaderString.Append(",\"TaggedText\"");
                    }
                    if (BGData.OrderedPOSTagText)
                    {
                        HeaderString.Append(",\"OrderedPOSTags\"");
                    }

                    outputFile.WriteLine(HeaderString.ToString());


                    foreach (string fileName in files)
                    {
                        //set up our variables to report
                        string Filename_Clean = Path.GetFileName(fileName);


                        //report what we're working on
                        FilenameLabel.Invoke((MethodInvoker) delegate
                        {
                            FilenameLabel.Text = "Analyzing: " + Filename_Clean;
                        });



                        //read in the text file and trim surrounding whitespace
                        var InputText = System.IO.File.ReadAllText(fileName, SelectedEncoding).Trim();

                        var sentences = MaxentTagger.tokenizeText(new java.io.StringReader(InputText)).toArray();



                        //now that we know how many sentences we have, we can figure out the segmentation
                        double SentencesPerSegment = 1.0;
                        int    NumberOfSegments    = BGData.NumSegments;
                        if (NumberOfSegments > sentences.Length)
                        {
                            NumberOfSegments = sentences.Length;
                        }

                        if (sentences.Length > 0)
                        {
                            SentencesPerSegment = sentences.Length / (double)NumberOfSegments;
                        }


                        List <List <ArrayList> > Sentences_Segmented = new List <List <ArrayList> >();

                        int SegmentCounter = 1;
                        //int SentenceNumberTracker = 0;
                        for (int i = 0; i < sentences.Length; i++)
                        {
                            if (Sentences_Segmented.Count < SegmentCounter)
                            {
                                Sentences_Segmented.Add(new List <ArrayList>());
                            }

                            Sentences_Segmented[SegmentCounter - 1].Add((ArrayList)sentences[i]);
                            //SentenceNumberTracker++;

                            if (i + 1 >= SegmentCounter * SentencesPerSegment)
                            {
                                SegmentCounter++;
                                //SentenceNumberTracker = 0;
                            }
                        }


                        sentences = null;



                        //     _                _                 _____         _
                        //    / \   _ __   __ _| |_   _ _______  |_   _|____  _| |_
                        //   / _ \ | '_ \ / _` | | | | |_  / _ \   | |/ _ \ \/ / __|
                        //  / ___ \| | | | (_| | | |_| |/ /  __/   | |  __/>  <| |_
                        // /_/   \_\_| |_|\__,_|_|\__, /___\___|   |_|\___/_/\_\\__|
                        //                        |___/



                        for (int i = 0; i < NumberOfSegments; i++)
                        {
                            Dictionary <string, int> POSSums = new Dictionary <string, int>();
                            for (int j = 0; j < NumberOfTagsInModel; j++)
                            {
                                POSSums.Add(tags_array[j], 0);
                            }


                            StringBuilder TaggedText     = new StringBuilder();
                            StringBuilder OrderedPOSTags = new StringBuilder();

                            int TotalSentences = Sentences_Segmented[i].Count;
                            int TotalWC        = 0;


                            foreach (ArrayList sentence in Sentences_Segmented[i])
                            {
                                var taggedSentence = tagger.tagSentence(sentence);


                                Iterator it = taggedSentence.iterator();



                                while (it.hasNext())
                                {
                                    TaggedWord token = (TaggedWord)it.next();

                                    if (BGData.OutputTaggedText)
                                    {
                                        TaggedText.Append(token.toString() + " ");
                                    }
                                    if (BGData.OrderedPOSTagText)
                                    {
                                        OrderedPOSTags.Append(token.tag() + " ");
                                    }


                                    POSSums[token.tag()] += 1;
                                    TotalWC += 1;

                                    //MessageBox.Show(token.word());
                                }

                                TaggedText.Append(Environment.NewLine);
                                OrderedPOSTags.Append(Environment.NewLine);
                            }



                            // __        __    _ _          ___        _               _
                            // \ \      / / __(_) |_ ___   / _ \ _   _| |_ _ __  _   _| |_
                            //  \ \ /\ / / '__| | __/ _ \ | | | | | | | __| '_ \| | | | __|
                            //   \ V  V /| |  | | ||  __/ | |_| | |_| | |_| |_) | |_| | |_
                            //    \_/\_/ |_|  |_|\__\___|  \___/ \__,_|\__| .__/ \__,_|\__|
                            //                                            |_|



                            string[] OutputString = new string[4];
                            OutputString[0] = "\"" + Filename_Clean + "\"";
                            OutputString[1] = (i + 1).ToString();
                            OutputString[2] = TotalWC.ToString();
                            OutputString[3] = TotalSentences.ToString();

                            int include_tagged_text = 0;
                            int include_ordered_pos = 0;
                            if (BGData.OutputTaggedText)
                            {
                                include_tagged_text = 1;
                            }
                            if (BGData.OrderedPOSTagText)
                            {
                                include_ordered_pos = 1;
                            }

                            string[] TagOutputString = new string[NumberOfTagsInModel + include_tagged_text + include_ordered_pos];

                            for (int j = 0; j < NumberOfTagsInModel; j++)
                            {
                                if (BGData.NormalizeOutput && TotalWC > 0)
                                {
                                    TagOutputString[j] = RoundUp(POSSums[tags_array[j]] * 100 / (double)TotalWC, 5).ToString();
                                }
                                else
                                {
                                    TagOutputString[j] = POSSums[tags_array[j]].ToString();
                                }
                            }

                            if (BGData.OutputTaggedText)
                            {
                                TagOutputString[TagOutputString.Length - include_tagged_text - include_ordered_pos] = "\"" + TaggedText.ToString().Replace("\"", "\"\"") + "\"";
                            }
                            if (BGData.OrderedPOSTagText)
                            {
                                TagOutputString[TagOutputString.Length - include_ordered_pos] = "\"" + OrderedPOSTags.ToString().Replace("\"", "\"\"") + "\"";
                            }

                            outputFile.WriteLine(String.Join(",", MergeOutputArrays(OutputString, TagOutputString)));
                        }



                        //end of the "for each file" loop
                    }
                }
            }
            catch (OutOfMemoryException)
            {
                MessageBox.Show("One or more of your files caused an Out of Memory error. This means that you do not have enough RAM to process the current file. This is often caused by extremely complex / messy language samples with run-on sentences or other peculiar constructions, paired with a computer that does not have enough RAM to handle such processing.", "Out of Memory", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
            catch
            {
                MessageBox.Show("POSTModern encountered an issue somewhere while trying to analyze your texts. The most common cause of this is trying to open your output file while POSTModern is still running. Did any of your input files move, or is your output file being opened/modified by another application?", "Error while analyzing", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Example #29
        /// <summary>Binarizes the tree according to options set up in the constructor.</summary>
        /// <remarks>
        /// Binarizes the tree according to options set up in the constructor.
        /// Does the whole tree by calling itself recursively.
        /// </remarks>
        /// <param name="t">
        /// A tree to be binarized. The non-leaf nodes must already have
        /// CategoryWordTag labels, with heads percolated.
        /// </param>
        /// <returns>A binary tree.</returns>
        public virtual Tree TransformTree(Tree t)
        {
            // handle null
            if (t == null)
            {
                return(null);
            }
            string cat = t.Label().Value();

            // handle words
            if (t.IsLeaf())
            {
                ILabel label = new Word(cat);
                //new CategoryWordTag(cat,cat,"");
                return(tf.NewLeaf(label));
            }
            // handle tags
            if (t.IsPreTerminal())
            {
                Tree   childResult = TransformTree(t.GetChild(0));
                string word        = childResult.Value();
                // would be nicer if Word/CWT ??
                IList <Tree> newChildren = new List <Tree>(1);
                newChildren.Add(childResult);
                return(tf.NewTreeNode(new CategoryWordTag(cat, word, cat), newChildren));
            }
            // handle categories
            Tree headChild = hf.DetermineHead(t);

            /*
             * System.out.println("### finding head for:");
             * t.pennPrint();
             * System.out.println("### its head is:");
             * headChild.pennPrint();
             */
            if (headChild == null && !t.Label().Value().StartsWith(tlp.StartSymbol()))
            {
                log.Info("### No head found for:");
                t.PennPrint();
            }
            int headNum = -1;

            Tree[]       kids          = t.Children();
            IList <Tree> newChildren_1 = new List <Tree>(kids.Length);

            for (int childNum = 0; childNum < kids.Length; childNum++)
            {
                Tree child       = kids[childNum];
                Tree childResult = TransformTree(child);
                // recursive call
                if (child == headChild)
                {
                    headNum = childNum;
                }
                newChildren_1.Add(childResult);
            }
            Tree result;

            // XXXXX UPTO HERE!!!  ALMOST DONE!!!
            if (t.Label().Value().StartsWith(tlp.StartSymbol()))
            {
                // handle the ROOT tree properly

                /*
                 * //CategoryWordTag label = (CategoryWordTag) t.label();
                 * // binarize without the last kid and then add it back to the top tree
                 * Tree lastKid = (Tree)newChildren.remove(newChildren.size()-1);
                 * Tree tempTree = tf.newTreeNode(label, newChildren);
                 * tempTree = binarizeLocalTree(tempTree, headNum, result.head);
                 * newChildren = tempTree.getChildrenAsList();
                 * newChildren.add(lastKid); // add it back
                 */
                result = tf.NewTreeNode(t.Label(), newChildren_1);
            }
            else
            {
                // label shouldn't have changed
                //      CategoryWordTag headLabel = (CategoryWordTag) headChild.label();
                string word  = ((IHasWord)headChild.Label()).Word();
                string tag   = ((IHasTag)headChild.Label()).Tag();
                ILabel label = new CategoryWordTag(cat, word, tag);
                result = tf.NewTreeNode(label, newChildren_1);
                // cdm Mar 2005: invent a head so I don't have to rewrite all this
                // code, but with the removal of TreeHeadPair, some of the rest of
                // this should probably be rewritten too to not use this head variable
                TaggedWord head = new TaggedWord(word, tag);
                result = BinarizeLocalTree(result, headNum, head);
            }
            return(result);
        }
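A minimal usage sketch for TransformTree, written as if it were another member of the same binarizer class so that no constructor details have to be assumed; the input tree is expected to come from elsewhere, with CategoryWordTag labels and percolated heads as the remarks above require:

        // Hypothetical companion method: binarize a prepared parse tree and
        // print the result for inspection. Only members visible above
        // (TransformTree, PennPrint) are used.
        public virtual Tree BinarizeAndPrint(Tree parse)
        {
            Tree binary = TransformTree(parse);
            if (binary != null)
            {
                // Penn-style print of the binarized tree
                binary.PennPrint();
            }
            return binary;
        }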
Exemple #30
0
        public abstract void Train(TaggedWord tw, int loc, double weight);
		/// <summary>
		/// Gets formatted chunk information for a specified sentence.
		/// </summary>
		/// <param name="tokens">
		/// string array of tokens in the sentence
		/// </param>
		/// <param name="tags">
		/// string array of POS tags for the tokens in the sentence
		/// </param>
		/// <returns>
		/// A List of SentenceChunk objects containing the chunked sentence
		/// </returns>
		public List<SentenceChunk> GetChunks(string[] tokens, string[] tags)
		{
		    var results = new List<SentenceChunk>();

			string[] chunks = Chunk(tokens, tags);
            SentenceChunk currentSentenceChunk = null;
			for (int currentChunk = 0, chunkCount = chunks.Length; currentChunk < chunkCount; currentChunk++)
			{
				if (chunks[currentChunk].StartsWith("B-") || chunks[currentChunk] == "O")
                {
                    if (currentSentenceChunk != null)
	                {
		                results.Add(currentSentenceChunk); 
	                }

                    var index = results.Count;
                    if (chunks[currentChunk].Length > 2)
                    {
                        var tag = chunks[currentChunk].Substring(2);
                        currentSentenceChunk = new SentenceChunk(tag, index);
                    }
                    else
                    {
                        currentSentenceChunk = new SentenceChunk(index);
                    }
				}

                // in all cases add the tagged word to the current chunk
                // (well-formed chunker output starts with "B-" or "O", so
                // currentSentenceChunk has been created by this point)
			    var word = tokens[currentChunk];
			    var wTag = tags[currentChunk];
			    var wIndex = currentSentenceChunk.TaggedWords.Count;
			    var taggedWord = new TaggedWord(word, wTag, wIndex);
                currentSentenceChunk.TaggedWords.Add(taggedWord);
			}
            // add the last chunk; guard against empty input, where no chunk was started
            if (currentSentenceChunk != null)
            {
                results.Add(currentSentenceChunk);
            }

		    return results;
		}
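A minimal usage sketch for GetChunks, again written as an extra method on the same class so the chunker's construction does not have to be shown; the token and tag arrays are hypothetical sample input and must be aligned one-to-one, and the containing file's usual usings (System, System.Collections.Generic) are assumed:

        // Hypothetical companion method: chunk a short pre-tagged sentence and
        // report how many tagged words landed in each chunk.
        public void PrintChunkSizes()
        {
            string[] tokens = { "The", "dog", "barked", "." };
            string[] tags   = { "DT",  "NN",  "VBD",    "." };

            List<SentenceChunk> chunks = GetChunks(tokens, tags);
            for (int i = 0; i < chunks.Count; i++)
            {
                // Each SentenceChunk groups the TaggedWord objects of one chunk,
                // e.g. "The dog" as one chunk and "barked" as another.
                Console.WriteLine("Chunk " + i + ": " + chunks[i].TaggedWords.Count + " word(s)");
            }
        }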
        /// <summary>Parse a sentence represented as a List of tokens.</summary>
        /// <remarks>
        /// Parse a sentence represented as a List of tokens.
        /// The text must already have been tokenized and
        /// normalized into tokens that are appropriate to the treebank
        /// which was used to train the parser.  The tokens can be of
        /// multiple types, and the list items need not be homogeneous as to type
        /// (in particular, only some words might be given tags):
        /// <ul>
        /// <li>If a token implements HasWord, then the word to be parsed is
        /// given by its word() value.</li>
        /// <li>If a token implements HasTag and the tag() value is not
        /// null or the empty String, then the parser is strongly advised to assign
        /// a part of speech tag that <i>begins</i> with this String.</li>
        /// </ul>
        /// </remarks>
        /// <param name="sentence">The sentence to parse</param>
        /// <returns>true iff the sentence was accepted by the grammar</returns>
        /// <exception cref="System.NotSupportedException">
        /// If the Sentence is too long or
        /// of zero length or the parse
        /// otherwise fails for resource reasons
        /// </exception>
        private bool ParseInternal <_T0>(IList <_T0> sentence)
            where _T0 : IHasWord
        {
            parseSucceeded   = false;
            parseNoMemory    = false;
            parseUnparsable  = false;
            parseSkipped     = false;
            parseFallback    = false;
            whatFailed       = null;
            addedPunct       = false;
            originalSentence = sentence;
            int length = sentence.Count;

            if (length == 0)
            {
                parseSkipped = true;
                throw new NotSupportedException("Can't parse a zero-length sentence!");
            }
            IList <IHasWord> sentenceB;

            if (op.wordFunction != null)
            {
                sentenceB = Generics.NewArrayList();
                foreach (IHasWord word in originalSentence)
                {
                    if (word is ILabel)
                    {
                        ILabel label    = (ILabel)word;
                        ILabel newLabel = label.LabelFactory().NewLabel(label);
                        if (newLabel is IHasWord)
                        {
                            sentenceB.Add((IHasWord)newLabel);
                        }
                        else
                        {
                            throw new AssertionError("This should have been a HasWord");
                        }
                    }
                    else
                    {
                        if (word is IHasTag)
                        {
                            TaggedWord tw = new TaggedWord(word.Word(), ((IHasTag)word).Tag());
                            sentenceB.Add(tw);
                        }
                        else
                        {
                            sentenceB.Add(new Word(word.Word()));
                        }
                    }
                }
                foreach (IHasWord word_1 in sentenceB)
                {
                    word_1.SetWord(op.wordFunction.Apply(word_1.Word()));
                }
            }
            else
            {
                sentenceB = new List <IHasWord>(sentence);
            }
            if (op.testOptions.addMissingFinalPunctuation)
            {
                addedPunct = AddSentenceFinalPunctIfNeeded(sentenceB, length);
            }
            if (length > op.testOptions.maxLength)
            {
                parseSkipped = true;
                throw new NotSupportedException("Sentence too long: length " + length);
            }
            TreePrint   treePrint = GetTreePrint();
            PrintWriter pwOut     = op.tlpParams.Pw();

            //Insert the boundary symbol
            if (sentence[0] is CoreLabel)
            {
                CoreLabel boundary = new CoreLabel();
                boundary.SetWord(LexiconConstants.Boundary);
                boundary.SetValue(LexiconConstants.Boundary);
                boundary.SetTag(LexiconConstants.BoundaryTag);
                boundary.SetIndex(sentence.Count + 1);
                //1-based indexing used in the parser
                sentenceB.Add(boundary);
            }
            else
            {
                sentenceB.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doPCFG)
            {
                if (!pparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                if (op.testOptions.verbose)
                {
                    pwOut.Println("PParser output");
                    // getBestPCFGParse(false).pennPrint(pwOut); // with scores on nodes
                    treePrint.PrintTree(GetBestPCFGParse(false), pwOut);
                }
            }
            // without scores on nodes
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doDep && !op.testOptions.useFastFactored)
            {
                if (!dparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                // cdm nov 2006: should move these printing bits to the main printing section,
                // so don't calculate the best parse twice!
                if (op.testOptions.verbose)
                {
                    pwOut.Println("DParser output");
                    treePrint.PrintTree(dparser.GetBestParse(), pwOut);
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doPCFG && op.doDep)
            {
                if (!bparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                else
                {
                    parseSucceeded = true;
                }
            }
            return(true);
        }
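A minimal sketch of the kind of token list ParseInternal accepts, written as a hypothetical public wrapper on the same class (in the library a public Parse method plays this role); the sentence is sample input, and the pre-tagged token illustrates how a non-empty tag strongly advises the parser, as described in the remarks above:

        // Hypothetical wrapper: build a mixed token list and hand it to
        // ParseInternal, which appends the boundary token and runs the
        // PCFG/dependency parsers as shown above.
        public bool ParseSample()
        {
            IList<IHasWord> sentence = new List<IHasWord>
            {
                new Word("The"),
                new Word("dog"),
                // non-empty tag: the parser is strongly advised toward a tag
                // that begins with "VBZ"
                new TaggedWord("barks", "VBZ"),
                new Word(".")
            };
            return ParseInternal(sentence);
        }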