C# (CSharp) CoreLabel.Tag Examples

Programming Language: C# (CSharp)

Class/Type: CoreLabel

Method/Function: Tag

Examples at hotexamples.com: 12

C# (CSharp) CoreLabel.Tag - 12 examples found. These are the top-rated real-world C# (CSharp) examples of CoreLabel.Tag, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Set(30)

Word(30)

Get(30)

SetWord(27)

SetValue(27)

Lemma(15)

SetTag(13)

Tag(12)

ContainsKey(11)

GetString(10)

Index(9)

SetIndex(9)

Value(9)

Factory(9)

Remove(8)

SetNER(7)

BeginPosition(6)

SetLemma(6)

SetOriginalText(5)

ToString(4)

get(4)

SetBeginPosition(4)

OriginalText(4)

SetEndPosition(4)

KeySet(3)

ToShorterString(3)

value(2)

Ner(2)

IsNewline(2)

EndPosition(2)

toString(1)

set(1)

lemma(1)

index(1)

SetCategory(1)

endPosition(1)

beginPosition(1)

GetHashCode(1)

Size(1)

LabelFactory(1)

Category(1)

word(1)

Example #1

Show file

        /// <summary>
        /// Fills in missing tags on the sentence's tokens, and on the tree leaves at
        /// the same positions, using the tagged yield of <paramref name="tree"/>.
        /// Tokens whose <c>Tag()</c> is already non-null are left untouched.
        /// </summary>
        /// <param name="sentence">Sentence whose TokensAnnotation list is read; its tokens are updated in place.</param>
        /// <param name="tree">Tree whose tagged yield and leaf yield are indexed by token position.</param>
        private static void SetMissingTags(ICoreMap sentence, Tree tree)
        {
            // Both yields are computed lazily, only once an untagged token is found.
            IList <TaggedWord> taggedWords = null;
            IList <ILabel>     leaves      = null;
            IList <CoreLabel>  tokens      = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

            // The loop bound used to be an undeclared identifier; bind it to the token count.
            for (int i = 0, size = tokens.Count; i < size; ++i)
            {
                CoreLabel token = tokens[i];
                if (token.Tag() == null)
                {
                    if (taggedWords == null)
                    {
                        taggedWords = tree.TaggedYield();
                    }
                    if (leaves == null)
                    {
                        leaves = tree.Yield();
                    }
                    token.SetTag(taggedWords[i].Tag());
                    // Mirror the tag onto the leaf when the leaf type supports tags.
                    ILabel leaf = leaves[i];
                    if (leaf is IHasTag)
                    {
                        ((IHasTag)leaf).SetTag(taggedWords[i].Tag());
                    }
                }
            }
        }

Example #2

Show file

File: LearnImportantFeatures.cs Project: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Builds a labeled feature datum for the token at position <paramref name="i"/>.
        /// The label is <c>answerLabel</c> when the token's <c>answerClass</c> value
        /// stringifies to it, and "O" otherwise. Features are "Cluster-N" counts for
        /// each matched phrase, plus PREV-/NEXT- word, lemma and tag counts inside the
        /// context window (the window is currently 0, so no context features are added).
        /// </summary>
        /// <param name="sent">Tokens of the sentence.</param>
        /// <param name="i">Index into <paramref name="sent"/> of the token to featurize.</param>
        /// <returns>A datum pairing the feature counter with the chosen label.</returns>
        private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
        {
            ICounter <string> feat = new ClassicCounter <string>();
            CoreLabel         l    = sent[i];
            string            label;

            if (l.Get(answerClass).ToString().Equals(answerLabel))
            {
                label = answerLabel;
            }
            else
            {
                label = "O";
            }
            CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));

            if (matchedPhrases == null)
            {
                // No recorded matches: fall back to the token's own word under its label.
                matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
                matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
            }
            foreach (CandidatePhrase w in matchedPhrases.AllValues())
            {
                // An int can never equal null, and the indexer throws on a missing key,
                // so look the phrase up explicitly and use -1 for "no cluster".
                // NOTE(review): assumes clusterIds exposes TryGetValue (dictionary-like) - confirm.
                int num;
                if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
                {
                    num = -1;
                }
                feat.SetCount("Cluster-" + num, 1.0);
            }
            // feat.incrementCount("WORD-" + l.word());
            // feat.incrementCount("LEMMA-" + l.lemma());
            // feat.incrementCount("TAG-" + l.tag());
            int window = 0;

            for (int j = Math.Max(0, i - window); j < i; j++)
            {
                CoreLabel lj = sent[j];
                feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
                feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
            }
            for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
            {
                CoreLabel lj = sent[j_1];
                feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
                feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
            }
            // System.out.println("adding " + l.word() + " as " + label);
            return(new RVFDatum <string, string>(feat, label));
        }

Example #3

Show file

File: ApplyDepPatterns.cs Project: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Decides whether a token passes the NER and tag-prefix restrictions
        /// configured for <paramref name="label"/>.
        /// </summary>
        /// <param name="coreLabel">Token being checked.</param>
        /// <param name="label">Label whose restriction tables in <c>constVars</c> are consulted.</param>
        /// <returns>True when the token passes both stages; false otherwise.</returns>
        private bool MatchedRestriction(CoreLabel coreLabel, string label)
        {
            // Stage 1: with the NER restriction switched off every token passes;
            // otherwise one of the allowed NER patterns for the label must match.
            bool accepted = !PatternFactory.useTargetNERRestriction;

            if (!accepted)
            {
                foreach (string nerPattern in constVars.allowedNERsforLabels[label])
                {
                    if (coreLabel.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Matches(nerPattern))
                    {
                        accepted = true;
                        break;
                    }
                }
            }

            // Stage 2: when tag prefixes are configured for the label, the token's
            // tag must start with one of them.
            if (accepted)
            {
                string posTag            = coreLabel.Tag();
                bool   hasTagRestriction = constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.Contains(label);
                if (hasTagRestriction)
                {
                    foreach (string prefix in constVars.allowedTagsInitials[label])
                    {
                        // An empty prefix list never enters the loop and leaves the token accepted.
                        accepted = posTag.StartsWith(prefix);
                        if (accepted)
                        {
                            break;
                        }
                    }
                }
            }

            if (constVars.debug >= 4)
            {
                // Both outcomes print the same restriction details; only the verdict text differs.
                string verdict = accepted ? " matched restriction " : " did not matched restrict ";
                System.Console.Out.WriteLine(coreLabel.Word() + verdict + (PatternFactory.useTargetNERRestriction ? constVars.allowedNERsforLabels[label] : string.Empty) + "and" + PatternFactory.useTargetNERRestriction + " and " + (constVars.allowedTagsInitials != null ? constVars.allowedTagsInitials[label] : string.Empty));
            }
            return(accepted);
        }

Example #4

Show file

File: SupervisedSieveTraining.cs Project: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Builds a feature dataset with one datum per (quote, candidate mention) pair.
        /// Candidate mentions are collected from the span before the quote (reaching back
        /// into the previous paragraph) and the span after it. Each datum is labeled
        /// "isMention" / "isNotMention" against the gold data when training, or "none" otherwise.
        /// Also assigns <c>sieve</c>, which is declared outside this method.
        /// </summary>
        /// <param name="sd">Sieve data supplying the document, character map, pronoun coref map and animacy list.</param>
        /// <param name="goldList">Gold quote info, indexed in parallel with the document's quotes; only read when <paramref name="isTraining"/> is true (may be null otherwise).</param>
        /// <param name="isTraining">Whether to label datums from <paramref name="goldList"/>.</param>
        /// <returns>
        /// The dataset together with a map from quote index to its (first, last) datum index range
        /// and a map from datum index to the mention it was built from.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when training and the gold list size differs from the quote count, or when a
        /// quote cannot be found among the quotes of its own paragraph.
        /// </exception>
        public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList <XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
        {
            Annotation doc = sd.doc;

            sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
            IList <ICoreMap>  quotes    = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <ICoreMap>  sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IDictionary <int, IList <ICoreMap> > paragraphToQuotes = GetQuotesInParagraph(doc);
            GeneralDataset <string, string>      dataset           = new RVFDataset <string, string>();
            //necessary for 'ScoreBestMention'
            IDictionary <int, Pair <int, int> > mapQuoteToDataRange = new Dictionary <int, Pair <int, int> >();
            //maps quote to corresponding indices in the dataset
            IDictionary <int, Sieve.MentionData> mapDatumToMention = new Dictionary <int, Sieve.MentionData>();

            if (isTraining && goldList.Count != quotes.Count)
            {
                throw new Exception("Gold Quote List size doesn't match quote list size!");
            }
            for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
            {
                int      initialSize = dataset.Size();
                ICoreMap quote       = quotes[quoteIdx];
                XMLToAnnotation.GoldQuoteInfo gold = null;
                if (isTraining)
                {
                    gold = goldList[quoteIdx];
                    // Quotes without a gold speaker contribute no training data.
                    if (gold.speaker == string.Empty)
                    {
                        continue;
                    }
                }
                ICoreMap        quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
                Pair <int, int> quoteRun           = new Pair <int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                //      int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class);
                int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));
                //add mentions before quote up to the previous paragraph
                int rightValue = quoteRun.first - 1;
                int leftValue  = quoteRun.first - 1;
                //move left value to be the first token idx of the previous paragraph
                for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
                {
                    ICoreMap sentence = sentences[sentIdx];
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        continue;
                    }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                    {
                        //quoteParagraphIdx - 1 for this and prev
                        leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInPreviousParagraph = new List <Sieve.MentionData>();
                if (leftValue > -1 && rightValue > -1)
                {
                    mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair <int, int>(leftValue, rightValue)));
                }
                //mentions in next paragraph
                leftValue  = quoteRun.second + 1;
                rightValue = quoteRun.second + 1;
                for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
                {
                    ICoreMap sentence = sentences[sentIdx_1];
                    //        if(sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
                    //          continue;
                    //        }
                    // NOTE(review): this extends the span only within the quote's own paragraph,
                    // although the original comment mentions quoteParagraphIdx + 1 - confirm intent.
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        //quoteParagraphIdx + 1
                        rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInNextParagraph = new List <Sieve.MentionData>();
                if (leftValue < tokens.Count && rightValue < tokens.Count)
                {
                    mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair <int, int>(leftValue, rightValue));
                }
                IList <Sieve.MentionData> candidateMentions = new List <Sieve.MentionData>();
                Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
                Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);
                //      System.out.println(candidateMentions.size());
                int rankedDistance = 1;
                int numBackwards   = mentionsInPreviousParagraph.Count;
                foreach (Sieve.MentionData mention in candidateMentions)
                {
                    IList <CoreLabel> mentionCandidateTokens   = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
                    ICoreMap          mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
                    //        if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) {
                    //          continue;
                    //        }
                    ICounter <string> features = new ClassicCounter <string>();
                    // 1. distance features: the mention is "left" when it ends before the quote starts.
                    bool isLeft   = true;
                    int  distance = quoteRun.first - mention.end;
                    if (distance < 0)
                    {
                        isLeft   = false;
                        distance = mention.begin - quoteRun.second;
                    }
                    if (distance < 0)
                    {
                        continue;
                    }
                    //disregard mention-in-quote cases.
                    features.SetCount("wordDistance", distance);
                    IList <CoreLabel> betweenTokens;
                    if (isLeft)
                    {
                        betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
                    }
                    else
                    {
                        betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
                    }
                    //Punctuation in between
                    foreach (CoreLabel token in betweenTokens)
                    {
                        if (punctuation.Contains(token.Word()))
                        {
                            features.SetCount("punctuationPresence:" + token.Word(), 1);
                        }
                    }
                    // number of mentions away
                    features.SetCount("rankedDistance", rankedDistance);
                    rankedDistance++;
                    if (rankedDistance == numBackwards)
                    {
                        //reset for the forward
                        rankedDistance = 1;
                    }
                    //        int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
                    //third distance: # of paragraphs away
                    int      mentionParagraphIdx        = -1;
                    ICoreMap sentenceInMentionParagraph = null;
                    int      quoteParagraphBeginToken   = GetParagraphBeginToken(quoteFirstSentence, sentences);
                    int      quoteParagraphEndToken     = GetParagraphEndToken(quoteFirstSentence, sentences);
                    if (isLeft)
                    {
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("leftParagraphDistance", 0);
                            mentionParagraphIdx        = quoteParagraphIdx;
                            sentenceInMentionParagraph = quoteFirstSentence;
                        }
                        else
                        {
                            // Walk backwards one paragraph at a time until one contains the mention.
                            int      paragraphDistance = 1;
                            int      currParagraphIdx  = quoteParagraphIdx - paragraphDistance;
                            ICoreMap currSentence      = quoteFirstSentence;
                            int      currSentenceIdx   = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currParagraphIdx >= 0)
                            {
                                //              Paragraph prevParagraph = paragraphs.get(prevParagraphIndex);
                                //extract begin and end tokens of
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                                {
                                    currSentenceIdx--;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                                {
                                    mentionParagraphIdx        = currParagraphIdx;
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("leftParagraphDistance", paragraphDistance);
                                    if (paragraphDistance % 2 == 0)
                                    {
                                        features.SetCount("leftParagraphDistanceEven", 1);
                                    }
                                    break;
                                }
                                paragraphDistance++;
                                currParagraphIdx--;
                            }
                        }
                    }
                    else
                    {
                        //right
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("rightParagraphDistance", 0);
                            sentenceInMentionParagraph = quoteFirstSentence;
                            mentionParagraphIdx        = quoteParagraphIdx;
                        }
                        else
                        {
                            // Walk forwards one paragraph at a time until one contains the mention.
                            // Note that mentionParagraphIdx is not updated on this path.
                            int      paragraphDistance  = 1;
                            int      nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                            ICoreMap currSentence       = quoteFirstSentence;
                            int      currSentenceIdx    = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currSentenceIdx < sentences.Count)
                            {
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                                {
                                    currSentenceIdx++;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int nextParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                                {
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("rightParagraphDistance", paragraphDistance);
                                    break;
                                }
                                paragraphDistance++;
                                nextParagraphIndex++;
                            }
                        }
                    }
                    //2. mention features
                    if (sentenceInMentionParagraph != null)
                    {
                        int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                        int mentionParagraphEnd   = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                        // Only computed when the mention lives in a different paragraph than the quote.
                        if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                        {
                            IList <ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List <ICoreMap>());
                            Pair <List <string>, List <Pair <int, int> > > namesInMentionParagraph = sieve.ScanForNames(new Pair <int, int>(mentionParagraphBegin, mentionParagraphEnd));
                            features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                            features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                            features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                            //mention ordering in paragraph it is in
                            for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                                {
                                    features.SetCount("orderInParagraph", i);
                                }
                            }
                            //if mention paragraph is all one quote
                            if (quotesInMentionParagraph.Count == 1)
                            {
                                ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                                if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                                {
                                    features.SetCount("mentionParagraphIsInConversation", 1);
                                }
                                else
                                {
                                    features.SetCount("mentionParagraphIsInConversation", -1);
                                }
                            }
                            foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair <int, int>(mention.begin, mention.end)))
                                {
                                    features.SetCount("mentionInQuote", 1);
                                }
                            }
                            if (features.GetCount("mentionInQuote") != 1)
                            {
                                features.SetCount("mentionNotInQuote", 1);
                            }
                        }
                    }
                    // nearby word syntax types...make sure to check if there are previous or next words
                    // or there will be an array index crash
                    if (mention.begin > 0)
                    {
                        CoreLabel prevWord = tokens[mention.begin - 1];
                        features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(prevWord.Lemma()))
                        {
                            features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                        }
                    }
                    if (mention.end + 1 < tokens.Count)
                    {
                        CoreLabel nextWord = tokens[mention.end + 1];
                        features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(nextWord.Lemma()))
                        {
                            features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                        }
                    }
                    //                    features.setCount("prevAndNext:" + prevWord.tag()+ ";" + nextWord.tag(), 1);
                    //quote paragraph features
                    IList <ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
                    features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
                    features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
                    features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair <int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);
                    //quote features
                    features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
                    for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
                    {
                        if (quotesInQuoteParagraph[i_1].Equals(quote))
                        {
                            // 1-based so that a count of 0 below means "quote not found".
                            features.SetCount("quotePosition", i_1 + 1);
                        }
                    }
                    if (features.GetCount("quotePosition") == 0)
                    {
                        throw new Exception("Check this (equality not working)");
                    }
                    Pair <List <string>, List <Pair <int, int> > > namesData = sieve.ScanForNames(quoteRun);
                    foreach (string name in namesData.first)
                    {
                        features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
                    }
                    //if quote encompasses entire paragraph
                    if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
                    {
                        features.SetCount("isImplicitSpeaker", 1);
                    }
                    else
                    {
                        features.SetCount("isImplicitSpeaker", -1);
                    }
                    //Vocative detection
                    if (mention.type.Equals("name"))
                    {
                        // Resolve the mention to a Person: first by its exact token string,
                        // then by the first name found inside the mention span.
                        IList <Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair <int, int>(mention.begin, mention.end))];
                        Person         p     = null;
                        if (pList != null)
                        {
                            p = pList[0];
                        }
                        else
                        {
                            Pair <List <string>, List <Pair <int, int> > > scanForNamesResultPair = sieve.ScanForNames(new Pair <int, int>(mention.begin, mention.end));
                            if (scanForNamesResultPair.first.Count != 0)
                            {
                                string scanForNamesResultString = scanForNamesResultPair.first[0];
                                if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                                {
                                    p = sd.characterMap[scanForNamesResultString][0];
                                }
                            }
                        }
                        if (p != null)
                        {
                            foreach (string name_1 in namesData.first)
                            {
                                if (p.aliases.Contains(name_1))
                                {
                                    features.SetCount("nameInQuote", 1);
                                }
                            }
                            if (quoteParagraphIdx > 0)
                            {
                                //            Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1);
                                IList <ICoreMap>         quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List <ICoreMap>());
                                IList <Pair <int, int> > exclusionList         = new List <Pair <int, int> >();
                                foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                                {
                                    Pair <int, int> quoteRange = new Pair <int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                                    exclusionList.Add(quoteRange);
                                    foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphQuote", 1);
                                        }
                                    }
                                }
                                int      sentenceIdx             = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                                ICoreMap sentenceInPrevParagraph = null;
                                // Scan backwards for any sentence of the previous paragraph.
                                // (The loop previously declared one variable but tested and
                                // decremented another that is out of scope here.)
                                for (int prevSentIdx = sentenceIdx - 1; prevSentIdx >= 0; prevSentIdx--)
                                {
                                    ICoreMap currSentence = sentences[prevSentIdx];
                                    if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                                    {
                                        sentenceInPrevParagraph = currSentence;
                                        break;
                                    }
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                                IList <Pair <int, int> > prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair <int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                                foreach (Pair <int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                                {
                                    foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphNonQuote", 1);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    // Every branch below keys mapDatumToMention by the datum's ID, i.e. the
                    // dataset size *before* the datum is added, so the map entry must be
                    // written before dataset.Add.
                    if (isTraining)
                    {
                        if (QuoteAttributionUtils.RangeContains(new Pair <int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair <int, int>(mention.begin, mention.end)))
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isMention");
                            datum.SetID(dataset.Size().ToString());
                            mapDatumToMention[dataset.Size()] = mention;
                            dataset.Add(datum);
                        }
                        else
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isNotMention");
                            datum.SetID(dataset.Size().ToString());
                            mapDatumToMention[dataset.Size()] = mention;
                            dataset.Add(datum);
                        }
                    }
                    else
                    {
                        RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "none");
                        datum.SetID(dataset.Size().ToString());
                        mapDatumToMention[dataset.Size()] = mention;
                        dataset.Add(datum);
                    }
                }
                // Inclusive range of datum indices produced for this quote.
                mapQuoteToDataRange[quoteIdx] = new Pair <int, int>(initialSize, dataset.Size() - 1);
            }
            return(new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset));
        }

Example #5

Show file

        /// <summary>
        /// Reads a tree file and a Morfette analysis file in lockstep and writes every tree to
        /// standard output after rewriting each leaf as
        /// word + MorphoMark + lemma + LemmaMark + tag.
        /// </summary>
        /// <param name="args">Exactly two entries: the tree file path and the Morfette/TnT file path.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile     = args[0];
            string             morfetteFile = args[1];
            ITreeReaderFactory trf          = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                try
                {
                    IEnumerator <IList <CoreLabel> > morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
                    // Advance both inputs together; stop as soon as either one is exhausted.
                    for (Tree tree; (tree = tr.ReadTree()) != null && morfetteItr.MoveNext();)
                    {
                        IList <CoreLabel> analysis = morfetteItr.Current;
                        IList <ILabel>    yield    = tree.Yield();
                        System.Diagnostics.Debug.Assert(analysis.Count == yield.Count);
                        int yieldLen = yield.Count;
                        for (int i = 0; i < yieldLen; ++i)
                        {
                            CoreLabel tokenAnalysis = analysis[i];
                            ILabel    token         = yield[i];
                            string    lemma         = GetLemma(token.Value(), tokenAnalysis.Lemma());
                            // Plain concatenation: .NET string.Format does not understand Java-style
                            // "%s" specifiers and would return the format text unchanged.
                            // NOTE(review): a null tag now contributes an empty string rather than "null".
                            string newLeaf = token.Value() + MorphoFeatureSpecification.MorphoMark + lemma + MorphoFeatureSpecification.LemmaMark + tokenAnalysis.Tag();
                            ((CoreLabel)token).SetValue(newLeaf);
                        }
                        System.Console.Out.WriteLine(tree.ToString());
                    }
                    // One of the inputs still has data left, so they were not the same length.
                    if (tr.ReadTree() != null || morfetteItr.MoveNext())
                    {
                        log.Info("WARNING: Uneven input files!");
                    }
                }
                finally
                {
                    // Release the reader even when reading or munging fails part-way through.
                    tr.Close();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }

Example #6

Show file

        /// <summary>
        /// Attributes not-yet-attributed quotes to a name or pronoun using trigram patterns made of
        /// the quote (Q), a verb-tagged token (V) and a name (C) or pronoun (P) directly next to it.
        /// </summary>
        /// <remarks>
        /// Patterns are tried in this order and the first match wins: CVQ, VCQ, PVQ, VPQ in the
        /// token range preceding the quote, then QVC, QCV, QVP, QPV in the range following it.
        /// A token counts as a verb when its tag is set and starts with "V"; positions outside the
        /// document or tokens without a tag simply do not match.
        /// </remarks>
        /// <param name="doc">Annotation providing the token list and the quotation list.</param>
        public virtual void TrigramPatterns(Annotation doc)
        {
            IList <CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <ICoreMap>  docQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            // Bounds- and null-safe "is the token at this position verb-tagged?" check shared by all patterns.
            System.Func <int, bool> isVerbAt = position =>
            {
                if (position < 0 || position >= docTokens.Count)
                {
                    return false;
                }
                string posTag = docTokens[position].Tag();
                return posTag != null && posTag.StartsWith("V");
            };

            foreach (ICoreMap quote in docQuotes)
            {
                // Skip quotes that already carry a mention.
                if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
                {
                    continue;
                }
                int             quoteBeginTokenIndex = quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int             quoteEndTokenIndex   = quote.Get(typeof(CoreAnnotations.TokenEndAnnotation));
                Pair <int, int> precedingTokenRange  = QuoteAttributionUtils.GetTokenRangePrecedingQuote(doc, quote);
                //get tokens before and after
                if (precedingTokenRange != null)
                {
                    Pair <List <string>, List <Pair <int, int> > > namesAndNameIndices = ScanForNames(precedingTokenRange);
                    List <string>           names       = namesAndNameIndices.first;
                    List <Pair <int, int> > nameIndices = namesAndNameIndices.second;
                    if (names.Count > 0)
                    {
                        // Step over one punctuation token sitting directly before the quote.
                        int offset = 0;
                        if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word()))
                        {
                            offset = 1;
                        }
                        Pair <int, int> lastNameIndex = nameIndices[nameIndices.Count - 1];
                        //CVQ
                        if (isVerbAt(quoteBeginTokenIndex - 1 - offset) && lastNameIndex.second.Equals(quoteBeginTokenIndex - 2 - offset))
                        {
                            // verb!
                            FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram CVQ", Name);
                            continue;
                        }
                        //VCQ
                        if (lastNameIndex.second.Equals(quoteBeginTokenIndex - 1 - offset) && isVerbAt(lastNameIndex.first - 1))
                        {
                            FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram VCQ", Name);
                            continue;
                        }
                    }
                    List <int> pronounsIndices = ScanForPronouns(precedingTokenRange);
                    if (pronounsIndices.Count > 0)
                    {
                        int offset = 0;
                        if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word()))
                        {
                            offset = 1;
                        }
                        int lastPronounIndex = pronounsIndices[pronounsIndices.Count - 1];
                        //PVQ
                        if (isVerbAt(quoteBeginTokenIndex - 1 - offset) && lastPronounIndex == quoteBeginTokenIndex - 2 - offset)
                        {
                            // verb!
                            FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram PVQ", Pronoun);
                            continue;
                        }
                        //VPQ
                        if (lastPronounIndex == quoteBeginTokenIndex - 1 - offset && isVerbAt(quoteBeginTokenIndex - 2 - offset))
                        {
                            FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram VPQ", Pronoun);
                            continue;
                        }
                    }
                }
                Pair <int, int> followingTokenRange = QuoteAttributionUtils.GetTokenRangeFollowingQuote(doc, quote);
                if (followingTokenRange != null)
                {
                    Pair <List <string>, List <Pair <int, int> > > namesAndNameIndices = ScanForNames(followingTokenRange);
                    List <string>           names       = namesAndNameIndices.first;
                    List <Pair <int, int> > nameIndices = namesAndNameIndices.second;
                    if (names.Count > 0)
                    {
                        Pair <int, int> firstNameIndex = nameIndices[0];
                        //QVC
                        if (isVerbAt(quoteEndTokenIndex + 1) && firstNameIndex.first.Equals(quoteEndTokenIndex + 2))
                        {
                            // verb!
                            FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QVC", Name);
                            continue;
                        }
                        //QCV
                        if (firstNameIndex.first.Equals(quoteEndTokenIndex + 1) && isVerbAt(firstNameIndex.second + 1))
                        {
                            FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QCV", Name);
                            continue;
                        }
                    }
                    List <int> pronounsIndices = ScanForPronouns(followingTokenRange);
                    if (pronounsIndices.Count > 0)
                    {
                        int firstPronounIndex = pronounsIndices[0];
                        //QVP
                        if (isVerbAt(quoteEndTokenIndex + 1) && firstPronounIndex == quoteEndTokenIndex + 2)
                        {
                            // verb!
                            FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QVP", Pronoun);
                            continue;
                        }
                        //QPV
                        if (firstPronounIndex == quoteEndTokenIndex + 1 && isVerbAt(quoteEndTokenIndex + 2))
                        {
                            // The mention text comes from the same (first) pronoun whose indices are reported.
                            FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QPV", Pronoun);
                            continue;
                        }
                    }
                }
            }
        }

Example #7

Show file

File: NegraPennTreebankParserParams.cs Project: awesomedotnetcore/Stanford.CoreNLP.NET

        /// <summary>
        /// Performs the language-specific annotation of a single tree node.
        /// </summary>
        /// <remarks>
        /// A null tree or a leaf is returned untouched. For any other node the enabled marker
        /// suffixes are appended, in a fixed order, to the node's category, and the node label is
        /// replaced by a CategoryWordTag built from that annotated category plus the node's
        /// original word and tag. Any parameterization should live in the specific
        /// TreebankLangParserParams class.
        /// </remarks>
        /// <param name="t">The node to annotate.</param>
        /// <param name="root">Root of the tree containing <paramref name="t"/>; not read by this implementation.</param>
        /// <returns>The same node <paramref name="t"/>, relabelled unless it was null or a leaf.</returns>
        public override Tree TransformTree(Tree t, Tree root)
        {
            if (t == null || t.IsLeaf())
            {
                return(t);
            }
            CoreLabel nodeLabel = (CoreLabel)t.Label();
            string    word      = nodeLabel.Word();
            string    tag       = nodeLabel.Tag();
            string    cat       = nodeLabel.Value();
            string    baseCat   = TreebankLanguagePack().BasicCategory(cat);

            // Markers are appended straight onto the category as they are decided.
            // (Parent-category annotation is intentionally not performed here, and there is
            // currently no tag annotation.)
            StringBuilder annotatedCat = new StringBuilder(cat);

            if (!t.IsPhrasal())
            {
                // Preterminal case: only the colon/semicolon punctuation marker applies.
                if (markColon && cat.Equals("$."))
                {
                    if (word.Equals(":") || word.Equals(";"))
                    {
                        annotatedCat.Append("-%colon");
                    }
                }
            }
            else
            {
                IList <string> childCats = ChildBasicCats(t);

                // VPs headed by "zu" verbs.
                if (markZuVP && baseCat.Equals("VP"))
                {
                    if (childCats.Contains("VZ") || childCats.Contains("VVIZU"))
                    {
                        annotatedCat.Append("%ZU");
                    }
                }

                // Relative-clause S nodes; needs a NegraLabel carrying an "RC" edge.
                if (markRC && (t.Label() is NegraLabel) && baseCat.Equals("S"))
                {
                    var edge = ((NegraLabel)t.Label()).GetEdge();
                    if (edge != null && edge.Equals("RC"))
                    {
                        annotatedCat.Append("%RC");
                    }
                }

                if (markContainsV && ContainsVP(t))
                {
                    annotatedCat.Append("%vp");
                }

                if (markLP && LeftPhrasal(t))
                {
                    annotatedCat.Append("%LP");
                }

                // The next two markers depend on functional tags being present on the children.
                if (markKonjParent)
                {
                    foreach (string childCat in childCats)
                    {
                        if (!childCat.Contains("-KONJ"))
                        {
                            continue;
                        }
                        annotatedCat.Append("%konjp");
                        break;
                    }
                }
                if (markHDParent)
                {
                    foreach (string childCat in childCats)
                    {
                        if (!childCat.Contains("-HD"))
                        {
                            continue;
                        }
                        annotatedCat.Append("%hdp");
                        break;
                    }
                }
            }

            t.SetLabel(new CategoryWordTag(annotatedCat.ToString(), word, tag));
            return(t);
        }

Example #8

Show file

        /// <summary>
        /// Builds the counter of binary ("B-"-prefixed) features describing mention
        /// <paramref name="p"/>: the NER type of its head word, whether its normalized span is a
        /// known named-entity string, the words and POS tags around and at the edges of the span,
        /// and whether it contains or is contained in another mention from <paramref name="shares"/>.
        /// </summary>
        /// <param name="p">The mention to describe.</param>
        /// <param name="shares">Mentions compared against <paramref name="p"/> for containment.</param>
        /// <param name="neStrings">Known named-entity strings checked against the normalized span.</param>
        /// <param name="dict">Not read by this method.</param>
        /// <param name="props">Not read by this method.</param>
        /// <returns>A new counter holding the extracted features.</returns>
        public static ICounter <string> ExtractFeatures(Mention p, ICollection <Mention> shares, ICollection <string> neStrings, Dictionaries dict, Properties props)
        {
            ICounter <string> features    = new ClassicCounter <string>();
            string            mentionText = p.LowercaseNormalizedSpanString();
            string            headNer     = p.headWord.Ner();
            int               first       = p.startIndex;
            int               pastLast    = p.endIndex;
            IList <CoreLabel> sentence    = p.sentenceWords;

            // Neighbouring tokens; null when the mention touches the sentence boundary.
            CoreLabel before = null;
            if (first != 0)
            {
                before = sentence[first - 1];
            }
            CoreLabel after = null;
            if (pastLast != sentence.Count)
            {
                after = sentence[pastLast];
            }
            CoreLabel spanHead = p.originalSpan[0];
            CoreLabel spanTail = p.originalSpan[p.originalSpan.Count - 1];

            features.IncrementCount("B-NETYPE-" + headNer);
            if (neStrings.Contains(mentionText))
            {
                features.IncrementCount("B-NE-STRING-EXIST");
                // Full span: neither neighbour carries the same NER type as the head word.
                bool sameTypeBefore = before != null && before.Ner().Equals(headNer);
                if (!sameTypeBefore)
                {
                    bool sameTypeAfter = after != null && after.Ner().Equals(headNer);
                    if (!sameTypeAfter)
                    {
                        features.IncrementCount("B-NE-FULLSPAN");
                    }
                }
            }

            bool hasBefore = before != null;
            bool hasAfter  = after != null;
            if (hasBefore)
            {
                features.IncrementCount("B-PRECEDINGWORD-" + before.Word());
            }
            if (hasAfter)
            {
                features.IncrementCount("B-FOLLOWINGWORD-" + after.Word());
            }
            if (hasBefore)
            {
                features.IncrementCount("B-PRECEDINGPOS-" + before.Tag());
            }
            if (hasAfter)
            {
                features.IncrementCount("B-FOLLOWINGPOS-" + after.Tag());
            }

            features.IncrementCount("B-FIRSTWORD-" + spanHead.Word());
            features.IncrementCount("B-FIRSTPOS-" + spanHead.Tag());
            features.IncrementCount("B-LASTWORD-" + spanTail.Word());
            // NOTE(review): the tag of the last token is counted under the "B-LASTWORD-" prefix,
            // not a POS-specific one - confirm against existing models before renaming.
            features.IncrementCount("B-LASTWORD-" + spanTail.Tag());

            // Does this mention contain some other mention?
            foreach (Mention other in shares)
            {
                if (other != p && other.InsideIn(p))
                {
                    features.IncrementCount("B-BIGGER-THAN-ANOTHER");
                    break;
                }
            }
            // Is this mention contained in some other mention?
            foreach (Mention other_1 in shares)
            {
                if (other_1 != p && p.InsideIn(other_1))
                {
                    features.IncrementCount("B-SMALLER-THAN-ANOTHER");
                    break;
                }
            }
            return(features);
        }

Example #9

Show file

 /// <summary>
 /// Writes the given sentences and their dependency trees to <paramref name="outFile"/>,
 /// one tab-separated line per token followed by an empty line after each sentence.
 /// </summary>
 /// <remarks>
 /// Each token line holds the 1-based token index, the word, the tag (written twice), the
 /// head index and the dependency label; the remaining columns are "_" placeholders.
 /// </remarks>
 /// <param name="outFile">Path of the file to write.</param>
 /// <param name="sentences">Sentences whose tokens are written.</param>
 /// <param name="trees">Dependency trees, index-aligned with <paramref name="sentences"/>.</param>
 /// <exception cref="RuntimeIOException">Wraps any exception raised while opening or writing the file.</exception>
 public static void WriteConllFile(string outFile, IList <ICoreMap> sentences, IList <DependencyTree> trees)
 {
     try
     {
         PrintWriter output = IOUtils.GetPrintWriter(outFile);
         try
         {
             for (int i = 0; i < sentences.Count; i++)
             {
                 ICoreMap          sentence = sentences[i];
                 DependencyTree    tree     = trees[i];
                 IList <CoreLabel> tokens   = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                 // j is 1-based because it is passed to GetHead/GetLabel and printed as the token index.
                 for (int j = 1; j <= tokens.Count; ++j)
                 {
                     CoreLabel token = tokens[j - 1];
                     output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
                 }
                 output.Println();
             }
         }
         finally
         {
             // Close the writer even when writing fails part-way through.
             output.Close();
         }
     }
     catch (Exception e)
     {
         throw new RuntimeIOException(e);
     }
 }

Example #10

Show file

        /// <summary>
        /// Lemmatizes the word of <paramref name="label"/> using its part-of-speech tag and
        /// stores the result on the same label under annotation <paramref name="ann"/>.
        /// The label is expected to already carry a TextAnnotation and a PartOfSpeechAnnotation.
        /// </summary>
        /// <param name="label">Token that is read (word, tag) and then updated.</param>
        /// <param name="ann">Annotation key under which the lemma is stored.</param>
        public virtual void Stem(CoreLabel label, Type ann)
        {
            string word = label.Word();
            string pos  = label.Tag();

            label.Set(ann, Lemmatize(word, pos, lexer, lexer.Option(1)));
        }

Example #11

Show file

        public static ICollection <SurfacePattern> GetContext(IList <CoreLabel> sent, int i, ICollection <CandidatePhrase> stopWords)
        {
            ICollection <SurfacePattern> prevpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> nextpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> prevnextpatterns = new HashSet <SurfacePattern>();
            CoreLabel token = sent[i];
            string    tag   = null;

            if (usePOS4Pattern)
            {
                string fulltag = token.Tag();
                if (useCoarsePOS)
                {
                    tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2));
                }
                else
                {
                    tag = fulltag;
                }
            }
            string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));

            for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++)
            {
                IList <Token>  previousTokens   = new List <Token>();
                IList <string> originalPrev     = new List <string>();
                IList <string> originalNext     = new List <string>();
                IList <Token>  nextTokens       = new List <Token>();
                int            numStopWordsprev = 0;
                int            numStopWordsnext = 0;
                // int numPrevTokensSpecial = 0, numNextTokensSpecial = 0;
                int          numNonStopWordsNext = 0;
                int          numNonStopWordsPrev = 0;
                bool         useprev             = false;
                bool         usenext             = false;
                PatternToken twithoutPOS         = null;
                //TODO: right now using numWordsCompoundMax.
                if (addPatWithoutPOS)
                {
                    twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                PatternToken twithPOS = null;
                if (usePOS4Pattern)
                {
                    twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                if (usePreviousContext)
                {
                    // int j = Math.max(0, i - 1);
                    int j         = i - 1;
                    int numTokens = 0;
                    while (numTokens < maxWin && j >= 0)
                    {
                        // for (int j = Math.max(i - maxWin, 0); j < i; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            j--;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException("how come the class "
                        //                + answerClass.get(label) + " for token "
                        //                + tokenj.word() + " in " + sent + " is not set");
                        //          }
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        if (!isLabeledO)
                        {
                            // numPrevTokensSpecial++;
                            previousTokens.Add(0, strgeneric);
                            // previousTokens.add(0,
                            // "[{answer:"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalPrev.Add(0, strOriginal);
                            numNonStopWordsPrev++;
                        }
                        else
                        {
                            if (tokenj.Word().StartsWith("http"))
                            {
                                useprev = false;
                                previousTokens.Clear();
                                originalPrev.Clear();
                                break;
                            }
                            else
                            {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                previousTokens.Add(0, str);
                                originalPrev.Add(0, tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsprev++;
                                }
                                else
                                {
                                    numNonStopWordsPrev++;
                                }
                            }
                        }
                        numTokens++;
                        j--;
                    }
                }
                if (useNextContext)
                {
                    int numTokens = 0;
                    int j         = i + 1;
                    while (numTokens < maxWin && j < sent.Count)
                    {
                        // for (int j = i + 1; j < sent.size() && j <= i + maxWin; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            j++;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException(
                        //                "how come the dict annotation for token " + tokenj.word()
                        //                    + " in " + sent + " is not set");
                        //          }
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        // boolean isLabeledO = tokenj.get(answerClass.get(label))
                        // .equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
                        if (!isLabeledO)
                        {
                            // numNextTokensSpecial++;
                            numNonStopWordsNext++;
                            nextTokens.Add(strgeneric);
                            // nextTokens.add("[{" + label + ":"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalNext.Add(strOriginal);
                        }
                        else
                        {
                            // originalNextStr += " "
                            // + tokenj.get(answerClass.get(label)).toString();
                            if (tokenj.Word().StartsWith("http"))
                            {
                                usenext = false;
                                nextTokens.Clear();
                                originalNext.Clear();
                                break;
                            }
                            else
                            {
                                // if (!tokenj.word().matches("[.,?()]")) {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                nextTokens.Add(str);
                                originalNext.Add(tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsnext++;
                                }
                                else
                                {
                                    numNonStopWordsNext++;
                                }
                            }
                        }
                        j++;
                        numTokens++;
                    }
                }
                // String prevContext = null, nextContext = null;
                // int numNonSpecialPrevTokens = previousTokens.size()
                // - numPrevTokensSpecial;
                // int numNonSpecialNextTokens = nextTokens.size() - numNextTokensSpecial;
                Token[] prevContext = null;
                //String[] prevContext = null;
                //String[] prevOriginalArr = null;
                // if (previousTokens.size() >= minWindow4Pattern
                // && (numStopWordsprev < numNonSpecialPrevTokens ||
                // numNonSpecialPrevTokens > numMinStopWordsToAdd)) {
                if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd))
                {
                    // prevContext = StringUtils.join(previousTokens, fw);
                    IList <Token>  prevContextList = new List <Token>();
                    IList <string> prevOriginal    = new List <string>();
                    foreach (Token p in previousTokens)
                    {
                        prevContextList.Add(p);
                        if (!fw.IsEmpty())
                        {
                            prevContextList.Add(fw);
                        }
                    }
                    // add fw and sw to the the originalprev
                    foreach (string p_1 in originalPrev)
                    {
                        prevOriginal.Add(p_1);
                        if (!fw.IsEmpty())
                        {
                            prevOriginal.Add(" FW ");
                        }
                    }
                    if (!sw.IsEmpty())
                    {
                        prevContextList.Add(sw);
                        prevOriginal.Add(" SW ");
                    }
                    // String str = prevContext + fw + sw;
                    if (IsASCII(StringUtils.Join(prevOriginal)))
                    {
                        prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]);
                        //prevOriginalArr = prevOriginal.toArray(new String[0]);
                        if (previousTokens.Count >= minWindow4Pattern)
                        {
                            if (twithoutPOS != null)
                            {
                                SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(pat);
                            }
                            if (twithPOS != null)
                            {
                                SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(patPOS);
                            }
                        }
                        useprev = true;
                    }
                }
                Token[] nextContext = null;
                //String [] nextOriginalArr = null;
                // if (nextTokens.size() > 0
                // && (numStopWordsnext < numNonSpecialNextTokens ||
                // numNonSpecialNextTokens > numMinStopWordsToAdd)) {
                if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd))
                {
                    // nextContext = StringUtils.join(nextTokens, fw);
                    IList <Token>  nextContextList = new List <Token>();
                    IList <string> nextOriginal    = new List <string>();
                    if (!sw.IsEmpty())
                    {
                        nextContextList.Add(sw);
                        nextOriginal.Add(" SW ");
                    }
                    foreach (Token n in nextTokens)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextContextList.Add(fw);
                        }
                        nextContextList.Add(n);
                    }
                    foreach (string n_1 in originalNext)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextOriginal.Add(" FW ");
                        }
                        nextOriginal.Add(n_1);
                    }
                    if (nextTokens.Count >= minWindow4Pattern)
                    {
                        nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]);
                        //nextOriginalArr =  nextOriginal.toArray(new String[0]);
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(patPOS);
                        }
                    }
                    usenext = true;
                }
                if (useprev && usenext)
                {
                    // String strprev = prevContext + fw + sw;
                    // String strnext = sw + fw + nextContext;
                    if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern)
                    {
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(patPOS);
                        }
                    }
                }
            }
            //    Triple<Set<Integer>, Set<Integer>, Set<Integer>> patterns = new Triple<Set<Integer>, Set<Integer>, Set<Integer>>(
            //        prevpatterns, nextpatterns, prevnextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prev patterns are " + prevpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " next patterns are " + nextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prevnext patterns are " + prevnextpatterns);
            //getPatternIndex().finishCommit();
            return(CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns));
        }

Example #12

Show file

File: IOBUtils.cs Project: zerouid/Stanford.CoreNLP.NET

        /// <summary>Convert token to a sequence of datums and add to iobList.</summary>
        /// <remarks>
        /// One datum is created per character of <paramref name="token"/> (after the optional
        /// rewrite rules have been applied). The first character gets the begin label, the
        /// remaining characters get the continuation label, and characters affected by a
        /// rewrite get the rewrite label. Nothing is added when the token is empty, either on
        /// entry or after the rewrite rules have run.
        /// </remarks>
        /// <param name="iobList">list the created datums are appended to</param>
        /// <param name="cl">label handed to CreateDatum; also supplies the rewritten-Arabic annotation and the begin/end positions</param>
        /// <param name="token">text of the token to split into per-character datums</param>
        /// <param name="tokType">token type, forwarded to StripSegmentationMarkers for the rewritten annotation</param>
        /// <param name="tokenLabel">label whose Word() and Tag() drive the rewrite rules; its Word() is also the original word when <paramref name="origText"/> is null</param>
        /// <param name="lastToken">text of the preceding token, consulted by rewrite rule #2</param>
        /// <param name="applyRewriteRules">whether the Arabic-specific rewrite rules are applied</param>
        /// <param name="stripRewrites">when true, the rules that check it do not mark characters with the rewrite label</param>
        /// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
        /// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
        private static void TokenToDatums(IList <CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory <CoreLabel> tf, string origText
                                          )
        {
            if (token.IsEmpty())
            {
                return;
            }
            string lastLabel        = ContinuationSymbol;
            string firstLabel       = BeginSymbol;
            string rewritten        = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
            bool   crossRefRewrites = true;

            if (rewritten == null)
            {
                // No rewritten annotation: compare the token against itself, i.e. no cross-referencing.
                rewritten        = token;
                crossRefRewrites = false;
            }
            else
            {
                rewritten = StripSegmentationMarkers(rewritten, tokType);
            }
            if (applyRewriteRules)
            {
                // Apply Arabic-specific re-write rules
                string rawToken = tokenLabel.Word();
                string tag      = tokenLabel.Tag();
                MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
                MorphoFeatures features = featureSpec.StrToFeatures(tag);
                // Rule #1 : ت --> ة
                if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG") && rawToken.EndsWith("ت-") && !stripRewrites)
                {
                    lastLabel = RewriteSymbol;
                }
                else
                {
                    if (rawToken.EndsWith("ة-"))
                    {
                        System.Diagnostics.Debug.Assert(token.EndsWith("ة"));
                        token     = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
                        lastLabel = RewriteSymbol;
                    }
                }
                // Rule #2 : لل --> ل ال
                if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
                {
                    if (rawToken.StartsWith("-ال"))
                    {
                        if (!token.StartsWith("ا"))
                        {
                            log.Info("Bad REWAL: " + rawToken + " / " + token);
                        }
                        // Drop the first character from both the token and its rewritten form.
                        token     = Sharpen.Runtime.Substring(token, 1);
                        rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                        if (!stripRewrites)
                        {
                            firstLabel = RewriteSymbol;
                        }
                    }
                    else
                    {
                        if (rawToken.StartsWith("-ل"))
                        {
                            if (!token.StartsWith("ل"))
                            {
                                log.Info("Bad REWAL: " + rawToken + " / " + token);
                            }
                            if (!stripRewrites)
                            {
                                firstLabel = RewriteSymbol;
                            }
                        }
                        else
                        {
                            log.Info("Ignoring REWAL: " + rawToken + " / " + token);
                        }
                    }
                }
                // Rule #3 : ي --> ى
                // Rule #4 : ا --> ى
                if (rawToken.EndsWith("ى-"))
                {
                    if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
                    {
                        // verb: ى becomes ا
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
                    }
                    else
                    {
                        // assume preposition:
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
                    }
                    if (!stripRewrites)
                    {
                        lastLabel = RewriteSymbol;
                    }
                }
                else
                {
                    if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
                    {
                        if (!stripRewrites)
                        {
                            lastLabel = RewriteSymbol;
                        }
                    }
                }
            }
            string origWord;

            if (origText == null)
            {
                origWord = tokenLabel.Word();
            }
            else
            {
                origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
            }
            int origIndex = 0;

            // Skip leading characters of the original word that IsDeletedCharacter reports as deleted.
            while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
            {
                ++origIndex;
            }
            // Create datums and add to iobList
            if (token.IsEmpty())
            {
                // Rule #2 removes the first character, so a one-character token can end up empty.
                // There is no character to emit, and indexing token[0] below would throw, so stop here.
                log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
                return;
            }
            string firstChar = token[0].ToString();

            // Start at 0 to make sure we include the whole token according to the tokenizer
            iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
            int numChars = token.Length;

            if (crossRefRewrites && rewritten.Length != numChars)
            {
                // Lengths differ, so a per-character comparison against the rewritten form is not possible.
                System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
                crossRefRewrites = false;
            }
            ++origIndex;
            for (int j = 1; j < numChars; ++j, ++origIndex)
            {
                while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
                {
                    ++origIndex;
                }
                // Clamp to the last character of the original word if the token is longer than it.
                if (origIndex >= origWord.Length)
                {
                    origIndex = origWord.Length - 1;
                }
                string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
                string thisChar  = token[j].ToString();
                if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
                {
                    charLabel = RewriteSymbol;
                }
                if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
                {
                    charLabel = RewriteSymbol;
                }
                // Assume all mid-word alef maqsura are supposed to be yah
                iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
            }
            // End at endPosition to make sure we include the whole token according to the tokenizer
            if (!iobList.IsEmpty())
            {
                iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
            }
        }