Example #1
0
        /// <summary>
        /// Set the tags of the original tokens and the leaves if they
        /// aren't already set.
        /// </summary>
        /// <param name="sentence">Sentence whose <c>TokensAnnotation</c> tokens are inspected; tokens with a null tag are updated in place.</param>
        /// <param name="tree">Tree whose tagged yield supplies the missing tags and whose leaves are tagged alongside the tokens.</param>
        private static void SetMissingTags(ICoreMap sentence, Tree tree)
        {
            IList <TaggedWord> taggedWords = null;
            IList <ILabel>     leaves      = null;
            IList <CoreLabel>  tokens      = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            // The loop bound used to be an undeclared identifier; it is the token count.
            int size = tokens.Count;

            for (int i = 0; i < size; ++i)
            {
                CoreLabel token = tokens[i];
                if (token.Tag() == null)
                {
                    // The yields are fetched lazily, only once a token that needs a tag is found.
                    if (taggedWords == null)
                    {
                        taggedWords = tree.TaggedYield();
                    }
                    if (leaves == null)
                    {
                        leaves = tree.Yield();
                    }
                    // NOTE(review): assumes the tree yield lines up index-for-index with the sentence tokens - confirm with callers.
                    token.SetTag(taggedWords[i].Tag());
                    ILabel leaf = leaves[i];
                    if (leaf is IHasTag)
                    {
                        ((IHasTag)leaf).SetTag(taggedWords[i].Tag());
                    }
                }
            }
        }
        /// <summary>
        /// Builds the classification datum for the token at position <paramref name="i"/> of a sentence.
        /// </summary>
        /// <remarks>
        /// The label is <c>answerLabel</c> when the string form of the token's <c>answerClass</c> annotation
        /// equals it, otherwise <c>"O"</c>. One <c>"Cluster-&lt;id&gt;"</c> feature with count 1.0 is set
        /// per matched phrase, using <c>-1</c> as the id for phrases that have no entry in <c>clusterIds</c>.
        /// </remarks>
        /// <param name="sent">Tokens of the sentence.</param>
        /// <param name="i">Index into <paramref name="sent"/> of the token to featurize.</param>
        /// <returns>A datum holding the feature counter and the label.</returns>
        private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
        {
            ICounter <string> feat = new ClassicCounter <string>();
            CoreLabel         l    = sent[i];
            string            label;

            if (l.Get(answerClass).ToString().Equals(answerLabel))
            {
                label = answerLabel;
            }
            else
            {
                label = "O";
            }
            CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));

            if (matchedPhrases == null)
            {
                // No recorded phrase matches: fall back to the token's own word as the only phrase.
                matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
                matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
            }
            foreach (CandidatePhrase w in matchedPhrases.AllValues())
            {
                // An int can never equal null, so the previous "num == null" fallback was dead code and an
                // unknown phrase went through the indexer instead of receiving the -1 cluster id.
                // NOTE(review): assumes clusterIds is an IDictionary<string, int>-like map exposing TryGetValue - confirm.
                int num;
                if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
                {
                    num = -1;
                }
                feat.SetCount("Cluster-" + num, 1.0);
            }
            // feat.incrementCount("WORD-" + l.word());
            // feat.incrementCount("LEMMA-" + l.lemma());
            // feat.incrementCount("TAG-" + l.tag());
            // With a window of 0 neither context loop below executes; raise it to enable PREV-/NEXT- features.
            int window = 0;

            for (int j = Math.Max(0, i - window); j < i; j++)
            {
                CoreLabel lj = sent[j];
                feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
                feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
            }
            for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
            {
                CoreLabel lj = sent[j_1];
                feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
                feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
            }
            // System.out.println("adding " + l.word() + " as " + label);
            return(new RVFDatum <string, string>(feat, label));
        }
        /// <summary>
        /// Checks whether a token satisfies the NER and POS-tag restrictions configured for a label.
        /// </summary>
        /// <remarks>
        /// When <c>PatternFactory.useTargetNERRestriction</c> is set, the token's named-entity tag must match
        /// one of <c>constVars.allowedNERsforLabels[label]</c>. If that passes (or the restriction is off) and
        /// <c>constVars.allowedTagsInitials</c> has an entry for the label, the token's tag must additionally
        /// start with one of the listed prefixes. The outcome is printed when <c>constVars.debug</c> is at least 4.
        /// </remarks>
        /// <param name="coreLabel">Token to test.</param>
        /// <param name="label">Label whose restrictions are applied.</param>
        /// <returns><c>true</c> if the token passes every applicable restriction.</returns>
        private bool MatchedRestriction(CoreLabel coreLabel, string label)
        {
            bool accepted;

            if (!PatternFactory.useTargetNERRestriction)
            {
                //System.out.println("not matching NER");
                accepted = true;
            }
            else
            {
                // Accept as soon as any allowed NER pattern matches the token's entity tag.
                accepted = false;
                foreach (string nerPattern in constVars.allowedNERsforLabels[label])
                {
                    if (coreLabel.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Matches(nerPattern))
                    {
                        accepted = true;
                        break;
                    }
                }
            }
            if (accepted)
            {
                string posTag            = coreLabel.Tag();
                bool   hasTagRestriction = constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.Contains(label);
                if (hasTagRestriction)
                {
                    // The verdict is that of the last prefix examined; stop at the first hit.
                    // An empty prefix collection leaves the token accepted.
                    foreach (string prefix in constVars.allowedTagsInitials[label])
                    {
                        accepted = posTag.StartsWith(prefix);
                        if (accepted)
                        {
                            break;
                        }
                    }
                }
            }
            if (constVars.debug >= 4)
            {
                string verdict = accepted ? " matched restriction " : " did not matched restrict ";
                System.Console.Out.WriteLine(coreLabel.Word() + verdict + (PatternFactory.useTargetNERRestriction ? constVars.allowedNERsforLabels[label] : string.Empty) + "and" + PatternFactory.useTargetNERRestriction + " and " + (constVars
                                                                                                                                                                                                                              .allowedTagsInitials != null ? constVars.allowedTagsInitials[label] : string.Empty));
            }
            return(accepted);
        }
        /// <summary>
        /// Builds one feature datum per (quote, candidate mention) pair for a document.
        /// </summary>
        /// <remarks>
        /// For every quote, candidate mentions are collected from the text before the quote (reaching back
        /// into the previous paragraph) and from the text after the quote up to the end of the quote's own
        /// paragraph. Each candidate yields a counter of distance, paragraph, neighbouring-word, quote and
        /// vocative features. In training mode the datum is labelled <c>"isMention"</c> or
        /// <c>"isNotMention"</c> depending on whether the gold mention span contains the candidate; otherwise
        /// it is labelled <c>"none"</c>. Quotes whose gold speaker is empty are skipped during training and
        /// get no entry in the quote-to-range map.
        /// </remarks>
        /// <param name="sd">Document plus the character map, pronoun coreference map and animacy list used to build the sieve.</param>
        /// <param name="goldList">Gold annotations, one per quote; null if not training.</param>
        /// <param name="isTraining">Whether gold labels are attached to the produced data.</param>
        /// <returns>The dataset together with the quote-to-datum-range and datum-to-mention maps.</returns>
        /// <exception cref="Exception">
        /// Thrown when training and the gold list size differs from the quote count, or when a quote cannot
        /// be located among the quotes of its own paragraph.
        /// </exception>
        public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList <XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
        {
            Annotation doc = sd.doc;

            // NOTE(review): 'sieve' is not a local, so this overwrites shared state on every call - confirm callers are single-threaded.
            sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
            IList <ICoreMap>  quotes    = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <ICoreMap>  sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IDictionary <int, IList <ICoreMap> > paragraphToQuotes = GetQuotesInParagraph(doc);
            //necessary for 'ScoreBestMention'
            GeneralDataset <string, string>      dataset           = new RVFDataset <string, string>();
            //maps quote to corresponding indices in the dataset
            IDictionary <int, Pair <int, int> > mapQuoteToDataRange = new Dictionary <int, Pair <int, int> >();
            //maps a datum's index in the dataset to the mention it was built from
            IDictionary <int, Sieve.MentionData> mapDatumToMention = new Dictionary <int, Sieve.MentionData>();

            if (isTraining && goldList.Count != quotes.Count)
            {
                throw new Exception("Gold Quote List size doesn't match quote list size!");
            }
            for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
            {
                int      initialSize = dataset.Size();
                ICoreMap quote       = quotes[quoteIdx];
                XMLToAnnotation.GoldQuoteInfo gold = null;
                if (isTraining)
                {
                    gold = goldList[quoteIdx];
                    if (gold.speaker == string.Empty)
                    {
                        continue;
                    }
                }
                ICoreMap        quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
                Pair <int, int> quoteRun           = new Pair <int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                //      int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class);
                int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));
                //add mentions before quote up to the previous paragraph
                int rightValue = quoteRun.first - 1;
                int leftValue  = quoteRun.first - 1;
                //move left value to be the first token idx of the previous paragraph
                for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
                {
                    ICoreMap sentence = sentences[sentIdx];
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        continue;
                    }
                    //quoteParagraphIdx - 1 for this and prev
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                    {
                        leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInPreviousParagraph = new List <Sieve.MentionData>();
                if (leftValue > -1 && rightValue > -1)
                {
                    mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair <int, int>(leftValue, rightValue)));
                }
                //mentions in next paragraph
                leftValue  = quoteRun.second + 1;
                rightValue = quoteRun.second + 1;
                for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
                {
                    ICoreMap sentence = sentences[sentIdx_1];
                    //        if(sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
                    //          continue;
                    //        }
                    // As written, the forward span only extends to the end of the quote's own paragraph
                    // (the original note here read "quoteParagraphIdx + 1").
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInNextParagraph = new List <Sieve.MentionData>();
                if (leftValue < tokens.Count && rightValue < tokens.Count)
                {
                    mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair <int, int>(leftValue, rightValue));
                }
                IList <Sieve.MentionData> candidateMentions = new List <Sieve.MentionData>();
                Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
                Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);
                //      System.out.println(candidateMentions.size());
                int rankedDistance = 1;
                int numBackwards   = mentionsInPreviousParagraph.Count;
                foreach (Sieve.MentionData mention in candidateMentions)
                {
                    IList <CoreLabel> mentionCandidateTokens   = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
                    ICoreMap          mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
                    //        if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) {
                    //          continue;
                    //        }
                    ICounter <string> features = new ClassicCounter <string>();
                    //1. distance features: first assume the mention is left of the quote, then try the right side
                    bool isLeft   = true;
                    int  distance = quoteRun.first - mention.end;
                    if (distance < 0)
                    {
                        isLeft   = false;
                        distance = mention.begin - quoteRun.second;
                    }
                    if (distance < 0)
                    {
                        //disregard mention-in-quote cases.
                        continue;
                    }
                    features.SetCount("wordDistance", distance);
                    IList <CoreLabel> betweenTokens;
                    if (isLeft)
                    {
                        betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
                    }
                    else
                    {
                        betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
                    }
                    //Punctuation in between
                    foreach (CoreLabel token in betweenTokens)
                    {
                        if (punctuation.Contains(token.Word()))
                        {
                            features.SetCount("punctuationPresence:" + token.Word(), 1);
                        }
                    }
                    // number of mentions away
                    features.SetCount("rankedDistance", rankedDistance);
                    rankedDistance++;
                    if (rankedDistance == numBackwards)
                    {
                        //reset for the forward
                        rankedDistance = 1;
                    }
                    //        int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
                    //third distance: # of paragraphs away
                    int      mentionParagraphIdx        = -1;
                    ICoreMap sentenceInMentionParagraph = null;
                    int      quoteParagraphBeginToken   = GetParagraphBeginToken(quoteFirstSentence, sentences);
                    int      quoteParagraphEndToken     = GetParagraphEndToken(quoteFirstSentence, sentences);
                    if (isLeft)
                    {
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("leftParagraphDistance", 0);
                            mentionParagraphIdx        = quoteParagraphIdx;
                            sentenceInMentionParagraph = quoteFirstSentence;
                        }
                        else
                        {
                            int      paragraphDistance = 1;
                            int      currParagraphIdx  = quoteParagraphIdx - paragraphDistance;
                            ICoreMap currSentence      = quoteFirstSentence;
                            int      currSentenceIdx   = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currParagraphIdx >= 0)
                            {
                                //              Paragraph prevParagraph = paragraphs.get(prevParagraphIndex);
                                //walk back to a sentence of the paragraph being examined
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                                {
                                    currSentenceIdx--;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                                {
                                    mentionParagraphIdx        = currParagraphIdx;
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("leftParagraphDistance", paragraphDistance);
                                    if (paragraphDistance % 2 == 0)
                                    {
                                        features.SetCount("leftParagraphDistanceEven", 1);
                                    }
                                    break;
                                }
                                paragraphDistance++;
                                currParagraphIdx--;
                            }
                        }
                    }
                    else
                    {
                        //right
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("rightParagraphDistance", 0);
                            sentenceInMentionParagraph = quoteFirstSentence;
                            mentionParagraphIdx        = quoteParagraphIdx;
                        }
                        else
                        {
                            int      paragraphDistance  = 1;
                            int      nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                            ICoreMap currSentence       = quoteFirstSentence;
                            int      currSentenceIdx    = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currSentenceIdx < sentences.Count)
                            {
                                // NOTE(review): this inner walk has no upper bound check of its own; it relies on the
                                // target paragraph existing further on in the document - confirm.
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                                {
                                    currSentenceIdx++;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int nextParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                                {
                                    // NOTE(review): unlike the left-hand case, mentionParagraphIdx is left at -1 here,
                                    // so the quote lookup below uses key -1 - kept as is, confirm whether intended.
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("rightParagraphDistance", paragraphDistance);
                                    break;
                                }
                                paragraphDistance++;
                                nextParagraphIndex++;
                            }
                        }
                    }
                    //2. mention features
                    if (sentenceInMentionParagraph != null)
                    {
                        int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                        int mentionParagraphEnd   = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                        // only computed when the mention lives in a different paragraph than the quote
                        if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                        {
                            IList <ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List <ICoreMap>());
                            Pair <List <string>, List <Pair <int, int> > > namesInMentionParagraph = sieve.ScanForNames(new Pair <int, int>(mentionParagraphBegin, mentionParagraphEnd));
                            features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                            features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                            features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                            //mention ordering in paragraph it is in
                            for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                                {
                                    features.SetCount("orderInParagraph", i);
                                }
                            }
                            //if mention paragraph is all one quote
                            if (quotesInMentionParagraph.Count == 1)
                            {
                                ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                                if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                                {
                                    features.SetCount("mentionParagraphIsInConversation", 1);
                                }
                                else
                                {
                                    features.SetCount("mentionParagraphIsInConversation", -1);
                                }
                            }
                            foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair <int, int>(mention.begin, mention.end)))
                                {
                                    features.SetCount("mentionInQuote", 1);
                                }
                            }
                            if (features.GetCount("mentionInQuote") != 1)
                            {
                                features.SetCount("mentionNotInQuote", 1);
                            }
                        }
                    }
                    // nearby word syntax types...make sure to check if there are previous or next words
                    // or there will be an array index crash
                    if (mention.begin > 0)
                    {
                        CoreLabel prevWord = tokens[mention.begin - 1];
                        features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(prevWord.Lemma()))
                        {
                            features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                        }
                    }
                    if (mention.end + 1 < tokens.Count)
                    {
                        CoreLabel nextWord = tokens[mention.end + 1];
                        features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(nextWord.Lemma()))
                        {
                            features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                        }
                    }
                    //                    features.setCount("prevAndNext:" + prevWord.tag()+ ";" + nextWord.tag(), 1);
                    //quote paragraph features
                    IList <ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
                    features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
                    features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
                    features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair <int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);
                    //quote features
                    features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
                    for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
                    {
                        if (quotesInQuoteParagraph[i_1].Equals(quote))
                        {
                            features.SetCount("quotePosition", i_1 + 1);
                        }
                    }
                    // quotePosition is 1-based, so a count of 0 means the quote was not found in its own paragraph
                    if (features.GetCount("quotePosition") == 0)
                    {
                        throw new Exception("Check this (equality not working)");
                    }
                    Pair <List <string>, List <Pair <int, int> > > namesData = sieve.ScanForNames(quoteRun);
                    foreach (string name in namesData.first)
                    {
                        features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
                    }
                    //if quote encompasses entire paragraph
                    if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
                    {
                        features.SetCount("isImplicitSpeaker", 1);
                    }
                    else
                    {
                        features.SetCount("isImplicitSpeaker", -1);
                    }
                    //Vocative detection
                    if (mention.type.Equals("name"))
                    {
                        IList <Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair <int, int>(mention.begin, mention.end))];
                        Person         p     = null;
                        if (pList != null)
                        {
                            p = pList[0];
                        }
                        else
                        {
                            // the exact mention text is not a known character: fall back to the first name found inside the span
                            Pair <List <string>, List <Pair <int, int> > > scanForNamesResultPair = sieve.ScanForNames(new Pair <int, int>(mention.begin, mention.end));
                            if (scanForNamesResultPair.first.Count != 0)
                            {
                                string scanForNamesResultString = scanForNamesResultPair.first[0];
                                if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                                {
                                    p = sd.characterMap[scanForNamesResultString][0];
                                }
                            }
                        }
                        if (p != null)
                        {
                            foreach (string name_1 in namesData.first)
                            {
                                if (p.aliases.Contains(name_1))
                                {
                                    features.SetCount("nameInQuote", 1);
                                }
                            }
                            if (quoteParagraphIdx > 0)
                            {
                                //            Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1);
                                IList <ICoreMap>         quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List <ICoreMap>());
                                IList <Pair <int, int> > exclusionList         = new List <Pair <int, int> >();
                                foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                                {
                                    Pair <int, int> quoteRange = new Pair <int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                                    exclusionList.Add(quoteRange);
                                    foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphQuote", 1);
                                        }
                                    }
                                }
                                int      sentenceIdx             = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                                ICoreMap sentenceInPrevParagraph = null;
                                // The loop previously declared one index but tested, decremented and indexed with another,
                                // out-of-scope one; a single index is used throughout now.
                                for (int prevIdx = sentenceIdx - 1; prevIdx >= 0; prevIdx--)
                                {
                                    ICoreMap currSentence = sentences[prevIdx];
                                    if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                                    {
                                        sentenceInPrevParagraph = currSentence;
                                        break;
                                    }
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                                IList <Pair <int, int> > prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair <int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                                foreach (Pair <int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                                {
                                    foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphNonQuote", 1);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    //3. label the datum and register it
                    string datumLabel;
                    if (!isTraining)
                    {
                        datumLabel = "none";
                    }
                    else if (QuoteAttributionUtils.RangeContains(new Pair <int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair <int, int>(mention.begin, mention.end)))
                    {
                        datumLabel = "isMention";
                    }
                    else
                    {
                        datumLabel = "isNotMention";
                    }
                    // Capture the index before adding, so the datum ID and the datum-to-mention key always agree
                    // (the "isNotMention" case used to record the mention under the index after the add).
                    int datumIdx = dataset.Size();
                    RVFDatum <string, string> datum = new RVFDatum <string, string>(features, datumLabel);
                    datum.SetID(datumIdx.ToString(System.Globalization.CultureInfo.InvariantCulture));
                    mapDatumToMention[datumIdx] = mention;
                    dataset.Add(datum);
                }
                // inclusive range of dataset indices produced for this quote (end < start when no datum was added)
                mapQuoteToDataRange[quoteIdx] = new Pair <int, int>(initialSize, dataset.Size() - 1);
            }
            return(new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset));
        }
Example #5
0
        /// <summary>
        /// Reads a tree file and a Morfette analysis file in lock-step and writes each
        /// tree to standard output after rewriting every leaf as
        /// <c>value + MorphoMark + lemma + LemmaMark + tag</c>.
        /// </summary>
        /// <param name="args">
        /// Exactly two entries: the tree file path followed by the Morfette/TnT file path.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                // NOTE(review): Printf with a Java-style format is kept untouched; this presumably
                // relies on a project-provided Printf helper -- confirm.
                System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile     = args[0];
            string             morfetteFile = args[1];
            ITreeReaderFactory trf          = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader tr = null;
                try
                {
                    tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                    IEnumerator <IList <CoreLabel> > morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
                    bool unevenInput = false;
                    while (true)
                    {
                        // Advance both inputs together so that a surplus entry on either side is
                        // always noticed, including a single trailing tree without an analysis.
                        Tree tree        = tr.ReadTree();
                        bool hasAnalysis = morfetteItr.MoveNext();
                        if (tree == null || !hasAnalysis)
                        {
                            unevenInput = tree != null || hasAnalysis;
                            break;
                        }
                        IList <CoreLabel> analysis = morfetteItr.Current;
                        IList <ILabel>    leaves   = tree.Yield();
                        System.Diagnostics.Debug.Assert(analysis.Count == leaves.Count);
                        int leafCount = leaves.Count;
                        for (int i = 0; i < leafCount; ++i)
                        {
                            CoreLabel tokenAnalysis = analysis[i];
                            ILabel    token         = leaves[i];
                            string    lemma         = GetLemma(token.Value(), tokenAnalysis.Lemma());
                            // .NET composite formatting: Java's "%s" placeholders are not substituted by string.Format.
                            string    newLeaf       = string.Format("{0}{1}{2}{3}{4}", token.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, tokenAnalysis.Tag());
                            ((CoreLabel)token).SetValue(newLeaf);
                        }
                        System.Console.Out.WriteLine(tree.ToString());
                    }
                    if (unevenInput)
                    {
                        log.Info("WARNING: Uneven input files!");
                    }
                }
                finally
                {
                    // Close the reader on every path; a failure while closing is still
                    // handled by the catch clauses below, as it was before.
                    if (tr != null)
                    {
                        tr.Close();
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Example #6
0
        /// <summary>
        /// Attributes a mention to every quote that does not yet carry a
        /// <c>MentionAnnotation</c> by matching three-element patterns around the quote.
        /// </summary>
        /// <remarks>
        /// The pattern labels passed to <c>FillInMention</c> use Q for the quote, V for a token
        /// whose tag starts with "V", C for a name returned by <c>ScanForNames</c> and P for a
        /// pronoun returned by <c>ScanForPronouns</c>. The range preceding the quote is tried
        /// first (CVQ, VCQ, PVQ, VPQ), then the range following it (QVC, QCV, QVP, QPV); the
        /// first pattern that matches wins. When the token right before the quote is in
        /// <c>beforeQuotePunctuation</c>, the preceding patterns are shifted left by one token.
        /// A neighbouring position that lies outside the document, or whose token has no tag,
        /// is treated as "not a verb" rather than raising an exception.
        /// </remarks>
        /// <param name="doc">Annotated document providing the token and quotation lists.</param>
        public virtual void TrigramPatterns(Annotation doc)
        {
            IList <CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <ICoreMap>  docQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            foreach (ICoreMap quote in docQuotes)
            {
                if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
                {
                    continue;
                }
                int             quoteBeginTokenIndex = quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                int             quoteEndTokenIndex   = quote.Get(typeof(CoreAnnotations.TokenEndAnnotation));
                Pair <int, int> precedingTokenRange  = QuoteAttributionUtils.GetTokenRangePrecedingQuote(doc, quote);
                // Patterns that end at the quote.
                if (precedingTokenRange != null)
                {
                    Pair <List <string>, List <Pair <int, int> > > namesAndNameIndices = ScanForNames(precedingTokenRange);
                    List <string>           names       = namesAndNameIndices.first;
                    List <Pair <int, int> > nameIndices = namesAndNameIndices.second;
                    if (names.Count > 0)
                    {
                        int offset = 0;
                        if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word()))
                        {
                            offset = 1;
                        }
                        // Index of the token adjacent to the quote once punctuation is skipped.
                        int             prevIndex     = quoteBeginTokenIndex - 1 - offset;
                        Pair <int, int> lastNameIndex = nameIndices[nameIndices.Count - 1];
                        //CVQ
                        if (HasVerbTagAt(docTokens, prevIndex) && lastNameIndex.second.Equals(prevIndex - 1))
                        {
                            FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram CVQ", Name);
                            continue;
                        }
                        //VCQ
                        if (lastNameIndex.second.Equals(prevIndex) && HasVerbTagAt(docTokens, lastNameIndex.first - 1))
                        {
                            FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram VCQ", Name);
                            continue;
                        }
                    }
                    List <int> pronounsIndices = ScanForPronouns(precedingTokenRange);
                    if (pronounsIndices.Count > 0)
                    {
                        int offset = 0;
                        if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word()))
                        {
                            offset = 1;
                        }
                        int prevIndex        = quoteBeginTokenIndex - 1 - offset;
                        int lastPronounIndex = pronounsIndices[pronounsIndices.Count - 1];
                        //PVQ
                        if (HasVerbTagAt(docTokens, prevIndex) && lastPronounIndex == prevIndex - 1)
                        {
                            FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram PVQ", Pronoun);
                            continue;
                        }
                        //VPQ
                        if (lastPronounIndex == prevIndex && HasVerbTagAt(docTokens, prevIndex - 1))
                        {
                            FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram VPQ", Pronoun);
                            continue;
                        }
                    }
                }
                // Patterns that start at the quote.
                Pair <int, int> followingTokenRange = QuoteAttributionUtils.GetTokenRangeFollowingQuote(doc, quote);
                if (followingTokenRange != null)
                {
                    int nextIndex = quoteEndTokenIndex + 1;
                    Pair <List <string>, List <Pair <int, int> > > namesAndNameIndices = ScanForNames(followingTokenRange);
                    List <string>           names       = namesAndNameIndices.first;
                    List <Pair <int, int> > nameIndices = namesAndNameIndices.second;
                    if (names.Count > 0)
                    {
                        Pair <int, int> firstNameIndex = nameIndices[0];
                        //QVC
                        if (HasVerbTagAt(docTokens, nextIndex) && firstNameIndex.first.Equals(nextIndex + 1))
                        {
                            FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QVC", Name);
                            continue;
                        }
                        //QCV
                        if (firstNameIndex.first.Equals(nextIndex) && HasVerbTagAt(docTokens, firstNameIndex.second + 1))
                        {
                            FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QCV", Name);
                            continue;
                        }
                    }
                    List <int> pronounsIndices = ScanForPronouns(followingTokenRange);
                    if (pronounsIndices.Count > 0)
                    {
                        int firstPronounIndex = pronounsIndices[0];
                        //QVP
                        if (HasVerbTagAt(docTokens, nextIndex) && firstPronounIndex == nextIndex + 1)
                        {
                            FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QVP", Pronoun);
                            continue;
                        }
                        //QPV
                        if (firstPronounIndex == nextIndex && HasVerbTagAt(docTokens, nextIndex + 1))
                        {
                            // The mention text must come from the same pronoun as the indices
                            // (previously the text of the last pronoun in the range was used).
                            FillInMention(quote, TokenRangeToString(firstPronounIndex), firstPronounIndex, firstPronounIndex, "trigram QPV", Pronoun);
                            continue;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Tells whether the token at <paramref name="index"/> exists and has a tag starting with "V".
        /// </summary>
        /// <param name="tokens">Token list to look into.</param>
        /// <param name="index">Position to test; may lie outside the list.</param>
        /// <returns>
        /// <c>false</c> when the index is out of range or the token has no tag; otherwise
        /// whether the tag starts with "V".
        /// </returns>
        private static bool HasVerbTagAt(IList <CoreLabel> tokens, int index)
        {
            if (index < 0 || index >= tokens.Count)
            {
                return(false);
            }
            string tag = tokens[index].Tag();
            return(tag != null && tag.StartsWith("V", System.StringComparison.Ordinal));
        }
        /// <summary>
        /// Performs the language-specific transformation of a single tree node by
        /// appending annotation markers to its category.
        /// </summary>
        /// <remarks>
        /// A null tree or a leaf is returned as-is. For any other node the label is replaced
        /// by a <c>CategoryWordTag</c> whose category is the original value followed by every
        /// marker enabled through the <c>mark*</c> switches. Any parameterization belongs in
        /// the specific TreebankLangParserParams class.
        /// </remarks>
        /// <param name="t">Node to transform; its label is replaced in place.</param>
        /// <param name="root">Root of the tree containing <paramref name="t"/>; not read by this implementation.</param>
        /// <returns>The same node instance that was passed in.</returns>
        public override Tree TransformTree(Tree t, Tree root)
        {
            if (t == null || t.IsLeaf())
            {
                return(t);
            }
            IList <string> markers   = new List <string>();
            CoreLabel      nodeLabel = (CoreLabel)t.Label();
            string         word      = nodeLabel.Word();
            string         tag       = nodeLabel.Tag();
            string         cat       = nodeLabel.Value();
            string         baseCat   = TreebankLanguagePack().BasicCategory(cat);

            // Only categories are annotated -- at present there is no tag annotation.
            if (!t.IsPhrasal())
            {
                // Remaining case: the node is not phrasal (pre-terminal).
                if (markColon && cat.Equals("$.") && (word.Equals(":") || word.Equals(";")))
                {
                    markers.Add("-%colon");
                }
            }
            else
            {
                IList <string> childCats = ChildBasicCats(t);
                // VPs headed by "zu" verbs
                if (markZuVP && baseCat.Equals("VP") && (childCats.Contains("VZ") || childCats.Contains("VVIZU")))
                {
                    markers.Add("%ZU");
                }
                // relative-clause S nodes; only NegraLabel carries the edge information
                if (markRC && (t.Label() is NegraLabel) && baseCat.Equals("S"))
                {
                    NegraLabel negra = (NegraLabel)t.Label();
                    if (negra.GetEdge() != null && negra.GetEdge().Equals("RC"))
                    {
                        markers.Add("%RC");
                    }
                }
                if (markContainsV && ContainsVP(t))
                {
                    markers.Add("%vp");
                }
                if (markLP && LeftPhrasal(t))
                {
                    markers.Add("%LP");
                }
                if (markKonjParent)
                {
                    // depends on functional tags being present on the children
                    for (int k = 0; k < childCats.Count; k++)
                    {
                        if (childCats[k].Contains("-KONJ"))
                        {
                            markers.Add("%konjp");
                            break;
                        }
                    }
                }
                if (markHDParent)
                {
                    // depends on functional tags being present on the children
                    for (int k = 0; k < childCats.Count; k++)
                    {
                        if (childCats[k].Contains("-HD"))
                        {
                            markers.Add("%hdp");
                            break;
                        }
                    }
                }
            }
            // Glue every collected marker onto the original category, in insertion order.
            StringBuilder annotatedCat = new StringBuilder(cat);

            for (int m = 0; m < markers.Count; m++)
            {
                annotatedCat.Append(markers[m]);
            }
            t.SetLabel(new CategoryWordTag(annotatedCat.ToString(), word, tag));
            return(t);
        }
Example #8
0
        /// <summary>
        /// Builds the feature counter describing mention <paramref name="p"/>.
        /// </summary>
        /// <remarks>
        /// Features cover the NER type of the head word, whether the normalized span is a
        /// known named-entity string, the words and tags immediately before and after the
        /// mention, the first and last word of the original span, and whether the mention
        /// contains, or is contained in, another mention from <paramref name="shares"/>.
        /// </remarks>
        /// <param name="p">Mention to describe.</param>
        /// <param name="shares">Mentions compared against <paramref name="p"/> for nesting.</param>
        /// <param name="neStrings">Span strings looked up with the mention's normalized span.</param>
        /// <param name="dict">Not read by this method.</param>
        /// <param name="props">Not read by this method.</param>
        /// <returns>A new counter holding one count per emitted feature.</returns>
        public static ICounter <string> ExtractFeatures(Mention p, ICollection <Mention> shares, ICollection <string> neStrings, Dictionaries dict, Properties props)
        {
            ICounter <string> counts         = new ClassicCounter <string>();
            string            normalizedSpan = p.LowercaseNormalizedSpanString();
            string            headNer        = p.headWord.Ner();
            int               begin          = p.startIndex;
            int               end            = p.endIndex;
            IList <CoreLabel> words          = p.sentenceWords;

            // Neighbours of the mention inside its sentence; null at a sentence boundary.
            CoreLabel before = null;
            if (begin != 0)
            {
                before = words[begin - 1];
            }
            CoreLabel after = null;
            if (end != words.Count)
            {
                after = words[end];
            }
            CoreLabel spanFirst = p.originalSpan[0];
            CoreLabel spanLast  = p.originalSpan[p.originalSpan.Count - 1];

            counts.IncrementCount("B-NETYPE-" + headNer);
            if (neStrings.Contains(normalizedSpan))
            {
                counts.IncrementCount("B-NE-STRING-EXIST");
                // Full span: neither neighbour carries the same NER type as the head word.
                if (!(before != null && before.Ner().Equals(headNer)) && !(after != null && after.Ner().Equals(headNer)))
                {
                    counts.IncrementCount("B-NE-FULLSPAN");
                }
            }
            // Emission order is kept: both word features first, then both POS features.
            if (before != null)
            {
                counts.IncrementCount("B-PRECEDINGWORD-" + before.Word());
            }
            if (after != null)
            {
                counts.IncrementCount("B-FOLLOWINGWORD-" + after.Word());
            }
            if (before != null)
            {
                counts.IncrementCount("B-PRECEDINGPOS-" + before.Tag());
            }
            if (after != null)
            {
                counts.IncrementCount("B-FOLLOWINGPOS-" + after.Tag());
            }
            counts.IncrementCount("B-FIRSTWORD-" + spanFirst.Word());
            counts.IncrementCount("B-FIRSTPOS-" + spanFirst.Tag());
            counts.IncrementCount("B-LASTWORD-" + spanLast.Word());
            // NOTE(review): this key reuses the "B-LASTWORD-" prefix for the tag; it looks like it was
            // meant to mirror "B-FIRSTPOS-". Left unchanged because existing models may depend on it -- confirm.
            counts.IncrementCount("B-LASTWORD-" + spanLast.Tag());

            // Does p contain some other mention?
            foreach (Mention other in shares)
            {
                if (!(other == p) && other.InsideIn(p))
                {
                    counts.IncrementCount("B-BIGGER-THAN-ANOTHER");
                    break;
                }
            }
            // Is p contained in some other mention?
            foreach (Mention other in shares)
            {
                if (!(other == p) && p.InsideIn(other))
                {
                    counts.IncrementCount("B-SMALLER-THAN-ANOTHER");
                    break;
                }
            }
            return(counts);
        }
Example #9
0
 /// <summary>
 /// Writes the sentences and their dependency trees to <paramref name="outFile"/>,
 /// one tab-separated line per token and an empty line after each sentence.
 /// </summary>
 /// <param name="outFile">Path handed to <c>IOUtils.GetPrintWriter</c>.</param>
 /// <param name="sentences">Sentences whose tokens are written.</param>
 /// <param name="trees">Dependency trees, indexed in parallel with <paramref name="sentences"/>.</param>
 /// <exception cref="RuntimeIOException">Wraps any exception raised while writing.</exception>
 public static void WriteConllFile(string outFile, IList <ICoreMap> sentences, IList <DependencyTree> trees)
 {
     try
     {
         PrintWriter output = IOUtils.GetPrintWriter(outFile);
         try
         {
             for (int i = 0; i < sentences.Count; i++)
             {
                 ICoreMap          sentence = sentences[i];
                 DependencyTree    tree     = trees[i];
                 IList <CoreLabel> tokens   = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                 int               size     = tokens.Count;
                 // Token ids in the output are 1-based, and the tree is queried with the same id.
                 for (int j = 1; j <= size; ++j)
                 {
                     CoreLabel token = tokens[j - 1];
                     output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
                 }
                 output.Println();
             }
         }
         finally
         {
             // Release the writer even when a sentence fails part-way through.
             output.Close();
         }
     }
     catch (Exception e)
     {
         throw new RuntimeIOException(e);
     }
 }
Example #10
0
        /// <summary>
        /// Lemmatizes the given label and stores the result on it under annotation
        /// <paramref name="ann"/>.
        /// </summary>
        /// <remarks>
        /// The lemma is computed from the label's <c>Word()</c> and <c>Tag()</c>, so the label
        /// is assumed to have a TextAnnotation and a PartOfSpeechAnnotation.
        /// </remarks>
        /// <param name="label">Label that is read and then updated in place.</param>
        /// <param name="ann">Annotation key under which the lemma is stored.</param>
        public virtual void Stem(CoreLabel label, Type ann)
        {
            label.Set(ann, Lemmatize(label.Word(), label.Tag(), lexer, lexer.Option(1)));
        }
Example #11
0
        public static ICollection <SurfacePattern> GetContext(IList <CoreLabel> sent, int i, ICollection <CandidatePhrase> stopWords)
        {
            ICollection <SurfacePattern> prevpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> nextpatterns     = new HashSet <SurfacePattern>();
            ICollection <SurfacePattern> prevnextpatterns = new HashSet <SurfacePattern>();
            CoreLabel token = sent[i];
            string    tag   = null;

            if (usePOS4Pattern)
            {
                string fulltag = token.Tag();
                if (useCoarsePOS)
                {
                    tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2));
                }
                else
                {
                    tag = fulltag;
                }
            }
            string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));

            for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++)
            {
                IList <Token>  previousTokens   = new List <Token>();
                IList <string> originalPrev     = new List <string>();
                IList <string> originalNext     = new List <string>();
                IList <Token>  nextTokens       = new List <Token>();
                int            numStopWordsprev = 0;
                int            numStopWordsnext = 0;
                // int numPrevTokensSpecial = 0, numNextTokensSpecial = 0;
                int          numNonStopWordsNext = 0;
                int          numNonStopWordsPrev = 0;
                bool         useprev             = false;
                bool         usenext             = false;
                PatternToken twithoutPOS         = null;
                //TODO: right now using numWordsCompoundMax.
                if (addPatWithoutPOS)
                {
                    twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                PatternToken twithPOS = null;
                if (usePOS4Pattern)
                {
                    twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
                }
                if (usePreviousContext)
                {
                    // int j = Math.max(0, i - 1);
                    int j         = i - 1;
                    int numTokens = 0;
                    while (numTokens < maxWin && j >= 0)
                    {
                        // for (int j = Math.max(i - maxWin, 0); j < i; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            j--;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException("how come the class "
                        //                + answerClass.get(label) + " for token "
                        //                + tokenj.word() + " in " + sent + " is not set");
                        //          }
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        if (!isLabeledO)
                        {
                            // numPrevTokensSpecial++;
                            previousTokens.Add(0, strgeneric);
                            // previousTokens.add(0,
                            // "[{answer:"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalPrev.Add(0, strOriginal);
                            numNonStopWordsPrev++;
                        }
                        else
                        {
                            if (tokenj.Word().StartsWith("http"))
                            {
                                useprev = false;
                                previousTokens.Clear();
                                originalPrev.Clear();
                                break;
                            }
                            else
                            {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                previousTokens.Add(0, str);
                                originalPrev.Add(0, tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsprev++;
                                }
                                else
                                {
                                    numNonStopWordsPrev++;
                                }
                            }
                        }
                        numTokens++;
                        j--;
                    }
                }
                if (useNextContext)
                {
                    int numTokens = 0;
                    int j         = i + 1;
                    while (numTokens < maxWin && j < sent.Count)
                    {
                        // for (int j = i + 1; j < sent.size() && j <= i + maxWin; j++) {
                        CoreLabel tokenj = sent[j];
                        string    tokenjStr;
                        if (useLemmaContextTokens)
                        {
                            tokenjStr = tokenj.Lemma();
                        }
                        else
                        {
                            tokenjStr = tokenj.Word();
                        }
                        // do not use this word in context consideration
                        if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                        {
                            j++;
                            continue;
                        }
                        //          if (!tokenj.containsKey(answerClass.get(label))) {
                        //            throw new RuntimeException(
                        //                "how come the dict annotation for token " + tokenj.word()
                        //                    + " in " + sent + " is not set");
                        //          }
                        Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                        bool   isLabeledO  = tr.first;
                        Token  strgeneric  = tr.second;
                        string strOriginal = tr.third;
                        // boolean isLabeledO = tokenj.get(answerClass.get(label))
                        // .equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
                        if (!isLabeledO)
                        {
                            // numNextTokensSpecial++;
                            numNonStopWordsNext++;
                            nextTokens.Add(strgeneric);
                            // nextTokens.add("[{" + label + ":"
                            // + tokenj.get(answerClass.get(label)).toString()
                            // + "}]");
                            originalNext.Add(strOriginal);
                        }
                        else
                        {
                            // originalNextStr += " "
                            // + tokenj.get(answerClass.get(label)).toString();
                            if (tokenj.Word().StartsWith("http"))
                            {
                                usenext = false;
                                nextTokens.Clear();
                                originalNext.Clear();
                                break;
                            }
                            else
                            {
                                // if (!tokenj.word().matches("[.,?()]")) {
                                Token str = SurfacePattern.GetContextToken(tokenj);
                                nextTokens.Add(str);
                                originalNext.Add(tokenjStr);
                                if (DoNotUse(tokenjStr, stopWords))
                                {
                                    numStopWordsnext++;
                                }
                                else
                                {
                                    numNonStopWordsNext++;
                                }
                            }
                        }
                        j++;
                        numTokens++;
                    }
                }
                // String prevContext = null, nextContext = null;
                // int numNonSpecialPrevTokens = previousTokens.size()
                // - numPrevTokensSpecial;
                // int numNonSpecialNextTokens = nextTokens.size() - numNextTokensSpecial;
                Token[] prevContext = null;
                //String[] prevContext = null;
                //String[] prevOriginalArr = null;
                // if (previousTokens.size() >= minWindow4Pattern
                // && (numStopWordsprev < numNonSpecialPrevTokens ||
                // numNonSpecialPrevTokens > numMinStopWordsToAdd)) {
                if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd))
                {
                    // prevContext = StringUtils.join(previousTokens, fw);
                    IList <Token>  prevContextList = new List <Token>();
                    IList <string> prevOriginal    = new List <string>();
                    foreach (Token p in previousTokens)
                    {
                        prevContextList.Add(p);
                        if (!fw.IsEmpty())
                        {
                            prevContextList.Add(fw);
                        }
                    }
                    // add fw and sw to the the originalprev
                    foreach (string p_1 in originalPrev)
                    {
                        prevOriginal.Add(p_1);
                        if (!fw.IsEmpty())
                        {
                            prevOriginal.Add(" FW ");
                        }
                    }
                    if (!sw.IsEmpty())
                    {
                        prevContextList.Add(sw);
                        prevOriginal.Add(" SW ");
                    }
                    // String str = prevContext + fw + sw;
                    if (IsASCII(StringUtils.Join(prevOriginal)))
                    {
                        prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]);
                        //prevOriginalArr = prevOriginal.toArray(new String[0]);
                        if (previousTokens.Count >= minWindow4Pattern)
                        {
                            if (twithoutPOS != null)
                            {
                                SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(pat);
                            }
                            if (twithPOS != null)
                            {
                                SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev);
                                prevpatterns.Add(patPOS);
                            }
                        }
                        useprev = true;
                    }
                }
                Token[] nextContext = null;
                //String [] nextOriginalArr = null;
                // if (nextTokens.size() > 0
                // && (numStopWordsnext < numNonSpecialNextTokens ||
                // numNonSpecialNextTokens > numMinStopWordsToAdd)) {
                if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd))
                {
                    // nextContext = StringUtils.join(nextTokens, fw);
                    IList <Token>  nextContextList = new List <Token>();
                    IList <string> nextOriginal    = new List <string>();
                    if (!sw.IsEmpty())
                    {
                        nextContextList.Add(sw);
                        nextOriginal.Add(" SW ");
                    }
                    foreach (Token n in nextTokens)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextContextList.Add(fw);
                        }
                        nextContextList.Add(n);
                    }
                    foreach (string n_1 in originalNext)
                    {
                        if (!fw.IsEmpty())
                        {
                            nextOriginal.Add(" FW ");
                        }
                        nextOriginal.Add(n_1);
                    }
                    if (nextTokens.Count >= minWindow4Pattern)
                    {
                        nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]);
                        //nextOriginalArr =  nextOriginal.toArray(new String[0]);
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next);
                            nextpatterns.Add(patPOS);
                        }
                    }
                    usenext = true;
                }
                if (useprev && usenext)
                {
                    // String strprev = prevContext + fw + sw;
                    // String strnext = sw + fw + nextContext;
                    if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern)
                    {
                        if (twithoutPOS != null)
                        {
                            SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(pat);
                        }
                        if (twithPOS != null)
                        {
                            SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                            prevnextpatterns.Add(patPOS);
                        }
                    }
                }
            }
            //    Triple<Set<Integer>, Set<Integer>, Set<Integer>> patterns = new Triple<Set<Integer>, Set<Integer>, Set<Integer>>(
            //        prevpatterns, nextpatterns, prevnextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prev patterns are " + prevpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " next patterns are " + nextpatterns);
            // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
            // " prevnext patterns are " + prevnextpatterns);
            //getPatternIndex().finishCommit();
            return(CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns));
        }
Example #12
0
        /// <summary>
        /// Convert token to a sequence of per-character datums and add them to iobList.
        /// </summary>
        /// <remarks>
        /// The first character receives <c>firstLabel</c> (BeginSymbol unless rule #2 changes it),
        /// the last character receives <c>lastLabel</c> (ContinuationSymbol unless a rule changes it),
        /// and every character in between receives ContinuationSymbol or RewriteSymbol.
        /// If the token is empty on entry, or becomes empty after the rewrite rules have run,
        /// nothing is added to iobList.
        /// </remarks>
        /// <param name="iobList">list the created datums are appended to; the end position of its last element is set to <c>cl.EndPosition()</c></param>
        /// <param name="cl">label supplying the RewrittenArabicAnnotation and the begin/end positions; also passed to CreateDatum</param>
        /// <param name="token">the token text that is split into one datum per character</param>
        /// <param name="tokType">token type passed to StripSegmentationMarkers when a rewritten annotation is present</param>
        /// <param name="tokenLabel">label whose Word() and Tag() drive the rewrite rules; its Word() is also used as the original word when origText is null</param>
        /// <param name="lastToken">compared against "ل" by rule #2 (presumably the text of the preceding segment; confirm against the caller)</param>
        /// <param name="applyRewriteRules">whether to apply the Arabic-specific rewrite rules</param>
        /// <param name="stripRewrites">when true, most rules leave the labels untouched instead of setting RewriteSymbol (the ة branch of rule #1 sets it regardless)</param>
        /// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
        /// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
        private static void TokenToDatums(IList <CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory <CoreLabel> tf, string origText
                                          )
        {
            if (token.IsEmpty())
            {
                return;
            }
            string lastLabel        = ContinuationSymbol;
            string firstLabel       = BeginSymbol;
            string rewritten        = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
            // True while the rewritten annotation can be compared character-by-character with token.
            bool   crossRefRewrites = true;

            if (rewritten == null)
            {
                // No annotation: nothing to cross-reference against.
                rewritten        = token;
                crossRefRewrites = false;
            }
            else
            {
                rewritten = StripSegmentationMarkers(rewritten, tokType);
            }
            if (applyRewriteRules)
            {
                // Apply Arabic-specific re-write rules
                string rawToken = tokenLabel.Word();
                string tag      = tokenLabel.Tag();
                MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
                featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
                MorphoFeatures features = featureSpec.StrToFeatures(tag);
                // Rule #1 : ت --> ة
                if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG") && rawToken.EndsWith("ت-") && !stripRewrites)
                {
                    lastLabel = RewriteSymbol;
                }
                else
                {
                    if (rawToken.EndsWith("ة-"))
                    {
                        System.Diagnostics.Debug.Assert(token.EndsWith("ة"));
                        // Replace the final character of the token and mark it as rewritten.
                        token     = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
                        lastLabel = RewriteSymbol;
                    }
                }
                // Rule #2 : لل --> ل ال
                if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
                {
                    if (rawToken.StartsWith("-ال"))
                    {
                        if (!token.StartsWith("ا"))
                        {
                            log.Info("Bad REWAL: " + rawToken + " / " + token);
                        }
                        // Drop the first character from both forms; this can leave token empty.
                        token     = Sharpen.Runtime.Substring(token, 1);
                        rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                        if (!stripRewrites)
                        {
                            firstLabel = RewriteSymbol;
                        }
                    }
                    else
                    {
                        if (rawToken.StartsWith("-ل"))
                        {
                            if (!token.StartsWith("ل"))
                            {
                                log.Info("Bad REWAL: " + rawToken + " / " + token);
                            }
                            if (!stripRewrites)
                            {
                                firstLabel = RewriteSymbol;
                            }
                        }
                        else
                        {
                            log.Info("Ignoring REWAL: " + rawToken + " / " + token);
                        }
                    }
                }
                // Rule #3 : ي --> ى
                // Rule #4 : ا --> ى
                if (rawToken.EndsWith("ى-"))
                {
                    if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
                    {
                        // verb: ى becomes ا
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
                    }
                    else
                    {
                        // assume preposition:
                        token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
                    }
                    if (!stripRewrites)
                    {
                        lastLabel = RewriteSymbol;
                    }
                }
                else
                {
                    if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
                    {
                        if (!stripRewrites)
                        {
                            lastLabel = RewriteSymbol;
                        }
                    }
                }
            }
            string origWord;

            if (origText == null)
            {
                origWord = tokenLabel.Word();
            }
            else
            {
                origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
            }
            int origIndex = 0;

            // Skip leading characters of the original word that IsDeletedCharacter reports as deleted.
            while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
            {
                ++origIndex;
            }
            // Create datums and add to iobList
            if (token.IsEmpty())
            {
                log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
                // Nothing is left to emit; indexing token[0] below would throw on an empty string.
                return;
            }
            string firstChar = token[0].ToString();

            // Start at 0 to make sure we include the whole token according to the tokenizer
            iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
            int numChars = token.Length;

            if (crossRefRewrites && rewritten.Length != numChars)
            {
                // Lengths differ, so a per-character comparison is not possible: report and disable it.
                System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
                crossRefRewrites = false;
            }
            ++origIndex;
            for (int j = 1; j < numChars; ++j, ++origIndex)
            {
                while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
                {
                    ++origIndex;
                }
                // Clamp so the datum offsets never run past the end of the original word.
                if (origIndex >= origWord.Length)
                {
                    origIndex = origWord.Length - 1;
                }
                string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
                string thisChar  = token[j].ToString();
                if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
                {
                    charLabel = RewriteSymbol;
                }
                if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
                {
                    charLabel = RewriteSymbol;
                }
                // Assume all mid-word alef maqsura are supposed to be yah
                iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
            }
            // End at endPosition to make sure we include the whole token according to the tokenizer
            if (!iobList.IsEmpty())
            {
                iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
            }
        }