Пример #1
0
        protected internal virtual GeneralDataset <string, string> CreateDataset(Annotation corpus)
        {
            GeneralDataset <string, string> dataset = new RVFDataset <string, string>();

            foreach (ICoreMap sentence in corpus.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                foreach (RelationMention rel in AnnotationUtils.GetAllRelations(relationMentionFactory, sentence, createUnrelatedRelations))
                {
                    dataset.Add(CreateDatum(rel));
                }
            }
            dataset.ApplyFeatureCountThreshold(featureCountThreshold);
            return(dataset);
        }
 // public void getDecisionTree(Map<String, List<CoreLabel>> sents,
 // List<Pair<String, Integer>> chosen, Counter<String> weights, String
 // wekaOptions) {
 // RVFDataset<String, String> dataset = new RVFDataset<String, String>();
 // for (Pair<String, Integer> d : chosen) {
 // CoreLabel l = sents.get(d.first).get(d.second());
 // String w = l.word();
 // Integer num = this.clusterIds.get(w);
 // if (num == null)
 // num = -1;
 // double wt = weights.getCount("Cluster-" + num);
 // String label;
 // if (l.get(answerClass).toString().equals(answerLabel))
 // label = answerLabel;
 // else
 // label = "O";
 // Counter<String> feat = new ClassicCounter<String>();
 // feat.setCount("DIST", wt);
 // dataset.add(new RVFDatum<String, String>(feat, label));
 // }
 // WekaDatumClassifierFactory wekaFactory = new
 // WekaDatumClassifierFactory("weka.classifiers.trees.J48", wekaOptions);
 // WekaDatumClassifier classifier = wekaFactory.trainClassifier(dataset);
 // Classifier cls = classifier.getClassifier();
 // J48 j48decisiontree = (J48) cls;
 // System.out.println(j48decisiontree.toSummaryString());
 // System.out.println(j48decisiontree.toString());
 //
 // }
 private int Sample(IDictionary <string, DataInstance> sents, Random r, Random rneg, double perSelectNeg, double perSelectRand, int numrand, IList <Pair <string, int> > chosen, RVFDataset <string, string> dataset)
 {
     foreach (KeyValuePair <string, DataInstance> en in sents)
     {
         CoreLabel[] sent = Sharpen.Collections.ToArray(en.Value.GetTokens(), new CoreLabel[0]);
         for (int i = 0; i < sent.Length; i++)
         {
             CoreLabel l          = sent[i];
             bool      chooseThis = false;
             if (l.Get(answerClass).Equals(answerLabel))
             {
                 chooseThis = true;
             }
             else
             {
                 if ((!l.Get(answerClass).Equals("O") || negativeWords.Contains(l.Word().ToLower())) && GetRandomBoolean(r, perSelectNeg))
                 {
                     chooseThis = true;
                 }
                 else
                 {
                     if (GetRandomBoolean(r, perSelectRand))
                     {
                         numrand++;
                         chooseThis = true;
                     }
                     else
                     {
                         chooseThis = false;
                     }
                 }
             }
             if (chooseThis)
             {
                 chosen.Add(new Pair(en.Key, i));
                 RVFDatum <string, string> d = GetDatum(sent, i);
                 dataset.Add(d, en.Key, int.ToString(i));
             }
         }
     }
     return(numrand);
 }
        //goldList null if not training
        public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList <XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
        {
            Annotation doc = sd.doc;

            sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
            IList <ICoreMap>  quotes    = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
            IList <ICoreMap>  sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList <CoreLabel> tokens    = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IDictionary <int, IList <ICoreMap> > paragraphToQuotes = GetQuotesInParagraph(doc);
            GeneralDataset <string, string>      dataset           = new RVFDataset <string, string>();
            //necessary for 'ScoreBestMention'
            IDictionary <int, Pair <int, int> > mapQuoteToDataRange = new Dictionary <int, Pair <int, int> >();
            //maps quote to corresponding indices in the dataset
            IDictionary <int, Sieve.MentionData> mapDatumToMention = new Dictionary <int, Sieve.MentionData>();

            if (isTraining && goldList.Count != quotes.Count)
            {
                throw new Exception("Gold Quote List size doesn't match quote list size!");
            }
            for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
            {
                int      initialSize = dataset.Size();
                ICoreMap quote       = quotes[quoteIdx];
                XMLToAnnotation.GoldQuoteInfo gold = null;
                if (isTraining)
                {
                    gold = goldList[quoteIdx];
                    if (gold.speaker == string.Empty)
                    {
                        continue;
                    }
                }
                ICoreMap        quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
                Pair <int, int> quoteRun           = new Pair <int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                //      int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class);
                int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));
                //add mentions before quote up to the previous paragraph
                int rightValue = quoteRun.first - 1;
                int leftValue  = quoteRun.first - 1;
                //move left value to be the first token idx of the previous paragraph
                for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
                {
                    ICoreMap sentence = sentences[sentIdx];
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        continue;
                    }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                    {
                        //quoteParagraphIdx - 1 for this and prev
                        leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInPreviousParagraph = new List <Sieve.MentionData>();
                if (leftValue > -1 && rightValue > -1)
                {
                    mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair <int, int>(leftValue, rightValue)));
                }
                //mentions in next paragraph
                leftValue  = quoteRun.second + 1;
                rightValue = quoteRun.second + 1;
                for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
                {
                    ICoreMap sentence = sentences[sentIdx_1];
                    //        if(sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == quoteParagraphIdx) {
                    //          continue;
                    //        }
                    if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
                    {
                        //quoteParagraphIdx + 1
                        rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
                    }
                    else
                    {
                        break;
                    }
                }
                IList <Sieve.MentionData> mentionsInNextParagraph = new List <Sieve.MentionData>();
                if (leftValue < tokens.Count && rightValue < tokens.Count)
                {
                    mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair <int, int>(leftValue, rightValue));
                }
                IList <Sieve.MentionData> candidateMentions = new List <Sieve.MentionData>();
                Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
                Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);
                //      System.out.println(candidateMentions.size());
                int rankedDistance = 1;
                int numBackwards   = mentionsInPreviousParagraph.Count;
                foreach (Sieve.MentionData mention in candidateMentions)
                {
                    IList <CoreLabel> mentionCandidateTokens   = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
                    ICoreMap          mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
                    //        if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) {
                    //          continue;
                    //        }
                    ICounter <string> features = new ClassicCounter <string>();
                    bool isLeft   = true;
                    int  distance = quoteRun.first - mention.end;
                    if (distance < 0)
                    {
                        isLeft   = false;
                        distance = mention.begin - quoteRun.second;
                    }
                    if (distance < 0)
                    {
                        continue;
                    }
                    //disregard mention-in-quote cases.
                    features.SetCount("wordDistance", distance);
                    IList <CoreLabel> betweenTokens;
                    if (isLeft)
                    {
                        betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
                    }
                    else
                    {
                        betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
                    }
                    //Punctuation in between
                    foreach (CoreLabel token in betweenTokens)
                    {
                        if (punctuation.Contains(token.Word()))
                        {
                            features.SetCount("punctuationPresence:" + token.Word(), 1);
                        }
                    }
                    // number of mentions away
                    features.SetCount("rankedDistance", rankedDistance);
                    rankedDistance++;
                    if (rankedDistance == numBackwards)
                    {
                        //reset for the forward
                        rankedDistance = 1;
                    }
                    //        int quoteParagraphIdx = quoteFirstSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
                    //third distance: # of paragraphs away
                    int      mentionParagraphIdx        = -1;
                    ICoreMap sentenceInMentionParagraph = null;
                    int      quoteParagraphBeginToken   = GetParagraphBeginToken(quoteFirstSentence, sentences);
                    int      quoteParagraphEndToken     = GetParagraphEndToken(quoteFirstSentence, sentences);
                    if (isLeft)
                    {
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("leftParagraphDistance", 0);
                            mentionParagraphIdx        = quoteParagraphIdx;
                            sentenceInMentionParagraph = quoteFirstSentence;
                        }
                        else
                        {
                            int      paragraphDistance = 1;
                            int      currParagraphIdx  = quoteParagraphIdx - paragraphDistance;
                            ICoreMap currSentence      = quoteFirstSentence;
                            int      currSentenceIdx   = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currParagraphIdx >= 0)
                            {
                                //              Paragraph prevParagraph = paragraphs.get(prevParagraphIndex);
                                //extract begin and end tokens of
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                                {
                                    currSentenceIdx--;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                                {
                                    mentionParagraphIdx        = currParagraphIdx;
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("leftParagraphDistance", paragraphDistance);
                                    if (paragraphDistance % 2 == 0)
                                    {
                                        features.SetCount("leftParagraphDistanceEven", 1);
                                    }
                                    break;
                                }
                                paragraphDistance++;
                                currParagraphIdx--;
                            }
                        }
                    }
                    else
                    {
                        //right
                        if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                        {
                            features.SetCount("rightParagraphDistance", 0);
                            sentenceInMentionParagraph = quoteFirstSentence;
                            mentionParagraphIdx        = quoteParagraphIdx;
                        }
                        else
                        {
                            int      paragraphDistance  = 1;
                            int      nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                            ICoreMap currSentence       = quoteFirstSentence;
                            int      currSentenceIdx    = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                            while (currSentenceIdx < sentences.Count)
                            {
                                while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                                {
                                    currSentenceIdx++;
                                    currSentence = sentences[currSentenceIdx];
                                }
                                int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                                int nextParagraphEnd   = GetParagraphEndToken(currSentence, sentences);
                                if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                                {
                                    sentenceInMentionParagraph = currSentence;
                                    features.SetCount("rightParagraphDistance", paragraphDistance);
                                    break;
                                }
                                paragraphDistance++;
                                nextParagraphIndex++;
                            }
                        }
                    }
                    //2. mention features
                    if (sentenceInMentionParagraph != null)
                    {
                        int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                        int mentionParagraphEnd   = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                        if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                        {
                            IList <ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List <ICoreMap>());
                            Pair <List <string>, List <Pair <int, int> > > namesInMentionParagraph = sieve.ScanForNames(new Pair <int, int>(mentionParagraphBegin, mentionParagraphEnd));
                            features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                            features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                            features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                            //mention ordering in paragraph it is in
                            for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                                {
                                    features.SetCount("orderInParagraph", i);
                                }
                            }
                            //if mention paragraph is all one quote
                            if (quotesInMentionParagraph.Count == 1)
                            {
                                ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                                if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                                {
                                    features.SetCount("mentionParagraphIsInConversation", 1);
                                }
                                else
                                {
                                    features.SetCount("mentionParagraphIsInConversation", -1);
                                }
                            }
                            foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                            {
                                if (ExtractQuotesUtil.RangeContains(new Pair <int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair <int, int>(mention.begin, mention.end)))
                                {
                                    features.SetCount("mentionInQuote", 1);
                                }
                            }
                            if (features.GetCount("mentionInQuote") != 1)
                            {
                                features.SetCount("mentionNotInQuote", 1);
                            }
                        }
                    }
                    // nearby word syntax types...make sure to check if there are previous or next words
                    // or there will be an array index crash
                    if (mention.begin > 0)
                    {
                        CoreLabel prevWord = tokens[mention.begin - 1];
                        features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(prevWord.Lemma()))
                        {
                            features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                        }
                    }
                    if (mention.end + 1 < tokens.Count)
                    {
                        CoreLabel nextWord = tokens[mention.end + 1];
                        features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                        if (punctuationForFeatures.Contains(nextWord.Lemma()))
                        {
                            features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                        }
                    }
                    //                    features.setCount("prevAndNext:" + prevWord.tag()+ ";" + nextWord.tag(), 1);
                    //quote paragraph features
                    IList <ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
                    features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
                    features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
                    features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair <int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);
                    //quote features
                    features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
                    for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
                    {
                        if (quotesInQuoteParagraph[i_1].Equals(quote))
                        {
                            features.SetCount("quotePosition", i_1 + 1);
                        }
                    }
                    if (features.GetCount("quotePosition") == 0)
                    {
                        throw new Exception("Check this (equality not working)");
                    }
                    Pair <List <string>, List <Pair <int, int> > > namesData = sieve.ScanForNames(quoteRun);
                    foreach (string name in namesData.first)
                    {
                        features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
                    }
                    //if quote encompasses entire paragraph
                    if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
                    {
                        features.SetCount("isImplicitSpeaker", 1);
                    }
                    else
                    {
                        features.SetCount("isImplicitSpeaker", -1);
                    }
                    //Vocative detection
                    if (mention.type.Equals("name"))
                    {
                        IList <Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair <int, int>(mention.begin, mention.end))];
                        Person         p     = null;
                        if (pList != null)
                        {
                            p = pList[0];
                        }
                        else
                        {
                            Pair <List <string>, List <Pair <int, int> > > scanForNamesResultPair = sieve.ScanForNames(new Pair <int, int>(mention.begin, mention.end));
                            if (scanForNamesResultPair.first.Count != 0)
                            {
                                string scanForNamesResultString = scanForNamesResultPair.first[0];
                                if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                                {
                                    p = sd.characterMap[scanForNamesResultString][0];
                                }
                            }
                        }
                        if (p != null)
                        {
                            foreach (string name_1 in namesData.first)
                            {
                                if (p.aliases.Contains(name_1))
                                {
                                    features.SetCount("nameInQuote", 1);
                                }
                            }
                            if (quoteParagraphIdx > 0)
                            {
                                //            Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1);
                                IList <ICoreMap>         quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List <ICoreMap>());
                                IList <Pair <int, int> > exclusionList         = new List <Pair <int, int> >();
                                foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                                {
                                    Pair <int, int> quoteRange = new Pair <int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                                    exclusionList.Add(quoteRange);
                                    foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphQuote", 1);
                                        }
                                    }
                                }
                                int      sentenceIdx             = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                                ICoreMap sentenceInPrevParagraph = null;
                                for (int i = sentenceIdx - 1; i_1 >= 0; i_1--)
                                {
                                    ICoreMap currSentence = sentences[i_1];
                                    if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                                    {
                                        sentenceInPrevParagraph = currSentence;
                                        break;
                                    }
                                }
                                int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                                int prevParagraphEnd   = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                                IList <Pair <int, int> > prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair <int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                                foreach (Pair <int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                                {
                                    foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                                    {
                                        if (p.aliases.Contains(name_2))
                                        {
                                            features.SetCount("nameInPrevParagraphNonQuote", 1);
                                        }
                                    }
                                }
                            }
                        }
                    }
                    if (isTraining)
                    {
                        if (QuoteAttributionUtils.RangeContains(new Pair <int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair <int, int>(mention.begin, mention.end)))
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            mapDatumToMention[dataset.Size()] = mention;
                            dataset.Add(datum);
                        }
                        else
                        {
                            RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "isNotMention");
                            datum.SetID(int.ToString(dataset.Size()));
                            dataset.Add(datum);
                            mapDatumToMention[dataset.Size()] = mention;
                        }
                    }
                    else
                    {
                        RVFDatum <string, string> datum = new RVFDatum <string, string>(features, "none");
                        datum.SetID(int.ToString(dataset.Size()));
                        mapDatumToMention[dataset.Size()] = mention;
                        dataset.Add(datum);
                    }
                }
                mapQuoteToDataRange[quoteIdx] = new Pair <int, int>(initialSize, dataset.Size() - 1);
            }
            return(new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset));
        }