/// <summary>
/// Set the tags of the original tokens and the leaves if they aren't already set.
/// </summary>
/// <param name="sentence">Sentence whose TokensAnnotation list supplies the tokens to tag.</param>
/// <param name="tree">Parse tree whose tagged yield provides the tags, position-aligned with the tokens.</param>
private static void SetMissingTags(ICoreMap sentence, Tree tree)
{
    IList<TaggedWord> taggedWords = null;
    IList<ILabel> leaves = null;
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    // BUG FIX: the loop bound referenced an undefined identifier 'size';
    // iterate over the token list itself.
    for (int i = 0; i < tokens.Count; ++i)
    {
        CoreLabel token = tokens[i];
        if (token.Tag() == null)
        {
            // Lazily materialize the tagged yield / leaves only when at least one tag is missing.
            if (taggedWords == null)
            {
                taggedWords = tree.TaggedYield();
            }
            if (leaves == null)
            {
                leaves = tree.Yield();
            }
            token.SetTag(taggedWords[i].Tag());
            // Mirror the tag onto the corresponding tree leaf when it can carry one.
            ILabel leaf = leaves[i];
            if (leaf is IHasTag)
            {
                ((IHasTag)leaf).SetTag(taggedWords[i].Tag());
            }
        }
    }
}
/// <summary>
/// Builds an RVF datum for token i of the sentence: the label is <c>answerLabel</c> when the
/// token's answer-class annotation matches it (else "O"), and the features are cluster ids of
/// the token's matched phrases plus words/lemmas/tags in a +/- <c>window</c> token context
/// (window is currently 0, so the context loops contribute nothing).
/// </summary>
private RVFDatum<string, string> GetDatum(CoreLabel[] sent, int i)
{
    ICounter<string> feat = new ClassicCounter<string>();
    CoreLabel l = sent[i];
    string label;
    if (l.Get(answerClass).ToString().Equals(answerLabel))
    {
        label = answerLabel;
    }
    else
    {
        label = "O";
    }
    CollectionValuedMap<string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
    if (matchedPhrases == null)
    {
        // No phrases recorded: fall back to the token's own word as the only phrase.
        matchedPhrases = new CollectionValuedMap<string, CandidatePhrase>();
        matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
    }
    foreach (CandidatePhrase w in matchedPhrases.AllValues())
    {
        // BUG FIX: the original compared a non-nullable 'int' to null (always false) and used the
        // indexer, which throws on a missing key. The Java original used Integer and defaulted to
        // -1 for phrases without a cluster; TryGetValue restores that behavior.
        int num;
        if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
        {
            num = -1;
        }
        feat.SetCount("Cluster-" + num, 1.0);
    }
    // feat.incrementCount("WORD-" + l.word());
    // feat.incrementCount("LEMMA-" + l.lemma());
    // feat.incrementCount("TAG-" + l.tag());
    int window = 0;
    // Context features before the token (no-op while window == 0).
    for (int j = Math.Max(0, i - window); j < i; j++)
    {
        CoreLabel lj = sent[j];
        feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
        feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
    }
    // Context features after the token (no-op while window == 0).
    for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
    {
        CoreLabel lj = sent[j_1];
        feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
        feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
    }
    // System.out.println("adding " + l.word() + " as " + label);
    return new RVFDatum<string, string>(feat, label);
}
/// <summary>
/// Checks whether a token passes the NER and POS-tag restrictions configured for a label.
/// When useTargetNERRestriction is on, the token's NER tag must match one of the label's
/// allowed NER patterns; otherwise the NER check is skipped. If that passes and the label has
/// allowed tag initials configured, the token's POS tag must start with one of them.
/// </summary>
private bool MatchedRestriction(CoreLabel coreLabel, string label)
{
    bool use = false;
    if (PatternFactory.useTargetNERRestriction)
    {
        foreach (string s in constVars.allowedNERsforLabels[label])
        {
            if (coreLabel.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Matches(s))
            {
                use = true;
                break;
            }
        }
    }
    else
    {
        //System.out.println("not matching NER");
        use = true;
    }
    if (use)
    {
        string tag = coreLabel.Tag();
        if (constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.Contains(label))
        {
            foreach (string allowed in constVars.allowedTagsInitials[label])
            {
                if (tag.StartsWith(allowed))
                {
                    use = true;
                    break;
                }
                // NOTE: the reset sits at the loop tail (not the head), so after a full pass with
                // no matching prefix 'use' ends up false; on a match the break above preserves true.
                use = false;
            }
        }
    }
    if (constVars.debug >= 4)
    {
        if (use)
        {
            System.Console.Out.WriteLine(coreLabel.Word() + " matched restriction " + (PatternFactory.useTargetNERRestriction ? constVars.allowedNERsforLabels[label] : string.Empty) + "and" + PatternFactory.useTargetNERRestriction + " and " + (constVars
                .allowedTagsInitials != null ? constVars.allowedTagsInitials[label] : string.Empty));
        }
        else
        {
            // NOTE(review): "did not matched restrict" is ungrammatical but is runtime output;
            // left untouched here — confirm with upstream before changing the message text.
            System.Console.Out.WriteLine(coreLabel.Word() + " did not matched restrict " + (PatternFactory.useTargetNERRestriction ? constVars.allowedNERsforLabels[label] : string.Empty) + "and" + PatternFactory.useTargetNERRestriction + " and " + (constVars
                .allowedTagsInitials != null ? constVars.allowedTagsInitials[label] : string.Empty));
        }
    }
    return (use);
}
/// <summary>
/// Builds the feature dataset for supervised quote-attribution sieve training/prediction.
/// For every quote, candidate speaker mentions are gathered from the previous and next
/// paragraphs, each mention is turned into a feature vector (distance, punctuation,
/// paragraph-structure, vocative and quote-level features), and the resulting datums are
/// collected into a dataset along with datum-index→mention and quote→datum-range maps.
/// </summary>
/// <param name="sd">Bundled document, character map, pronoun-coref map and animacy list.</param>
/// <param name="goldList">Gold speaker info per quote; null/ignored unless isTraining.</param>
/// <param name="isTraining">When true, datums are labeled isMention/isNotMention from gold; otherwise "none".</param>
public static SupervisedSieveTraining.FeaturesData Featurize(SupervisedSieveTraining.SieveData sd, IList<XMLToAnnotation.GoldQuoteInfo> goldList, bool isTraining)
{
    Annotation doc = sd.doc;
    sieve = new Sieve(doc, sd.characterMap, sd.pronounCorefMap, sd.animacyList);
    IList<ICoreMap> quotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IDictionary<int, IList<ICoreMap>> paragraphToQuotes = GetQuotesInParagraph(doc);
    GeneralDataset<string, string> dataset = new RVFDataset<string, string>();
    //necessary for 'ScoreBestMention'
    IDictionary<int, Pair<int, int>> mapQuoteToDataRange = new Dictionary<int, Pair<int, int>>();
    //maps quote to corresponding indices in the dataset
    IDictionary<int, Sieve.MentionData> mapDatumToMention = new Dictionary<int, Sieve.MentionData>();
    if (isTraining && goldList.Count != quotes.Count)
    {
        throw new Exception("Gold Quote List size doesn't match quote list size!");
    }
    for (int quoteIdx = 0; quoteIdx < quotes.Count; quoteIdx++)
    {
        int initialSize = dataset.Size();
        ICoreMap quote = quotes[quoteIdx];
        XMLToAnnotation.GoldQuoteInfo gold = null;
        if (isTraining)
        {
            gold = goldList[quoteIdx];
            if (gold.speaker == string.Empty)
            {
                continue;
            }
        }
        ICoreMap quoteFirstSentence = sentences[quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation))];
        Pair<int, int> quoteRun = new Pair<int, int>(quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
        // int quoteChapter = quoteFirstSentence.get(ChapterAnnotator.ChapterAnnotation.class);
        int quoteParagraphIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation));
        //add mentions before quote up to the previous paragraph
        int rightValue = quoteRun.first - 1;
        int leftValue = quoteRun.first - 1;
        //move left value to be the first token idx of the previous paragraph
        for (int sentIdx = quote.Get(typeof(CoreAnnotations.SentenceBeginAnnotation)); sentIdx >= 0; sentIdx--)
        {
            ICoreMap sentence = sentences[sentIdx];
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
            {
                continue;
            }
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
            {
                //quoteParagraphIdx - 1 for this and prev
                leftValue = sentence.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
            }
            else
            {
                break;
            }
        }
        IList<Sieve.MentionData> mentionsInPreviousParagraph = new List<Sieve.MentionData>();
        if (leftValue > -1 && rightValue > -1)
        {
            mentionsInPreviousParagraph = EliminateDuplicates(sieve.FindClosestMentionsInSpanBackward(new Pair<int, int>(leftValue, rightValue)));
        }
        //mentions in next paragraph
        leftValue = quoteRun.second + 1;
        rightValue = quoteRun.second + 1;
        for (int sentIdx_1 = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation)); sentIdx_1 < sentences.Count; sentIdx_1++)
        {
            ICoreMap sentence = sentences[sentIdx_1];
            if (sentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx)
            {
                //quoteParagraphIdx + 1
                rightValue = sentence.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1;
            }
            else
            {
                break;
            }
        }
        IList<Sieve.MentionData> mentionsInNextParagraph = new List<Sieve.MentionData>();
        if (leftValue < tokens.Count && rightValue < tokens.Count)
        {
            mentionsInNextParagraph = sieve.FindClosestMentionsInSpanForward(new Pair<int, int>(leftValue, rightValue));
        }
        IList<Sieve.MentionData> candidateMentions = new List<Sieve.MentionData>();
        Sharpen.Collections.AddAll(candidateMentions, mentionsInPreviousParagraph);
        Sharpen.Collections.AddAll(candidateMentions, mentionsInNextParagraph);
        int rankedDistance = 1;
        int numBackwards = mentionsInPreviousParagraph.Count;
        foreach (Sieve.MentionData mention in candidateMentions)
        {
            IList<CoreLabel> mentionCandidateTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation)).SubList(mention.begin, mention.end + 1);
            ICoreMap mentionCandidateSentence = sentences[mentionCandidateTokens[0].SentIndex()];
            // if (mentionCandidateSentence.get(ChapterAnnotator.ChapterAnnotation.class) != quoteChapter) {
            //   continue;
            // }
            ICounter<string> features = new ClassicCounter<string>();
            //1. distance features
            bool isLeft = true;
            int distance = quoteRun.first - mention.end;
            if (distance < 0)
            {
                isLeft = false;
                distance = mention.begin - quoteRun.second;
            }
            if (distance < 0)
            {
                //disregard mention-in-quote cases.
                continue;
            }
            features.SetCount("wordDistance", distance);
            IList<CoreLabel> betweenTokens;
            if (isLeft)
            {
                betweenTokens = tokens.SubList(mention.end + 1, quoteRun.first);
            }
            else
            {
                betweenTokens = tokens.SubList(quoteRun.second + 1, mention.begin);
            }
            //Punctuation in between
            foreach (CoreLabel token in betweenTokens)
            {
                if (punctuation.Contains(token.Word()))
                {
                    features.SetCount("punctuationPresence:" + token.Word(), 1);
                }
            }
            // number of mentions away
            features.SetCount("rankedDistance", rankedDistance);
            rankedDistance++;
            if (rankedDistance == numBackwards)
            {
                //reset for the forward mentions
                rankedDistance = 1;
            }
            //third distance: # of paragraphs away
            int mentionParagraphIdx = -1;
            ICoreMap sentenceInMentionParagraph = null;
            int quoteParagraphBeginToken = GetParagraphBeginToken(quoteFirstSentence, sentences);
            int quoteParagraphEndToken = GetParagraphEndToken(quoteFirstSentence, sentences);
            if (isLeft)
            {
                if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                {
                    features.SetCount("leftParagraphDistance", 0);
                    mentionParagraphIdx = quoteParagraphIdx;
                    sentenceInMentionParagraph = quoteFirstSentence;
                }
                else
                {
                    // Walk backwards a paragraph at a time until the paragraph containing the mention is found.
                    int paragraphDistance = 1;
                    int currParagraphIdx = quoteParagraphIdx - paragraphDistance;
                    ICoreMap currSentence = quoteFirstSentence;
                    int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                    while (currParagraphIdx >= 0)
                    {
                        while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != currParagraphIdx)
                        {
                            currSentenceIdx--;
                            currSentence = sentences[currSentenceIdx];
                        }
                        int prevParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                        int prevParagraphEnd = GetParagraphEndToken(currSentence, sentences);
                        if (prevParagraphBegin <= mention.begin && mention.end <= prevParagraphEnd)
                        {
                            mentionParagraphIdx = currParagraphIdx;
                            sentenceInMentionParagraph = currSentence;
                            features.SetCount("leftParagraphDistance", paragraphDistance);
                            if (paragraphDistance % 2 == 0)
                            {
                                features.SetCount("leftParagraphDistanceEven", 1);
                            }
                            break;
                        }
                        paragraphDistance++;
                        currParagraphIdx--;
                    }
                }
            }
            else
            {
                //right
                if (quoteParagraphBeginToken <= mention.begin && mention.end <= quoteParagraphEndToken)
                {
                    features.SetCount("rightParagraphDistance", 0);
                    sentenceInMentionParagraph = quoteFirstSentence;
                    mentionParagraphIdx = quoteParagraphIdx;
                }
                else
                {
                    // Walk forwards a paragraph at a time until the paragraph containing the mention is found.
                    int paragraphDistance = 1;
                    int nextParagraphIndex = quoteParagraphIdx + paragraphDistance;
                    ICoreMap currSentence = quoteFirstSentence;
                    int currSentenceIdx = currSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                    while (currSentenceIdx < sentences.Count)
                    {
                        while (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) != nextParagraphIndex)
                        {
                            currSentenceIdx++;
                            currSentence = sentences[currSentenceIdx];
                        }
                        int nextParagraphBegin = GetParagraphBeginToken(currSentence, sentences);
                        int nextParagraphEnd = GetParagraphEndToken(currSentence, sentences);
                        if (nextParagraphBegin <= mention.begin && mention.end <= nextParagraphEnd)
                        {
                            sentenceInMentionParagraph = currSentence;
                            features.SetCount("rightParagraphDistance", paragraphDistance);
                            break;
                        }
                        paragraphDistance++;
                        nextParagraphIndex++;
                    }
                }
            }
            //2. mention features
            if (sentenceInMentionParagraph != null)
            {
                int mentionParagraphBegin = GetParagraphBeginToken(sentenceInMentionParagraph, sentences);
                int mentionParagraphEnd = GetParagraphEndToken(sentenceInMentionParagraph, sentences);
                if (!(mentionParagraphBegin == quoteParagraphBeginToken && mentionParagraphEnd == quoteParagraphEndToken))
                {
                    IList<ICoreMap> quotesInMentionParagraph = paragraphToQuotes.GetOrDefault(mentionParagraphIdx, new List<ICoreMap>());
                    Pair<List<string>, List<Pair<int, int>>> namesInMentionParagraph = sieve.ScanForNames(new Pair<int, int>(mentionParagraphBegin, mentionParagraphEnd));
                    features.SetCount("quotesInMentionParagraph", quotesInMentionParagraph.Count);
                    features.SetCount("wordsInMentionParagraph", mentionParagraphEnd - mentionParagraphBegin + 1);
                    features.SetCount("namesInMentionParagraph", namesInMentionParagraph.first.Count);
                    //mention ordering in paragraph it is in
                    for (int i = 0; i < namesInMentionParagraph.second.Count; i++)
                    {
                        if (ExtractQuotesUtil.RangeContains(new Pair<int, int>(mention.begin, mention.end), namesInMentionParagraph.second[i]))
                        {
                            features.SetCount("orderInParagraph", i);
                        }
                    }
                    //if mention paragraph is all one quote
                    if (quotesInMentionParagraph.Count == 1)
                    {
                        ICoreMap qInMentionParagraph = quotesInMentionParagraph[0];
                        if (qInMentionParagraph.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == mentionParagraphBegin && qInMentionParagraph.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1 == mentionParagraphEnd)
                        {
                            features.SetCount("mentionParagraphIsInConversation", 1);
                        }
                        else
                        {
                            features.SetCount("mentionParagraphIsInConversation", -1);
                        }
                    }
                    foreach (ICoreMap quoteIMP in quotesInMentionParagraph)
                    {
                        if (ExtractQuotesUtil.RangeContains(new Pair<int, int>(quoteIMP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIMP.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - 1), new Pair<int, int>(mention.begin, mention.end)))
                        {
                            features.SetCount("mentionInQuote", 1);
                        }
                    }
                    if (features.GetCount("mentionInQuote") != 1)
                    {
                        features.SetCount("mentionNotInQuote", 1);
                    }
                }
            }
            // nearby word syntax types...make sure to check if there are previous or next words
            // or there will be an array index crash
            if (mention.begin > 0)
            {
                CoreLabel prevWord = tokens[mention.begin - 1];
                features.SetCount("prevWordType:" + prevWord.Tag(), 1);
                if (punctuationForFeatures.Contains(prevWord.Lemma()))
                {
                    features.SetCount("prevWordPunct:" + prevWord.Lemma(), 1);
                }
            }
            if (mention.end + 1 < tokens.Count)
            {
                CoreLabel nextWord = tokens[mention.end + 1];
                features.SetCount("nextWordType:" + nextWord.Tag(), 1);
                if (punctuationForFeatures.Contains(nextWord.Lemma()))
                {
                    features.SetCount("nextWordPunct:" + nextWord.Lemma(), 1);
                }
            }
            //quote paragraph features
            IList<ICoreMap> quotesInQuoteParagraph = paragraphToQuotes[quoteParagraphIdx];
            features.SetCount("QuotesInQuoteParagraph", quotesInQuoteParagraph.Count);
            features.SetCount("WordsInQuoteParagraph", quoteParagraphEndToken - quoteParagraphBeginToken + 1);
            features.SetCount("NamesInQuoteParagraph", sieve.ScanForNames(new Pair<int, int>(quoteParagraphBeginToken, quoteParagraphEndToken)).first.Count);
            //quote features
            features.SetCount("quoteLength", quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) - quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) + 1);
            for (int i_1 = 0; i_1 < quotesInQuoteParagraph.Count; i_1++)
            {
                if (quotesInQuoteParagraph[i_1].Equals(quote))
                {
                    features.SetCount("quotePosition", i_1 + 1);
                }
            }
            if (features.GetCount("quotePosition") == 0)
            {
                throw new Exception("Check this (equality not working)");
            }
            Pair<List<string>, List<Pair<int, int>>> namesData = sieve.ScanForNames(quoteRun);
            foreach (string name in namesData.first)
            {
                features.SetCount("charactersInQuote:" + sd.characterMap[name][0].name, 1);
            }
            //if quote encompasses entire paragraph
            if (quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation)) == quoteParagraphBeginToken && quote.Get(typeof(CoreAnnotations.TokenEndAnnotation)) == quoteParagraphEndToken)
            {
                features.SetCount("isImplicitSpeaker", 1);
            }
            else
            {
                features.SetCount("isImplicitSpeaker", -1);
            }
            //Vocative detection
            if (mention.type.Equals("name"))
            {
                IList<Person> pList = sd.characterMap[sieve.TokenRangeToString(new Pair<int, int>(mention.begin, mention.end))];
                Person p = null;
                if (pList != null)
                {
                    p = pList[0];
                }
                else
                {
                    Pair<List<string>, List<Pair<int, int>>> scanForNamesResultPair = sieve.ScanForNames(new Pair<int, int>(mention.begin, mention.end));
                    if (scanForNamesResultPair.first.Count != 0)
                    {
                        string scanForNamesResultString = scanForNamesResultPair.first[0];
                        if (scanForNamesResultString != null && sd.characterMap.Contains(scanForNamesResultString))
                        {
                            p = sd.characterMap[scanForNamesResultString][0];
                        }
                    }
                }
                if (p != null)
                {
                    foreach (string name_1 in namesData.first)
                    {
                        if (p.aliases.Contains(name_1))
                        {
                            features.SetCount("nameInQuote", 1);
                        }
                    }
                    if (quoteParagraphIdx > 0)
                    {
                        // Paragraph prevParagraph = paragraphs.get(ex.paragraph_idx - 1);
                        IList<ICoreMap> quotesInPrevParagraph = paragraphToQuotes.GetOrDefault(quoteParagraphIdx - 1, new List<ICoreMap>());
                        IList<Pair<int, int>> exclusionList = new List<Pair<int, int>>();
                        foreach (ICoreMap quoteIPP in quotesInPrevParagraph)
                        {
                            Pair<int, int> quoteRange = new Pair<int, int>(quoteIPP.Get(typeof(CoreAnnotations.TokenBeginAnnotation)), quoteIPP.Get(typeof(CoreAnnotations.TokenEndAnnotation)));
                            exclusionList.Add(quoteRange);
                            foreach (string name_2 in sieve.ScanForNames(quoteRange).first)
                            {
                                if (p.aliases.Contains(name_2))
                                {
                                    features.SetCount("nameInPrevParagraphQuote", 1);
                                }
                            }
                        }
                        int sentenceIdx = quoteFirstSentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation));
                        ICoreMap sentenceInPrevParagraph = null;
                        // BUG FIX: this loop previously read 'for (int i = sentenceIdx - 1; i_1 >= 0; i_1--)'
                        // and indexed sentences[i_1] — an out-of-scope variable from the quotePosition loop.
                        // Restored to the intended backward scan over sentence index i.
                        for (int i = sentenceIdx - 1; i >= 0; i--)
                        {
                            ICoreMap currSentence = sentences[i];
                            if (currSentence.Get(typeof(CoreAnnotations.ParagraphIndexAnnotation)) == quoteParagraphIdx - 1)
                            {
                                sentenceInPrevParagraph = currSentence;
                                break;
                            }
                        }
                        int prevParagraphBegin = GetParagraphBeginToken(sentenceInPrevParagraph, sentences);
                        int prevParagraphEnd = GetParagraphEndToken(sentenceInPrevParagraph, sentences);
                        IList<Pair<int, int>> prevParagraphNonQuoteRuns = GetRangeExclusion(new Pair<int, int>(prevParagraphBegin, prevParagraphEnd), exclusionList);
                        foreach (Pair<int, int> nonQuoteRange in prevParagraphNonQuoteRuns)
                        {
                            foreach (string name_2 in sieve.ScanForNames(nonQuoteRange).first)
                            {
                                if (p.aliases.Contains(name_2))
                                {
                                    features.SetCount("nameInPrevParagraphNonQuote", 1);
                                }
                            }
                        }
                    }
                }
            }
            if (isTraining)
            {
                if (QuoteAttributionUtils.RangeContains(new Pair<int, int>(gold.mentionStartTokenIndex, gold.mentionEndTokenIndex), new Pair<int, int>(mention.begin, mention.end)))
                {
                    RVFDatum<string, string> datum = new RVFDatum<string, string>(features, "isMention");
                    datum.SetID(int.ToString(dataset.Size()));
                    mapDatumToMention[dataset.Size()] = mention;
                    dataset.Add(datum);
                }
                else
                {
                    RVFDatum<string, string> datum = new RVFDatum<string, string>(features, "isNotMention");
                    datum.SetID(int.ToString(dataset.Size()));
                    // BUG FIX: the map entry was previously recorded AFTER dataset.Add, keying it at
                    // Size()+1 relative to the datum — inconsistent with the other two branches.
                    mapDatumToMention[dataset.Size()] = mention;
                    dataset.Add(datum);
                }
            }
            else
            {
                RVFDatum<string, string> datum = new RVFDatum<string, string>(features, "none");
                datum.SetID(int.ToString(dataset.Size()));
                mapDatumToMention[dataset.Size()] = mention;
                dataset.Add(datum);
            }
        }
        mapQuoteToDataRange[quoteIdx] = new Pair<int, int>(initialSize, dataset.Size() - 1);
    }
    return new SupervisedSieveTraining.FeaturesData(mapQuoteToDataRange, mapDatumToMention, dataset);
}
/// <summary>
/// Reads a tree file and a Morfette/TnT analysis file in parallel, rewrites each tree leaf as
/// word+MorphoMark+lemma+LemmaMark+tag, and prints the munged trees to stdout.
/// </summary>
/// <param name="args">args[0] = tree file path, args[1] = morfette tnt file path.</param>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    string morfetteFile = args[1];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        IEnumerator<IList<CoreLabel>> morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
        // Iterate both inputs in lockstep: one analysis line list per tree.
        for (Tree tree; (tree = tr.ReadTree()) != null && morfetteItr.MoveNext();)
        {
            IList<CoreLabel> analysis = morfetteItr.Current;
            IList<ILabel> yield = tree.Yield();
            System.Diagnostics.Debug.Assert(analysis.Count == yield.Count);
            int yieldLen = yield.Count;
            for (int i = 0; i < yieldLen; ++i)
            {
                CoreLabel tokenAnalysis = analysis[i];
                ILabel token = yield[i];
                string lemma = GetLemma(token.Value(), tokenAnalysis.Lemma());
                // BUG FIX: string.Format in .NET uses composite formatting ({0}..{4}), not
                // Java-style "%s" specifiers; the original emitted the literal "%s%s%s%s%s".
                string newLeaf = string.Format("{0}{1}{2}{3}{4}", token.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, tokenAnalysis.Tag());
                ((CoreLabel)token).SetValue(newLeaf);
            }
            System.Console.Out.WriteLine(tree.ToString());
        }
        // If either input still has content, the files were of unequal length.
        if (tr.ReadTree() != null || morfetteItr.MoveNext())
        {
            log.Info("WARNING: Uneven input files!");
        }
        tr.Close();
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Attributes unattributed quotes by matching trigram patterns around the quote boundaries:
/// C = character name, P = pronoun, V = verb, Q = quote. Before the quote it tries CVQ and VCQ
/// (names) then PVQ and VPQ (pronouns); after the quote, QVC and QCV, then QVP and QPV.
/// The first pattern that fires fills in the quote's mention and moves to the next quote.
/// </summary>
public virtual void TrigramPatterns(Annotation doc)
{
    IList<CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<ICoreMap> docQuotes = doc.Get(typeof(CoreAnnotations.QuotationsAnnotation));
    foreach (ICoreMap quote in docQuotes)
    {
        // Skip quotes that already have an attributed mention.
        if (quote.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation)) != null)
        {
            continue;
        }
        int quoteBeginTokenIndex = quote.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
        int quoteEndTokenIndex = quote.Get(typeof(CoreAnnotations.TokenEndAnnotation));
        int quoteEndSentenceIndex = quote.Get(typeof(CoreAnnotations.SentenceEndAnnotation));
        Pair<int, int> precedingTokenRange = QuoteAttributionUtils.GetTokenRangePrecedingQuote(doc, quote);
        //get tokens before and after
        if (precedingTokenRange != null)
        {
            Pair<List<string>, List<Pair<int, int>>> namesAndNameIndices = ScanForNames(precedingTokenRange);
            List<string> names = namesAndNameIndices.first;
            List<Pair<int, int>> nameIndices = namesAndNameIndices.second;
            if (names.Count > 0)
            {
                // Skip over a punctuation token (e.g. a comma) directly before the quote.
                int offset = 0;
                if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word()))
                {
                    offset = 1;
                }
                Pair<int, int> lastNameIndex = nameIndices[nameIndices.Count - 1];
                CoreLabel prevToken = docTokens[quoteBeginTokenIndex - 1 - offset];
                //CVQ: character, then verb, directly before the quote
                if (prevToken.Tag() != null && prevToken.Tag().StartsWith("V") && lastNameIndex.second.Equals(quoteBeginTokenIndex - 2 - offset))
                {
                    // verb!
                    FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram CVQ", Name);
                    continue;
                }
                //VCQ: verb, then character, directly before the quote
                if (lastNameIndex.second.Equals(quoteBeginTokenIndex - 1 - offset))
                {
                    CoreLabel secondPrevToken = docTokens[lastNameIndex.first - 1];
                    if (secondPrevToken.Tag().StartsWith("V"))
                    {
                        FillInMention(quote, names[names.Count - 1], lastNameIndex.first, lastNameIndex.second, "trigram VCQ", Name);
                        continue;
                    }
                }
            }
            List<int> pronounsIndices = ScanForPronouns(precedingTokenRange);
            if (pronounsIndices.Count > 0)
            {
                int offset = 0;
                if (beforeQuotePunctuation.Contains(docTokens[quoteBeginTokenIndex - 1].Word()))
                {
                    offset = 1;
                }
                CoreLabel prevToken = docTokens[quoteBeginTokenIndex - 1 - offset];
                int lastPronounIndex = pronounsIndices[pronounsIndices.Count - 1];
                //PVQ: pronoun, then verb, directly before the quote
                if (prevToken.Tag().StartsWith("V") && lastPronounIndex == quoteBeginTokenIndex - 2 - offset)
                {
                    // verb!
                    FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram PVQ", Pronoun);
                    continue;
                }
                //VPQ: verb, then pronoun, directly before the quote
                if (lastPronounIndex == quoteBeginTokenIndex - 1 - offset && docTokens[quoteBeginTokenIndex - 2 - offset].Tag().StartsWith("V"))
                {
                    FillInMention(quote, TokenRangeToString(lastPronounIndex), lastPronounIndex, lastPronounIndex, "trigram VPQ", Pronoun);
                    continue;
                }
            }
        }
        Pair<int, int> followingTokenRange = QuoteAttributionUtils.GetTokenRangeFollowingQuote(doc, quote);
        if (followingTokenRange != null)
        {
            Pair<List<string>, List<Pair<int, int>>> namesAndNameIndices = ScanForNames(followingTokenRange);
            List<string> names = namesAndNameIndices.first;
            List<Pair<int, int>> nameIndices = namesAndNameIndices.second;
            if (names.Count > 0)
            {
                Pair<int, int> firstNameIndex = nameIndices[0];
                CoreLabel nextToken = docTokens[quoteEndTokenIndex + 1];
                //QVC: verb, then character, directly after the quote
                if (nextToken.Tag().StartsWith("V") && firstNameIndex.first.Equals(quoteEndTokenIndex + 2))
                {
                    // verb!
                    FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QVC", Name);
                    continue;
                }
                //QCV: character, then verb, directly after the quote
                if (firstNameIndex.first.Equals(quoteEndTokenIndex + 1))
                {
                    CoreLabel secondNextToken = docTokens[firstNameIndex.second + 1];
                    if (secondNextToken.Tag().StartsWith("V"))
                    {
                        FillInMention(quote, names[0], firstNameIndex.first, firstNameIndex.second, "trigram QCV", Name);
                        continue;
                    }
                }
            }
            List<int> pronounsIndices = ScanForPronouns(followingTokenRange);
            if (pronounsIndices.Count > 0)
            {
                CoreLabel nextToken = docTokens[quoteEndTokenIndex + 1];
                int firstPronounIndex = pronounsIndices[0];
                //QVP: verb, then pronoun, directly after the quote
                if (nextToken.Tag().StartsWith("V") && firstPronounIndex == quoteEndTokenIndex + 2)
                {
                    // verb!
                    FillInMention(quote, TokenRangeToString(pronounsIndices[0]), firstPronounIndex, firstPronounIndex, "trigram QVP", Pronoun);
                    continue;
                }
                //QPV: pronoun, then verb, directly after the quote
                if (firstPronounIndex == quoteEndTokenIndex + 1 && docTokens[quoteEndTokenIndex + 2].Tag().StartsWith("V"))
                {
                    FillInMention(quote, TokenRangeToString(pronounsIndices[pronounsIndices.Count - 1]), firstPronounIndex, firstPronounIndex, "trigram QPV", Pronoun);
                    continue;
                }
            }
        }
    }
}
/// <summary>
/// transformTree does all language-specific tree transformations.
/// </summary>
/// <remarks>
/// transformTree does all language-specific tree transformations. Any parameterizations
/// should be inside the specific TreebankLangParserarams class.
/// Collects percent-prefixed annotation markers driven by the mark* option flags and appends
/// them to the node's category, relabeling the node with a CategoryWordTag.
/// </remarks>
public override Tree TransformTree(Tree t, Tree root)
{
    // Null trees and leaves pass through unchanged.
    if (t == null || t.IsLeaf())
    {
        return (t);
    }
    IList<string> annotations = new List<string>();
    CoreLabel lab = (CoreLabel)t.Label();
    string word = lab.Word();
    string tag = lab.Tag();
    string cat = lab.Value();
    string baseCat = TreebankLanguagePack().BasicCategory(cat);
    //Tree parent = t.parent(root);
    // String mcat = "";
    // if (parent != null) {
    //   mcat = parent.label().value();
    // }
    //categories -- at present there is no tag annotation!!
    if (t.IsPhrasal())
    {
        IList<string> childBasicCats = ChildBasicCats(t);
        // mark vp's headed by "zu" verbs
        if (markZuVP && baseCat.Equals("VP") && (childBasicCats.Contains("VZ") || childBasicCats.Contains("VVIZU")))
        {
            annotations.Add("%ZU");
        }
        // mark relative clause S's (requires a NegraLabel carrying an "RC" edge)
        if (markRC && (t.Label() is NegraLabel) && baseCat.Equals("S") && ((NegraLabel)t.Label()).GetEdge() != null && ((NegraLabel)t.Label()).GetEdge().Equals("RC"))
        {
            //throw new RuntimeException("damn, not a Negra Label");
            annotations.Add("%RC");
        }
        // if(t.children().length == 1) {
        //   annotations.add("%U");
        // }
        if (markContainsV && ContainsVP(t))
        {
            annotations.Add("%vp");
        }
        if (markLP && LeftPhrasal(t))
        {
            annotations.Add("%LP");
        }
        if (markKonjParent)
        {
            // this depends on functional tags being present
            foreach (string cCat in childBasicCats)
            {
                if (cCat.Contains("-KONJ"))
                {
                    annotations.Add("%konjp");
                    break;
                }
            }
        }
        if (markHDParent)
        {
            // this depends on functional tags being present
            foreach (string cCat in childBasicCats)
            {
                if (cCat.Contains("-HD"))
                {
                    annotations.Add("%hdp");
                    break;
                }
            }
        }
    }
    else
    {
        //t.isPreTerminal() case
        // mark colon/semicolon sentence-final punctuation tags
        if (markColon && cat.Equals("$.") && (word.Equals(":") || word.Equals(";")))
        {
            annotations.Add("-%colon");
        }
    }
    // if(t.isPreTerminal()) {
    //   if(parent != null) {
    //     String parentVal = parent.label().value();
    //     int cutOffPtD = parentVal.indexOf('-');
    //     int cutOffPtC = parentVal.indexOf('^');
    //     int curMin = parentVal.length();
    //     if(cutOffPtD != -1) {
    //       curMin = cutOffPtD;
    //     }
    //     if(cutOffPtC != -1) {
    //       curMin = Math.min(curMin, cutOffPtC);
    //     }
    //     parentVal = parentVal.substring(0, curMin);
    //     annotations.add("^" + parentVal);
    //   }
    // }
    // put on all the annotations
    StringBuilder catSB = new StringBuilder(cat);
    foreach (string annotation in annotations)
    {
        catSB.Append(annotation);
    }
    t.SetLabel(new CategoryWordTag(catSB.ToString(), word, tag));
    return (t);
}
/// <summary>
/// Extracts NE-oriented classifier features for a mention: its NE type, whether its span is a
/// known NE string (and whether the span is the full NE run), the surrounding words/POS tags,
/// its first/last tokens, and whether it contains or is contained in another mention.
/// </summary>
public static ICounter<string> ExtractFeatures(Mention p, ICollection<Mention> shares, ICollection<string> neStrings, Dictionaries dict, Properties props)
{
    ICounter<string> features = new ClassicCounter<string>();
    string span = p.LowercaseNormalizedSpanString();
    string ner = p.headWord.Ner();
    int startIdx = p.startIndex;
    int endIdx = p.endIndex;
    IList<CoreLabel> sentWords = p.sentenceWords;
    // Neighboring tokens are null at sentence boundaries.
    CoreLabel before = (startIdx == 0) ? null : sentWords[startIdx - 1];
    CoreLabel after = (endIdx == sentWords.Count) ? null : sentWords[endIdx];
    CoreLabel first = p.originalSpan[0];
    CoreLabel last = p.originalSpan[p.originalSpan.Count - 1];
    features.IncrementCount("B-NETYPE-" + ner);
    if (neStrings.Contains(span))
    {
        features.IncrementCount("B-NE-STRING-EXIST");
        bool precededBySameNer = before != null && before.Ner().Equals(ner);
        bool followedBySameNer = after != null && after.Ner().Equals(ner);
        // Full span: neither neighbor continues the same NE run.
        if (!precededBySameNer && !followedBySameNer)
        {
            features.IncrementCount("B-NE-FULLSPAN");
        }
    }
    if (before != null)
    {
        features.IncrementCount("B-PRECEDINGWORD-" + before.Word());
        features.IncrementCount("B-PRECEDINGPOS-" + before.Tag());
    }
    if (after != null)
    {
        features.IncrementCount("B-FOLLOWINGWORD-" + after.Word());
        features.IncrementCount("B-FOLLOWINGPOS-" + after.Tag());
    }
    features.IncrementCount("B-FIRSTWORD-" + first.Word());
    features.IncrementCount("B-FIRSTPOS-" + first.Tag());
    features.IncrementCount("B-LASTWORD-" + last.Word());
    // NOTE(review): this second key intentionally reproduces the original's "B-LASTWORD-" prefix
    // applied to the POS tag; "B-LASTPOS-" looks like it was meant, but changing the string would
    // change feature names seen by trained models — confirm against the upstream implementation.
    features.IncrementCount("B-LASTWORD-" + last.Tag());
    foreach (Mention other in shares)
    {
        if (other == p)
        {
            continue;
        }
        if (other.InsideIn(p))
        {
            features.IncrementCount("B-BIGGER-THAN-ANOTHER");
            break;
        }
    }
    foreach (Mention other in shares)
    {
        if (other == p)
        {
            continue;
        }
        if (p.InsideIn(other))
        {
            features.IncrementCount("B-SMALLER-THAN-ANOTHER");
            break;
        }
    }
    return features;
}
/// <summary>
/// Writes sentences and their dependency trees to <paramref name="outFile"/> in CoNLL-X format:
/// one token per line (1-based index, word, POS twice, head, label), blank line between sentences.
/// </summary>
/// <exception cref="RuntimeIOException">Wraps any I/O failure while writing.</exception>
public static void WriteConllFile(string outFile, IList<ICoreMap> sentences, IList<DependencyTree> trees)
{
    try
    {
        PrintWriter output = IOUtils.GetPrintWriter(outFile);
        for (int i = 0; i < sentences.Count; i++)
        {
            ICoreMap sentence = sentences[i];
            DependencyTree tree = trees[i];
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            // BUG FIX: the loop bound referenced an undefined identifier 'size';
            // CoNLL token indices j are 1-based, running over the sentence's token list.
            for (int j = 1; j <= tokens.Count; ++j)
            {
                CoreLabel token = tokens[j - 1];
                output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
            }
            output.Println();
        }
        output.Close();
    }
    catch (Exception e)
    {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Adds stem under annotation <paramref name="ann"/> to the given CoreLabel.
/// Assumes that it has a TextAnnotation and PartOfSpeechAnnotation.
/// </summary>
/// <param name="label">Token to annotate; must carry word text and a POS tag.</param>
/// <param name="ann">Annotation key under which the lemma is stored.</param>
public virtual void Stem(CoreLabel label, Type ann)
{
    // Lemmatize the token's surface form given its POS tag, then store the result.
    string word = label.Word();
    string posTag = label.Tag();
    string stemmed = Lemmatize(word, posTag, lexer, lexer.Option(1));
    label.Set(ann, stemmed);
}
/// <summary>
/// Builds surface patterns for the token at position <paramref name="i"/> in the sentence.
/// For every window size from 1 to maxWindow4Pattern it collects up to that many
/// context tokens to the left and to the right (skipping filler words, aborting a side
/// entirely when a URL-like token is hit), and from them constructs previous-context,
/// next-context, and combined previous+next patterns. Patterns are built both with and
/// without POS restrictions on the target, depending on the class-level flags
/// addPatWithoutPOS / usePOS4Pattern. Returns the union of all three pattern sets.
/// </summary>
/// <param name="sent">the sentence as a token list</param>
/// <param name="i">index of the target token</param>
/// <param name="stopWords">phrases treated as stop words when counting useful context</param>
/// <returns>union of prev, next, and prev+next surface patterns for the target token</returns>
public static ICollection <SurfacePattern> GetContext(IList <CoreLabel> sent, int i, ICollection <CandidatePhrase> stopWords)
{
    ICollection <SurfacePattern> prevpatterns = new HashSet <SurfacePattern>();
    ICollection <SurfacePattern> nextpatterns = new HashSet <SurfacePattern>();
    ICollection <SurfacePattern> prevnextpatterns = new HashSet <SurfacePattern>();
    CoreLabel token = sent[i];
    // POS restriction on the target token, optionally coarsened to the first 2 chars of the tag.
    string tag = null;
    if (usePOS4Pattern)
    {
        string fulltag = token.Tag();
        if (useCoarsePOS)
        {
            tag = Sharpen.Runtime.Substring(fulltag, 0, Math.Min(fulltag.Length, 2));
        }
        else
        {
            tag = fulltag;
        }
    }
    string nerTag = token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
    // One pass per window size: patterns of every length up to maxWindow4Pattern are generated.
    for (int maxWin = 1; maxWin <= maxWindow4Pattern; maxWin++)
    {
        IList <Token> previousTokens = new List <Token>();
        IList <string> originalPrev = new List <string>();
        IList <string> originalNext = new List <string>();
        IList <Token> nextTokens = new List <Token>();
        int numStopWordsprev = 0;
        int numStopWordsnext = 0;
        // int numPrevTokensSpecial = 0, numNextTokensSpecial = 0;
        int numNonStopWordsNext = 0;
        int numNonStopWordsPrev = 0;
        bool useprev = false;
        bool usenext = false;
        // Target-token placeholder without POS restriction.
        PatternToken twithoutPOS = null;
        //TODO: right now using numWordsCompoundMax.
        if (addPatWithoutPOS)
        {
            twithoutPOS = new PatternToken(tag, false, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
        }
        // Target-token placeholder with POS restriction.
        PatternToken twithPOS = null;
        if (usePOS4Pattern)
        {
            twithPOS = new PatternToken(tag, true, numWordsCompoundMax > 1, numWordsCompoundMax, nerTag, useTargetNERRestriction, useTargetParserParentRestriction, token.Get(typeof(CoreAnnotations.GrandparentAnnotation)));
        }
        if (usePreviousContext)
        {
            // Walk leftwards from i-1, collecting up to maxWin usable context tokens.
            // int j = Math.max(0, i - 1);
            int j = i - 1;
            int numTokens = 0;
            while (numTokens < maxWin && j >= 0)
            {
                // for (int j = Math.max(i - maxWin, 0); j < i; j++) {
                CoreLabel tokenj = sent[j];
                string tokenjStr;
                if (useLemmaContextTokens)
                {
                    tokenjStr = tokenj.Lemma();
                }
                else
                {
                    tokenjStr = tokenj.Word();
                }
                // do not use this word in context consideration
                if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                {
                    j--;
                    continue;
                }
                // if (!tokenj.containsKey(answerClass.get(label))) {
                // throw new RuntimeException("how come the class "
                // + answerClass.get(label) + " for token "
                // + tokenj.word() + " in " + sent + " is not set");
                // }
                // tr: (is the token background/"O"-labeled?, generic pattern token, original string form)
                Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                bool isLabeledO = tr.first;
                Token strgeneric = tr.second;
                string strOriginal = tr.third;
                if (!isLabeledO)
                {
                    // Labeled (non-background) token: always counts as useful context.
                    // Prepend (Add at index 0) so the list stays in left-to-right sentence order.
                    // numPrevTokensSpecial++;
                    previousTokens.Add(0, strgeneric);
                    // previousTokens.add(0,
                    // "[{answer:"
                    // + tokenj.get(answerClass.get(label)).toString()
                    // + "}]");
                    originalPrev.Add(0, strOriginal);
                    numNonStopWordsPrev++;
                }
                else
                {
                    if (tokenj.Word().StartsWith("http"))
                    {
                        // A URL poisons the whole left context: discard it and stop.
                        useprev = false;
                        previousTokens.Clear();
                        originalPrev.Clear();
                        break;
                    }
                    else
                    {
                        Token str = SurfacePattern.GetContextToken(tokenj);
                        previousTokens.Add(0, str);
                        originalPrev.Add(0, tokenjStr);
                        if (DoNotUse(tokenjStr, stopWords))
                        {
                            numStopWordsprev++;
                        }
                        else
                        {
                            numNonStopWordsPrev++;
                        }
                    }
                }
                numTokens++;
                j--;
            }
        }
        if (useNextContext)
        {
            // Walk rightwards from i+1, mirroring the left-context collection above
            // (appending rather than prepending, since we move in sentence order).
            int numTokens = 0;
            int j = i + 1;
            while (numTokens < maxWin && j < sent.Count)
            {
                // for (int j = i + 1; j < sent.size() && j <= i + maxWin; j++) {
                CoreLabel tokenj = sent[j];
                string tokenjStr;
                if (useLemmaContextTokens)
                {
                    tokenjStr = tokenj.Lemma();
                }
                else
                {
                    tokenjStr = tokenj.Word();
                }
                // do not use this word in context consideration
                if (useFillerWordsInPat && fillerWords.Contains(tokenj.Word().ToLower()))
                {
                    j++;
                    continue;
                }
                // if (!tokenj.containsKey(answerClass.get(label))) {
                // throw new RuntimeException(
                // "how come the dict annotation for token " + tokenj.word()
                // + " in " + sent + " is not set");
                // }
                Triple <bool, Token, string> tr = GetContextTokenStr(tokenj);
                bool isLabeledO = tr.first;
                Token strgeneric = tr.second;
                string strOriginal = tr.third;
                // boolean isLabeledO = tokenj.get(answerClass.get(label))
                // .equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL);
                if (!isLabeledO)
                {
                    // numNextTokensSpecial++;
                    numNonStopWordsNext++;
                    nextTokens.Add(strgeneric);
                    // nextTokens.add("[{" + label + ":"
                    // + tokenj.get(answerClass.get(label)).toString()
                    // + "}]");
                    originalNext.Add(strOriginal);
                }
                else
                {
                    // originalNextStr += " "
                    // + tokenj.get(answerClass.get(label)).toString();
                    if (tokenj.Word().StartsWith("http"))
                    {
                        // A URL poisons the whole right context: discard it and stop.
                        usenext = false;
                        nextTokens.Clear();
                        originalNext.Clear();
                        break;
                    }
                    else
                    {
                        // if (!tokenj.word().matches("[.,?()]")) {
                        Token str = SurfacePattern.GetContextToken(tokenj);
                        nextTokens.Add(str);
                        originalNext.Add(tokenjStr);
                        if (DoNotUse(tokenjStr, stopWords))
                        {
                            numStopWordsnext++;
                        }
                        else
                        {
                            numNonStopWordsNext++;
                        }
                    }
                }
                j++;
                numTokens++;
            }
        }
        // String prevContext = null, nextContext = null;
        // int numNonSpecialPrevTokens = previousTokens.size()
        // - numPrevTokensSpecial;
        // int numNonSpecialNextTokens = nextTokens.size() - numNextTokensSpecial;
        Token[] prevContext = null;
        //String[] prevContext = null;
        //String[] prevOriginalArr = null;
        // if (previousTokens.size() >= minWindow4Pattern
        // && (numStopWordsprev < numNonSpecialPrevTokens ||
        // numNonSpecialPrevTokens > numMinStopWordsToAdd)) {
        // Build prev-only patterns if the left context is long enough and not all stop words.
        if (previousTokens.Count >= minWindow4Pattern && (numNonStopWordsPrev > 0 || numStopWordsprev > numMinStopWordsToAdd))
        {
            // prevContext = StringUtils.join(previousTokens, fw);
            // Interleave the filler-word token (fw) between context tokens, then append
            // the stop-word token (sw) adjacent to the target.
            IList <Token> prevContextList = new List <Token>();
            IList <string> prevOriginal = new List <string>();
            foreach (Token p in previousTokens)
            {
                prevContextList.Add(p);
                if (!fw.IsEmpty())
                {
                    prevContextList.Add(fw);
                }
            }
            // add fw and sw to the the originalprev
            foreach (string p_1 in originalPrev)
            {
                prevOriginal.Add(p_1);
                if (!fw.IsEmpty())
                {
                    prevOriginal.Add(" FW ");
                }
            }
            if (!sw.IsEmpty())
            {
                prevContextList.Add(sw);
                prevOriginal.Add(" SW ");
            }
            // String str = prevContext + fw + sw;
            // Only keep patterns whose original surface text is pure ASCII.
            if (IsASCII(StringUtils.Join(prevOriginal)))
            {
                prevContext = Sharpen.Collections.ToArray(prevContextList, new Token[0]);
                //prevOriginalArr = prevOriginal.toArray(new String[0]);
                if (previousTokens.Count >= minWindow4Pattern)
                {
                    if (twithoutPOS != null)
                    {
                        SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, null, SurfacePatternFactory.Genre.Prev);
                        prevpatterns.Add(pat);
                    }
                    if (twithPOS != null)
                    {
                        SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, null, SurfacePatternFactory.Genre.Prev);
                        prevpatterns.Add(patPOS);
                    }
                }
                useprev = true;
            }
        }
        Token[] nextContext = null;
        //String [] nextOriginalArr = null;
        // if (nextTokens.size() > 0
        // && (numStopWordsnext < numNonSpecialNextTokens ||
        // numNonSpecialNextTokens > numMinStopWordsToAdd)) {
        // Build next-only patterns, mirroring the prev-pattern construction
        // (sw precedes the context here since it sits adjacent to the target).
        // NOTE(review): unlike the prev side, there is no IsASCII filter here — verify
        // whether that asymmetry is intentional.
        if (nextTokens.Count > 0 && (numNonStopWordsNext > 0 || numStopWordsnext > numMinStopWordsToAdd))
        {
            // nextContext = StringUtils.join(nextTokens, fw);
            IList <Token> nextContextList = new List <Token>();
            IList <string> nextOriginal = new List <string>();
            if (!sw.IsEmpty())
            {
                nextContextList.Add(sw);
                nextOriginal.Add(" SW ");
            }
            foreach (Token n in nextTokens)
            {
                if (!fw.IsEmpty())
                {
                    nextContextList.Add(fw);
                }
                nextContextList.Add(n);
            }
            foreach (string n_1 in originalNext)
            {
                if (!fw.IsEmpty())
                {
                    nextOriginal.Add(" FW ");
                }
                nextOriginal.Add(n_1);
            }
            if (nextTokens.Count >= minWindow4Pattern)
            {
                nextContext = Sharpen.Collections.ToArray(nextContextList, new Token[0]);
                //nextOriginalArr = nextOriginal.toArray(new String[0]);
                if (twithoutPOS != null)
                {
                    SurfacePattern pat = new SurfacePattern(null, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Next);
                    nextpatterns.Add(pat);
                }
                if (twithPOS != null)
                {
                    SurfacePattern patPOS = new SurfacePattern(null, twithPOS, nextContext, SurfacePatternFactory.Genre.Next);
                    nextpatterns.Add(patPOS);
                }
            }
            usenext = true;
        }
        // Combined patterns require both sides to have survived and a total length
        // meeting the minimum window.
        if (useprev && usenext)
        {
            // String strprev = prevContext + fw + sw;
            // String strnext = sw + fw + nextContext;
            if (previousTokens.Count + nextTokens.Count >= minWindow4Pattern)
            {
                if (twithoutPOS != null)
                {
                    SurfacePattern pat = new SurfacePattern(prevContext, twithoutPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                    prevnextpatterns.Add(pat);
                }
                if (twithPOS != null)
                {
                    SurfacePattern patPOS = new SurfacePattern(prevContext, twithPOS, nextContext, SurfacePatternFactory.Genre.Prevnext);
                    prevnextpatterns.Add(patPOS);
                }
            }
        }
    }
    // Triple<Set<Integer>, Set<Integer>, Set<Integer>> patterns = new Triple<Set<Integer>, Set<Integer>, Set<Integer>>(
    // prevpatterns, nextpatterns, prevnextpatterns);
    // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
    // " prev patterns are " + prevpatterns);
    // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
    // " next patterns are " + nextpatterns);
    // System.out.println("For word " + sent.get(i) + " in sentence " + sent +
    // " prevnext patterns are " + prevnextpatterns);
    //getPatternIndex().finishCommit();
    return(CollectionUtils.UnionAsSet(prevpatterns, nextpatterns, prevnextpatterns));
}
/// <summary>Convert token to a sequence of datums and add to iobList.</summary>
/// <remarks>
/// Emits one character-level datum per character of the (possibly rewritten) token,
/// labeling the first character with <paramref name="firstLabel"/> (BEGIN by default)
/// and the last with <paramref name="lastLabel"/> (CONTINUATION by default); the
/// Arabic rewrite rules below may switch either boundary label to REWRITE and mutate
/// the token text itself. Character offsets into the original text are tracked via
/// origIndex, skipping characters the tokenizer deleted.
/// </remarks>
/// <param name="iobList"/>
/// <param name="token"/>
/// <param name="tokType"/>
/// <param name="tokenLabel"/>
/// <param name="lastToken"/>
/// <param name="applyRewriteRules"/>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
private static void TokenToDatums(IList <CoreLabel> iobList, CoreLabel cl, string token, IOBUtils.TokenType tokType, CoreLabel tokenLabel, string lastToken, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory <CoreLabel> tf, string origText )
{
    if (token.IsEmpty())
    {
        return;
    }
    string lastLabel = ContinuationSymbol;
    string firstLabel = BeginSymbol;
    // The rewritten form of the token, if an annotation provides one; used below to
    // cross-reference per-character differences and mark them REWRITE.
    string rewritten = cl.Get(typeof(ArabicDocumentReaderAndWriter.RewrittenArabicAnnotation));
    bool crossRefRewrites = true;
    if (rewritten == null)
    {
        rewritten = token;
        crossRefRewrites = false;
    }
    else
    {
        rewritten = StripSegmentationMarkers(rewritten, tokType);
    }
    if (applyRewriteRules)
    {
        // Apply Arabic-specific re-write rules
        string rawToken = tokenLabel.Word();
        string tag = tokenLabel.Tag();
        // Morphological features (gender, number, definiteness, tense) decoded from
        // the POS tag drive which rewrite rules fire.
        MorphoFeatureSpecification featureSpec = new ArabicMorphoFeatureSpecification();
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Ngen);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Nnum);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Def);
        featureSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Tense);
        MorphoFeatures features = featureSpec.StrToFeatures(tag);
        // Rule #1 : ت --> ة
        // Feminine singular nouns ending in taa get their final char marked REWRITE;
        // otherwise a raw token ending in taa-marbuta has its last char replaced.
        if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Ngen).Equals("F") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Nnum).Equals("SG") && rawToken.EndsWith("ت-") && !stripRewrites)
        {
            lastLabel = RewriteSymbol;
        }
        else
        {
            if (rawToken.EndsWith("ة-"))
            {
                System.Diagnostics.Debug.Assert(token.EndsWith("ة"));
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ت";
                lastLabel = RewriteSymbol;
            }
        }
        // Rule #2 : لل --> ل ال
        // After a preceding lam, a definite token loses its leading alef; the first
        // character of the remainder is marked REWRITE.
        // NOTE(review): Substring(token, 1) can make `token` empty for a one-character
        // token, and token[0] below would then throw after only a log message — confirm
        // inputs rule this out.
        if (lastToken.Equals("ل") && features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Def).Equals("D"))
        {
            if (rawToken.StartsWith("-ال"))
            {
                if (!token.StartsWith("ا"))
                {
                    log.Info("Bad REWAL: " + rawToken + " / " + token);
                }
                token = Sharpen.Runtime.Substring(token, 1);
                rewritten = Sharpen.Runtime.Substring(rewritten, 1);
                if (!stripRewrites)
                {
                    firstLabel = RewriteSymbol;
                }
            }
            else
            {
                if (rawToken.StartsWith("-ل"))
                {
                    if (!token.StartsWith("ل"))
                    {
                        log.Info("Bad REWAL: " + rawToken + " / " + token);
                    }
                    if (!stripRewrites)
                    {
                        firstLabel = RewriteSymbol;
                    }
                }
                else
                {
                    log.Info("Ignoring REWAL: " + rawToken + " / " + token);
                }
            }
        }
        // Rule #3 : ي --> ى
        // Rule #4 : ا --> ى
        // Tokens ending in alef maqsura: verbs (tense feature present) restore alef,
        // everything else (assumed preposition) restores yah.
        if (rawToken.EndsWith("ى-"))
        {
            if (features.GetValue(MorphoFeatureSpecification.MorphoFeatureType.Tense) != null)
            {
                // verb: ى becomes ا
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ا";
            }
            else
            {
                // assume preposition:
                token = Sharpen.Runtime.Substring(token, 0, token.Length - 1) + "ي";
            }
            if (!stripRewrites)
            {
                lastLabel = RewriteSymbol;
            }
        }
        else
        {
            if (rawToken.Equals("علي-") || rawToken.Equals("-علي-"))
            {
                if (!stripRewrites)
                {
                    lastLabel = RewriteSymbol;
                }
            }
        }
    }
    // The original surface form of this token, from the raw text when available.
    string origWord;
    if (origText == null)
    {
        origWord = tokenLabel.Word();
    }
    else
    {
        origWord = Sharpen.Runtime.Substring(origText, cl.BeginPosition(), cl.EndPosition());
    }
    // Advance past any leading characters the tokenizer deleted from the original.
    int origIndex = 0;
    while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
    {
        ++origIndex;
    }
    // Create datums and add to iobList
    if (token.IsEmpty())
    {
        log.Info("Rewriting resulted in empty token: " + tokenLabel.Word());
    }
    string firstChar = token[0].ToString();
    // Start at 0 to make sure we include the whole token according to the tokenizer
    iobList.Add(CreateDatum(cl, firstChar, firstLabel, 0, origIndex + 1));
    int numChars = token.Length;
    // Rewrite cross-referencing is only meaningful when lengths line up.
    if (crossRefRewrites && rewritten.Length != numChars)
    {
        System.Console.Error.Printf("Rewritten annotation doesn't have correct length: %s>>>%s%n", token, rewritten);
        crossRefRewrites = false;
    }
    ++origIndex;
    for (int j = 1; j < numChars; ++j, ++origIndex)
    {
        // Skip deleted characters in the original; clamp to the last index so the
        // datum offsets stay in range even if the original runs short.
        while (origIndex < origWord.Length && IsDeletedCharacter(origWord[origIndex], tf))
        {
            ++origIndex;
        }
        if (origIndex >= origWord.Length)
        {
            origIndex = origWord.Length - 1;
        }
        string charLabel = (j == numChars - 1) ? lastLabel : ContinuationSymbol;
        string thisChar = token[j].ToString();
        if (crossRefRewrites && !rewritten[j].ToString().Equals(thisChar))
        {
            charLabel = RewriteSymbol;
        }
        // (string == here is value equality in C#.)
        if (charLabel == ContinuationSymbol && thisChar.Equals("ى") && j != numChars - 1)
        {
            charLabel = RewriteSymbol;
        }
        // Assume all mid-word alef maqsura are supposed to be yah
        iobList.Add(CreateDatum(cl, thisChar, charLabel, origIndex, origIndex + 1));
    }
    // End at endPosition to make sure we include the whole token according to the tokenizer
    if (!iobList.IsEmpty())
    {
        iobList[iobList.Count - 1].SetEndPosition(cl.EndPosition());
    }
}