// end featuresCpC /// <summary> /// For a CRF, this shouldn't be necessary, since the features duplicate /// those from CpC, but Huihsin found some valuable, presumably becuase /// it modified the regularization a bit. /// </summary> /// <param name="cInfo">The list of characters</param> /// <param name="loc">Position of c in list</param> /// <returns>Collection of String features (sparse set of boolean features</returns> protected internal virtual ICollection <string> FeaturesCnC <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel { ICollection <string> features = new List <string>(); if (flags.useWordn) { CoreLabel c = cInfo[loc]; CoreLabel c2 = cInfo[loc + 1]; CoreLabel p = cInfo[loc - 1]; CoreLabel p2 = cInfo[loc - 2]; string charc = c.GetString <CoreAnnotations.CharAnnotation>(); string charc2 = c2.GetString <CoreAnnotations.CharAnnotation>(); string charp = p.GetString <CoreAnnotations.CharAnnotation>(); string charp2 = p2.GetString <CoreAnnotations.CharAnnotation>(); features.Add(charc + "c"); features.Add(charc2 + "c2"); features.Add(charp + "p"); features.Add(charp2 + "p2"); features.Add(charp2 + charp + "p2p"); features.Add(charp + charc + "pc"); features.Add(charc + charc2 + "cc2"); features.Add(charp + "-" + charc2 + "pc2"); features.Add("cliqueCnC"); } return(features); }
/// <summary>Convenience methods for subclasses which use CoreLabel.</summary> /// <remarks> /// Convenience methods for subclasses which use CoreLabel. Gets the /// word after applying any wordFunction present in the /// SeqClassifierFlags. /// </remarks> /// <param name="label">A CoreLabel</param> /// <returns> /// The TextAnnotation of the label, perhaps after passing it through /// a function (flags.wordFunction) /// </returns> protected internal virtual string GetWord(CoreLabel label) { string word = label.GetString <CoreAnnotations.TextAnnotation>(); if (flags.wordFunction != null) { word = flags.wordFunction.Apply(word); } return(word); }
//is EnglishPU private static void DictionaryFeaturesC(Type lbeginFieldName, Type lmiddleFieldName, Type lendFieldName, string dictSuffix, ICollection <string> features, CoreLabel p, CoreLabel c, CoreLabel c2) { string lbegin = c.GetString(lbeginFieldName); string lmiddle = c.GetString(lmiddleFieldName); string lend = c.GetString(lendFieldName); features.Add(lbegin + dictSuffix + "-lb"); features.Add(lmiddle + dictSuffix + "-lm"); features.Add(lend + dictSuffix + "-le"); lbegin = p.GetString(lbeginFieldName); lmiddle = p.GetString(lmiddleFieldName); lend = p.GetString(lendFieldName); features.Add(lbegin + dictSuffix + "-plb"); features.Add(lmiddle + dictSuffix + "-plm"); features.Add(lend + dictSuffix + "-ple"); lbegin = c2.GetString(lbeginFieldName); lmiddle = c2.GetString(lmiddleFieldName); lend = c2.GetString(lendFieldName); features.Add(lbegin + dictSuffix + "-c2lb"); features.Add(lmiddle + dictSuffix + "-c2lm"); features.Add(lend + dictSuffix + "-c2le"); }
private void DictionaryFeaturesCpC(Type lbeginFieldName, Type lmiddleFieldName, Type lendFieldName, string dictSuffix, ICollection <string> features, CoreLabel p2, CoreLabel p, CoreLabel c, CoreLabel c2) { string lbegin = c.GetString(lbeginFieldName); string lmiddle = c.GetString(lmiddleFieldName); string lend = c.GetString(lendFieldName); features.Add(lbegin + dictSuffix + "-lb"); features.Add(lmiddle + dictSuffix + "-lm"); features.Add(lend + dictSuffix + "-le"); lbegin = p.GetString(lbeginFieldName); lmiddle = p.GetString(lmiddleFieldName); lend = p.Get(lendFieldName); features.Add(lbegin + dictSuffix + "-plb"); features.Add(lmiddle + dictSuffix + "-plm"); features.Add(lend + dictSuffix + "-ple"); lbegin = c2.GetString(lbeginFieldName); lmiddle = c2.GetString(lmiddleFieldName); lend = c2.GetString(lendFieldName); features.Add(lbegin + dictSuffix + "-c2lb"); features.Add(lmiddle + dictSuffix + "-c2lm"); features.Add(lend + dictSuffix + "-c2le"); if (flags.useDictionaryConjunctions) { string p2Lend = p2.GetString(lendFieldName); string pLend = p.GetString(lendFieldName); string pLbegin = p.GetString(lbeginFieldName); string cLbegin = c.GetString(lbeginFieldName); string cLmiddle = c.GetString(lmiddleFieldName); if (flags.useDictionaryConjunctions3) { features.Add(pLend + cLbegin + cLmiddle + dictSuffix + "-pcLconj1"); } features.Add(p2Lend + pLend + cLbegin + cLmiddle + dictSuffix + "-p2pcLconj1"); features.Add(p2Lend + pLend + pLbegin + cLbegin + cLmiddle + dictSuffix + "-p2pcLconj2"); } }
private void SetTrueCaseText(CoreLabel l) { string trueCase = l.GetString <CoreAnnotations.TrueCaseAnnotation>(); string text = l.Word(); string trueCaseText = text; switch (trueCase) { case "UPPER": { trueCaseText = text.ToUpper(); break; } case "LOWER": { trueCaseText = text.ToLower(); break; } case "INIT_UPPER": { trueCaseText = char.ToTitleCase(text[0]) + Sharpen.Runtime.Substring(text, 1).ToLower(); break; } case "O": { // The model predicted mixed case, so lookup the map: string lower = text.ToLower(); if (mixedCaseMap.Contains(lower)) { trueCaseText = mixedCaseMap[lower]; } // else leave it as it was? break; } } // System.err.println(text + " was classified as " + trueCase + " and so became " + trueCaseText); l.Set(typeof(CoreAnnotations.TrueCaseTextAnnotation), trueCaseText); if (overwriteText) { l.Set(typeof(CoreAnnotations.TextAnnotation), trueCaseText); l.Set(typeof(CoreAnnotations.ValueAnnotation), trueCaseText); } }
// end featuresCpCp2C protected internal virtual ICollection <string> FeaturesCpCp2Cp3C <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel { ICollection <string> features = new List <string>(); if (flags.use4Clique && flags.maxLeft >= 3) { CoreLabel c = cInfo[loc]; CoreLabel c2 = cInfo[loc + 1]; CoreLabel p = cInfo[loc - 1]; CoreLabel p2 = cInfo[loc - 2]; CoreLabel p3 = cInfo[loc - 3]; string charc = c.GetString <CoreAnnotations.CharAnnotation>(); string charp = p.GetString <CoreAnnotations.CharAnnotation>(); string charp2 = p2.GetString <CoreAnnotations.CharAnnotation>(); string charp3 = p3.GetString <CoreAnnotations.CharAnnotation>(); int cI = c.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec = (cI != null ? cI.ToString() : string.Empty); int c2I = c2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec2 = (c2I != null ? c2I.ToString() : string.Empty); int pI = p.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep = (pI != null ? pI.ToString() : string.Empty); int p2I = p2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep2 = (p2I != null ? p2I.ToString() : string.Empty); int p3I = p3.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep3 = (p3I != null ? p3I.ToString() : string.Empty); if (flags.useLongSequences) { features.Add(charp3 + charp2 + charp + charc + "p3p2pc"); } if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) { features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-uType4"); } if (flags.useUnicodeType5gram) { features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType5"); } features.Add("cliqueCpCp2Cp3C"); } return(features); }
/// <exception cref="System.Exception"/> public override Document NextDoc() { IList <IList <CoreLabel> > allWords = new List <IList <CoreLabel> >(); IList <Tree> allTrees = new List <Tree>(); IList <IList <Mention> > allGoldMentions = new List <IList <Mention> >(); IList <IList <Mention> > allPredictedMentions; IList <ICoreMap> allSentences = new List <ICoreMap>(); Annotation docAnno = new Annotation(string.Empty); Pattern docPattern = Pattern.Compile("<DOC>(.*?)</DOC>", Pattern.Dotall + Pattern.CaseInsensitive); Pattern sentencePattern = Pattern.Compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.Dotall + Pattern.CaseInsensitive); Matcher docMatcher = docPattern.Matcher(fileContents); if (!docMatcher.Find(currentOffset)) { return(null); } currentOffset = docMatcher.End(); string doc = docMatcher.Group(1); Matcher sentenceMatcher = sentencePattern.Matcher(doc); string ner = null; //Maintain current document ID. Pattern docIDPattern = Pattern.Compile("<DOCNO>(.*?)</DOCNO>", Pattern.Dotall + Pattern.CaseInsensitive); Matcher docIDMatcher = docIDPattern.Matcher(doc); if (docIDMatcher.Find()) { currentDocumentID = docIDMatcher.Group(1); } else { currentDocumentID = "documentAfter " + currentDocumentID; } while (sentenceMatcher.Find()) { string sentenceString = sentenceMatcher.Group(2); IList <CoreLabel> words = tokenizerFactory.GetTokenizer(new StringReader(sentenceString)).Tokenize(); // FIXING TOKENIZATION PROBLEMS for (int i = 0; i < words.Count; i++) { CoreLabel w = words[i]; if (i > 0 && w.Word().Equals("$")) { if (!words[i - 1].Word().EndsWith("PRP") && !words[i - 1].Word().EndsWith("WP")) { continue; } words[i - 1].Set(typeof(CoreAnnotations.TextAnnotation), words[i - 1].Word() + "$"); words.Remove(i); i--; } else { if (w.Word().Equals("\\/")) { if (words[i - 1].Word().Equals("</COREF>")) { continue; } w.Set(typeof(CoreAnnotations.TextAnnotation), words[i - 1].Word() + "\\/" + words[i + 1].Word()); words.Remove(i + 1); words.Remove(i - 1); } } } // END FIXING TOKENIZATION PROBLEMS IList <CoreLabel> sentence = new List <CoreLabel>(); // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open Stack <Mention> stack = new Stack <Mention>(); IList <Mention> mentions = new List <Mention>(); allWords.Add(sentence); allGoldMentions.Add(mentions); foreach (CoreLabel word in words) { string w = word.Get(typeof(CoreAnnotations.TextAnnotation)); // found regular token: WORD/POS if (!w.StartsWith("<") && w.Contains("\\/") && w.LastIndexOf("\\/") != w.Length - 2) { int i_1 = w.LastIndexOf("\\/"); string w1 = Sharpen.Runtime.Substring(w, 0, i_1); // we do NOT set POS info here. We take the POS tags from the parser! word.Set(typeof(CoreAnnotations.TextAnnotation), w1); word.Remove(typeof(CoreAnnotations.OriginalTextAnnotation)); sentence.Add(word); } else { // found the start SGML tag for a NE, e.g., "<ORGANIZATION>" if (w.StartsWith("<") && !w.StartsWith("<COREF") && !w.StartsWith("</")) { Pattern nerPattern = Pattern.Compile("<(.*?)>"); Matcher m = nerPattern.Matcher(w); m.Find(); ner = m.Group(1); } else { // found the end SGML tag for a NE, e.g., "</ORGANIZATION>" if (w.StartsWith("</") && !w.StartsWith("</COREF")) { Pattern nerPattern = Pattern.Compile("</(.*?)>"); Matcher m = nerPattern.Matcher(w); m.Find(); string ner1 = m.Group(1); if (ner != null && !ner.Equals(ner1)) { throw new Exception("Unmatched NE labels in MUC file: " + ner + " v. " + ner1); } ner = null; } else { // found the start SGML tag for a coref mention if (w.StartsWith("<COREF")) { Mention mention = new Mention(); // position of this mention in the sentence mention.startIndex = sentence.Count; // extract GOLD info about this coref chain. needed for eval Pattern idPattern = Pattern.Compile("ID=\"(.*?)\""); Pattern refPattern = Pattern.Compile("REF=\"(.*?)\""); Matcher m = idPattern.Matcher(w); m.Find(); mention.mentionID = System.Convert.ToInt32(m.Group(1)); m = refPattern.Matcher(w); if (m.Find()) { mention.originalRef = System.Convert.ToInt32(m.Group(1)); } // open mention. keep track of all open mentions using the stack stack.Push(mention); } else { // found the end SGML tag for a coref mention if (w.Equals("</COREF>")) { Mention mention = stack.Pop(); mention.endIndex = sentence.Count; // this is a closed mention. add it to the final list of mentions // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef); mentions.Add(mention); } else { word.Remove(typeof(CoreAnnotations.OriginalTextAnnotation)); sentence.Add(word); } } } } } } StringBuilder textContent = new StringBuilder(); for (int i_2 = 0; i_2 < sentence.Count; i_2++) { CoreLabel w = sentence[i_2]; w.Set(typeof(CoreAnnotations.IndexAnnotation), i_2 + 1); w.Set(typeof(CoreAnnotations.UtteranceAnnotation), 0); if (i_2 > 0) { textContent.Append(" "); } textContent.Append(w.GetString <CoreAnnotations.TextAnnotation>()); } ICoreMap sentCoreMap = new Annotation(textContent.ToString()); allSentences.Add(sentCoreMap); sentCoreMap.Set(typeof(CoreAnnotations.TokensAnnotation), sentence); } // assign goldCorefClusterID IDictionary <int, Mention> idMention = Generics.NewHashMap(); // temporary use foreach (IList <Mention> goldMentions in allGoldMentions) { foreach (Mention m in goldMentions) { idMention[m.mentionID] = m; } } foreach (IList <Mention> goldMentions_1 in allGoldMentions) { foreach (Mention m in goldMentions_1) { if (m.goldCorefClusterID == -1) { if (m.originalRef == -1) { m.goldCorefClusterID = m.mentionID; } else { int @ref = m.originalRef; while (true) { Mention m2 = idMention[@ref]; if (m2.goldCorefClusterID != -1) { m.goldCorefClusterID = m2.goldCorefClusterID; break; } else { if (m2.originalRef == -1) { m2.goldCorefClusterID = m2.mentionID; m.goldCorefClusterID = m2.goldCorefClusterID; break; } else { @ref = m2.originalRef; } } } } } } } docAnno.Set(typeof(CoreAnnotations.SentencesAnnotation), allSentences); stanfordProcessor.Annotate(docAnno); if (allSentences.Count != allWords.Count) { throw new InvalidOperationException("allSentences != allWords"); } for (int i_3 = 0; i_3 < allSentences.Count; i_3++) { IList <CoreLabel> annotatedSent = allSentences[i_3].Get(typeof(CoreAnnotations.TokensAnnotation)); IList <CoreLabel> unannotatedSent = allWords[i_3]; IList <Mention> mentionInSent = allGoldMentions[i_3]; foreach (Mention m in mentionInSent) { m.dependency = allSentences[i_3].Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)); } if (annotatedSent.Count != unannotatedSent.Count) { throw new InvalidOperationException("annotatedSent != unannotatedSent"); } for (int j = 0; j < sz; j++) { CoreLabel annotatedWord = annotatedSent[j]; CoreLabel unannotatedWord = unannotatedSent[j]; if (!annotatedWord.Get(typeof(CoreAnnotations.TextAnnotation)).Equals(unannotatedWord.Get(typeof(CoreAnnotations.TextAnnotation)))) { throw new InvalidOperationException("annotatedWord != unannotatedWord"); } } allWords.Set(i_3, annotatedSent); allTrees.Add(allSentences[i_3].Get(typeof(TreeCoreAnnotations.TreeAnnotation))); } // extract predicted mentions allPredictedMentions = mentionFinder.ExtractPredictedMentions(docAnno, maxID, dictionaries); // add the relevant fields to mentions and order them for coref return(Arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true)); }
//end of CnC /// <summary>Second order clique features</summary> /// <param name="cInfo">The list of characters</param> /// <param name="loc">Position of c in list</param> /// <returns>Collection of String features (sparse set of boolean features</returns> protected internal virtual ICollection <string> FeaturesCpCp2C <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel { ICollection <string> features = new List <string>(); CoreLabel c = cInfo[loc]; CoreLabel c2 = cInfo[loc + 1]; CoreLabel c3 = cInfo[loc + 2]; CoreLabel p = cInfo[loc - 1]; CoreLabel p2 = cInfo[loc - 2]; CoreLabel p3 = cInfo[loc - 3]; string charc = c.GetString <CoreAnnotations.CharAnnotation>(); string charc2 = c2.GetString <CoreAnnotations.CharAnnotation>(); string charc3 = c3.GetString <CoreAnnotations.CharAnnotation>(); string charp = p.GetString <CoreAnnotations.CharAnnotation>(); string charp2 = p2.GetString <CoreAnnotations.CharAnnotation>(); string charp3 = p3.GetString <CoreAnnotations.CharAnnotation>(); // N-gram features. N is up to 3 if (flags.useWord3) { features.Add(charc + "::c"); features.Add(charc2 + "::n"); features.Add(charp + "::p"); features.Add(charp2 + "::p2"); // trying to restore the features that Huihsin described in SIGHAN 2005 paper features.Add(charc + charc2 + "::cn"); features.Add(charc + charc2 + charc3 + "::cnn2"); features.Add(charp + charc + "::pc"); features.Add(charp + charc2 + "::pn"); features.Add(charp2 + charp + "::p2p"); features.Add(charp3 + charp2 + charp + "::p3p2p"); features.Add(charp2 + charc + "::p2c"); features.Add(charc + charc3 + "::cn2"); } if (flags.useShapeStrings) { if (flags.useShapeStrings1) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + "ps"); features.Add(c.GetString <CoreAnnotations.ShapeAnnotation>() + "cs"); features.Add(c2.GetString <CoreAnnotations.ShapeAnnotation>() + "c2s"); } if (flags.useShapeStrings3) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "pscsc2s"); } if (flags.useShapeStrings4) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "p2spscsc2s"); } if (flags.useShapeStrings5) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation >() + "p2spscsc2sc3s"); } if (flags.useWordShapeConjunctions2) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "pscc"); features.Add(charp + c.GetString <CoreAnnotations.ShapeAnnotation>() + "pccs"); } if (flags.useWordShapeConjunctions3) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "p2spscc"); features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "psccc2s"); features.Add(charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation>() + "ccc2sc3s"); } } /* * Radical N-gram features. N is upto 4. * Smoothing method of N-gram, because there are too many characters in Chinese. * (It works better than N-gram when they are used individually. less sparse) */ char rcharc; char rcharc2; char rcharp; char rcharp2; if (charc.Length == 0) { rcharc = 'n'; } else { rcharc = RadicalMap.GetRadical(charc[0]); } if (charc2.Length == 0) { rcharc2 = 'n'; } else { rcharc2 = RadicalMap.GetRadical(charc2[0]); } if (charp.Length == 0) { rcharp = 'n'; } else { rcharp = RadicalMap.GetRadical(charp[0]); } if (charp2.Length == 0) { rcharp2 = 'n'; } else { rcharp2 = RadicalMap.GetRadical(charp2[0]); } if (flags.useRad2) { features.Add(rcharc + "rc"); features.Add(rcharc2 + "rc2"); features.Add(rcharp + "rp"); features.Add(rcharp + rcharc + "rprc"); features.Add(rcharc + rcharc2 + "rcrc2"); features.Add(rcharp + rcharc + rcharc2 + "rprcrc2"); } if (flags.useRad2b) { features.Add(rcharc + "rc"); features.Add(rcharc2 + "rc2"); features.Add(rcharp + "rp"); features.Add(rcharp + rcharc + "rprc"); features.Add(rcharc + rcharc2 + "rcrc2"); features.Add(rcharp2 + rcharp + "rp2rp"); } features.Add("cliqueCpCp2C"); return(features); }
protected internal virtual ICollection <string> FeaturesCpC <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel { ICollection <string> features = new List <string>(); CoreLabel c = cInfo[loc]; CoreLabel c2 = cInfo[loc + 1]; CoreLabel c3 = cInfo[loc + 2]; CoreLabel p = cInfo[loc - 1]; CoreLabel p2 = cInfo[loc - 2]; CoreLabel p3 = cInfo[loc - 3]; string charc = c.GetString <CoreAnnotations.CharAnnotation>(); string charc2 = c2.GetString <CoreAnnotations.CharAnnotation>(); string charc3 = c3.GetString <CoreAnnotations.CharAnnotation>(); string charp = p.GetString <CoreAnnotations.CharAnnotation>(); string charp2 = p2.GetString <CoreAnnotations.CharAnnotation>(); string charp3 = p3.GetString <CoreAnnotations.CharAnnotation>(); int cI = c.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec = (cI != null ? cI.ToString() : string.Empty); int c2I = c2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec2 = (c2I != null ? c2I.ToString() : string.Empty); int c3I = c3.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec3 = (c3I != null ? c3I.ToString() : string.Empty); int pI = p.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep = (pI != null ? pI.ToString() : string.Empty); int p2I = p2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep2 = (p2I != null ? p2I.ToString() : string.Empty); if (flags.dictionary != null || flags.serializedDictionary != null) { DictionaryFeaturesCpC(typeof(CoreAnnotations.LBeginAnnotation), typeof(CoreAnnotations.LMiddleAnnotation), typeof(CoreAnnotations.LEndAnnotation), string.Empty, features, p2, p, c, c2); } if (flags.dictionary2 != null) { DictionaryFeaturesCpC(typeof(CoreAnnotations.D2_LBeginAnnotation), typeof(CoreAnnotations.D2_LMiddleAnnotation), typeof(CoreAnnotations.D2_LEndAnnotation), "-D2-", features, p2, p, c, c2); } /* * N-gram features. N is upto 2. */ if (flags.useWord2) { // features.add(charc +"c"); // features.add(charc2+"c2"); // features.add(charp +"p"); // features.add(charp + charc +"pc"); // features.add(charc + charc2 +"cc2"); // // cdm: need hyphen so you can see which of charp or charc2 is null.... // features.add(charp + "-" + charc2 + "pc2"); features.Add(charc + "::c"); features.Add(charc2 + "::c1"); features.Add(charp + "::p"); features.Add(charp2 + "::p2"); // trying to restore the features that Huihsin described in SIGHAN 2005 paper features.Add(charc + charc2 + "::cn"); // (*) features.Add(charp + charc + "::pc"); features.Add(charp + charc2 + "::pn"); features.Add(charp2 + charp + "::p2p"); features.Add(charp2 + charc + "::p2c"); features.Add(charc2 + charc + "::n2c"); } // todo: this is messed up: Same as one above at (*); should be cn2 = charc + charc3 + "::cn2" if (flags.useFeaturesCpC4gram || flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) { // todo: Both these features duplicate ones already in useWord2 features.Add(charp2 + charp + "p2p"); features.Add(charp2 + "p2"); } if (flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) { features.Add(charc3 + "c3"); features.Add(charc2 + charc3 + "c2c3"); } if (flags.useFeaturesCpC6gram) { features.Add(charp3 + "p3"); features.Add(charp3 + charp2 + "p3p2"); } if (flags.useGoodForNamesCpC) { // these 2 features should be distinctively good at biasing from // picking up a Chinese family name in the p2 or p3 positions: // familyName X X startWord AND familyName X startWord // But actually they seem to have negative value. features.Add(charp2 + "p2"); features.Add(charp3 + "p3"); } if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram) { features.Add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3"); } if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) { features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4"); } if (flags.useUnicodeType5gram) { features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5"); } if (flags.useWordUTypeConjunctions2) { features.Add(uTypep + charc + "putcc"); features.Add(charp + uTypec + "pccut"); } if (flags.useWordUTypeConjunctions3) { features.Add(uTypep2 + uTypep + charc + "p2utputcc"); features.Add(uTypep + charc + uTypec2 + "putccc2ut"); features.Add(charc + uTypec2 + uTypec3 + "ccc2utc3ut"); } if (flags.useUnicodeBlock) { features.Add(p.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c2.GetString <CoreAnnotations.UBlockAnnotation>() + "-uBlock"); } if (flags.useShapeStrings) { if (flags.useShapeStrings1) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + "ps"); features.Add(c.GetString <CoreAnnotations.ShapeAnnotation>() + "cs"); features.Add(c2.GetString <CoreAnnotations.ShapeAnnotation>() + "c2s"); } if (flags.useShapeStrings3) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "pscsc2s"); } if (flags.useShapeStrings4) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "p2spscsc2s"); } if (flags.useShapeStrings5) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation >() + "p2spscsc2sc3s"); } if (flags.useWordShapeConjunctions2) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "pscc"); features.Add(charp + c.GetString <CoreAnnotations.ShapeAnnotation>() + "pccs"); } if (flags.useWordShapeConjunctions3) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "p2spscc"); features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "psccc2s"); features.Add(charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation>() + "ccc2sc3s"); } } /* * Radical N-gram features. N is upto 4. * Smoothing method of N-gram, because there are too many characters in Chinese. * (It works better than N-gram when they are used individually. less sparse) */ char rcharc; char rcharc2; char rcharp; char rcharp2; if (charc.Length == 0) { rcharc = 'n'; } else { rcharc = RadicalMap.GetRadical(charc[0]); } if (charc2.Length == 0) { rcharc2 = 'n'; } else { rcharc2 = RadicalMap.GetRadical(charc2[0]); } if (charp.Length == 0) { rcharp = 'n'; } else { rcharp = RadicalMap.GetRadical(charp[0]); } if (charp2.Length == 0) { rcharp2 = 'n'; } else { rcharp2 = RadicalMap.GetRadical(charp2[0]); } if (flags.useRad2) { features.Add(rcharc + "rc"); features.Add(rcharc2 + "rc2"); features.Add(rcharp + "rp"); features.Add(rcharp + rcharc + "rprc"); features.Add(rcharc + rcharc2 + "rcrc2"); features.Add(rcharp + rcharc + rcharc2 + "rprcrc2"); } if (flags.useRad2b) { features.Add(rcharc + "rc"); features.Add(rcharc2 + "rc2"); features.Add(rcharp + "rp"); features.Add(rcharp + rcharc + "rprc"); features.Add(rcharc + rcharc2 + "rcrc2"); features.Add(rcharp2 + rcharp + "rp2rp"); } /* Non-word dictionary: SEEN bi-gram marked as non-word. * This is frickin' useful. I hadn't realized. CDM Oct 2007. */ if (flags.useDict2) { NonDict2 nd = new NonDict2(flags); features.Add(nd.CheckDic(charp + charc, flags) + "nondict"); } if (flags.useOutDict2) { if (outDict == null) { CreateOutDict(); } features.Add(outDict.GetW(charp + charc) + "outdict"); // -1 0 features.Add(outDict.GetW(charc + charc2) + "outdict"); // 0 1 features.Add(outDict.GetW(charp2 + charp) + "outdict"); // -2 -1 features.Add(outDict.GetW(charp2 + charp + charc) + "outdict"); // -2 -1 0 features.Add(outDict.GetW(charp3 + charp2 + charp) + "outdict"); // -3 -2 -1 features.Add(outDict.GetW(charp + charc + charc2) + "outdict"); // -1 0 1 features.Add(outDict.GetW(charc + charc2 + charc3) + "outdict"); // 0 1 2 features.Add(outDict.GetW(charp + charc + charc2 + charc3) + "outdict"); } // -1 0 1 2 /* * (CTB/ASBC/HK/PK/MSR) POS information of each characters. * If a character falls into some function categories, * it is very likely there is a boundary. * A lot of Chinese function words belong to single characters. * This feature is also good for numbers and punctuations. * DE* are grouped into DE. */ if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2 || flags.usePKChar2 || flags.useMSRChar2) { string[] tagsets; // the "useChPos" now only works for CTB and PK if (flags.useChPos) { if (flags.useCTBChar2) { tagsets = new string[] { "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" }; } else { if (flags.usePKChar2) { //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"}; tagsets = new string[] { "2", "3", "4" }; } else { throw new Exception("only support settings for CTB and PK now."); } } } else { //logger.info("Using Derived features"); tagsets = new string[] { "2", "3", "4" }; } if (taDetector == null) { CreateTADetector(); } foreach (string tag in tagsets) { features.Add(taDetector.CheckDic(tag + "p", charp) + taDetector.CheckDic(tag + "i", charp) + taDetector.CheckDic(tag + "s", charc) + taDetector.CheckInDic(charp) + taDetector.CheckInDic(charc) + tag + "prep-sufc"); } } //features.add("|ctbchar2"); /* * In error analysis, we found English words and numbers are often separated. * Rule 1: isNumber feature: check if the current and previous char is a number. * Rule 2: Disambiguation of time point and time duration. * Rule 3: isEnglish feature: check if the current and previous character is an english letter. * Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names. * Most of PUs are a good indicator for word boundary, but - and . is a strong indicator that there is no boundry within a previous , a follow char and it. */ if (flags.useRule2) { /* Reduplication features */ // previous character == current character if (charp.Equals(charc)) { features.Add("11-R2"); } // previous character == next character if (charp.Equals(charc2)) { features.Add("22-R2"); } // current character == next next character // fire only when usePk and useHk are both false. // Notice: this should be (almost) the same as the "22" feature, but we keep it for now. if (!flags.usePk && !flags.useHk) { if (charc.Equals(charc2)) { features.Add("33-R2"); } } char cur1 = ' '; char cur2 = ' '; char cur = ' '; char pre = ' '; // actually their length must be either 0 or 1 if (charc2.Length > 0) { cur1 = charc2[0]; } if (charc3.Length > 0) { cur2 = charc3[0]; } if (charc.Length > 0) { cur = charc[0]; } if (charp.Length > 0) { pre = charp[0]; } string prer = rcharp.ToString(); // the radical of previous character Pattern E = Pattern.Compile("[a-zA-Z]"); Pattern N = Pattern.Compile("[0-9]"); Matcher m = E.Matcher(charp); Matcher ce = E.Matcher(charc); Matcher pe = E.Matcher(charp2); Matcher cn = N.Matcher(charc); Matcher pn = N.Matcher(charp2); // if current and previous characters are numbers... if (cur >= '0' && cur <= '9' && pre >= '0' && pre <= '9') { if (cur == '9' && pre == '1' && cur1 == '9' && cur2 >= '0' && cur2 <= '9') { //199x features.Add("YR-R2"); } else { features.Add("2N-R2"); } } else { // if current and previous characters are not both numbers // but previous char is a number // i.e. patterns like "1N" , "2A", etc if (pre >= '0' && pre <= '9') { features.Add("1N-R2"); } else { // if previous character is an English character if (m.Matches()) { features.Add("E-R2"); } else { // if the previous character contains no radical (and it exist) if (prer.Equals(".") && charp.Length == 1) { if (ce.Matches()) { features.Add("PU+E-R2"); } if (pe.Matches()) { features.Add("E+PU-R2"); } if (cn.Matches()) { features.Add("PU+N-R2"); } if (pn.Matches()) { features.Add("N+PU-R2"); } features.Add("PU-R2"); } } } } string engType = IsEnglish(charp, charc); string engPU = IsEngPU(charp); if (!engType.Equals(string.Empty)) { features.Add(engType); } if (!engPU.Equals(string.Empty) && !engType.Equals(string.Empty)) { StringBuilder sb = new StringBuilder(); sb.Append(engPU).Append(engType).Append("R2"); features.Add(sb.ToString()); } } //end of use rule // features using "Character.getType" information! string origS = c.GetString <CoreAnnotations.OriginalCharAnnotation>(); char origC = ' '; if (origS.Length > 0) { origC = origS[0]; } int type = char.GetType(origC); switch (type) { case char.UppercaseLetter: case char.LowercaseLetter: { // A-Z and full-width A-Z // a-z and full-width a-z features.Add("CHARTYPE-LETTER"); break; } case char.DecimalDigitNumber: { features.Add("CHARTYPE-DECIMAL_DIGIT_NUMBER"); break; } case char.OtherLetter: { // mostly chinese chars features.Add("CHARTYPE-OTHER_LETTER"); break; } default: { // other types features.Add("CHARTYPE-MISC"); break; } } features.Add("cliqueCpC"); return(features); }
protected internal virtual ICollection <string> FeaturesC <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel { ICollection <string> features = new List <string>(); CoreLabel c = cInfo[loc]; CoreLabel c2 = cInfo[loc + 1]; CoreLabel c3 = cInfo[loc + 2]; CoreLabel p = cInfo[loc - 1]; CoreLabel p2 = cInfo[loc - 2]; CoreLabel p3 = cInfo[loc - 3]; string charc = c.GetString <CoreAnnotations.CharAnnotation>(); string charc2 = c2.GetString <CoreAnnotations.CharAnnotation>(); string charc3 = c3.GetString <CoreAnnotations.CharAnnotation>(); string charp = p.GetString <CoreAnnotations.CharAnnotation>(); string charp2 = p2.GetString <CoreAnnotations.CharAnnotation>(); string charp3 = p3.GetString <CoreAnnotations.CharAnnotation>(); int cI = c.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec = (cI != null ? cI.ToString() : string.Empty); int c2I = c2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec2 = (c2I != null ? c2I.ToString() : string.Empty); int c3I = c3.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec3 = (c3I != null ? c3I.ToString() : string.Empty); int pI = p.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep = (pI != null ? pI.ToString() : string.Empty); int p2I = p2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep2 = (p2I != null ? p2I.ToString() : string.Empty); /* N-gram features. N is upto 2. */ if (flags.useWord1) { // features.add(charc +"c"); // features.add(charc2+"c2"); // features.add(charp +"p"); // features.add(charp + charc +"pc"); // features.add(charc + charc2 +"cc2"); // cdm: need hyphen so you can see which of charp or charc2 is null.... // features.add(charp + "-" + charc2 + "pc2"); features.Add(charc + "::c"); features.Add(charc2 + "::c2"); features.Add(charp + "::p"); features.Add(charp2 + "::p2"); // trying to restore the features that Huishin described in SIGHAN 2005 paper features.Add(charc + charc2 + "::cn"); features.Add(charc + charc3 + "::cn2"); features.Add(charp + charc + "::pc"); features.Add(charp + charc2 + "::pn"); features.Add(charp2 + charp + "::p2p"); features.Add(charp2 + charc + "::p2c"); features.Add(charc2 + charc + "::n2c"); } if (flags.dictionary != null || flags.serializedDictionary != null) { DictionaryFeaturesC(typeof(CoreAnnotations.LBeginAnnotation), typeof(CoreAnnotations.LMiddleAnnotation), typeof(CoreAnnotations.LEndAnnotation), string.Empty, features, p, c, c2); } if (flags.dictionary2 != null) { DictionaryFeaturesC(typeof(CoreAnnotations.D2_LBeginAnnotation), typeof(CoreAnnotations.D2_LMiddleAnnotation), typeof(CoreAnnotations.D2_LEndAnnotation), "-D2-", features, p, c, c2); } if (flags.useFeaturesC4gram || flags.useFeaturesC5gram || flags.useFeaturesC6gram) { features.Add(charp2 + charp + "p2p"); features.Add(charp2 + "p2"); } if (flags.useFeaturesC5gram || flags.useFeaturesC6gram) { features.Add(charc3 + "c3"); features.Add(charc2 + charc3 + "c2c3"); } if (flags.useFeaturesC6gram) { features.Add(charp3 + "p3"); features.Add(charp3 + charp2 + "p3p2"); } if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram) { features.Add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3"); } if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) { features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4"); } if (flags.useUnicodeType5gram) { features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5"); } if (flags.useUnicodeBlock) { features.Add(p.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c2.GetString <CoreAnnotations.UBlockAnnotation>() + "-uBlock"); } if (flags.useShapeStrings) { if (flags.useShapeStrings1) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + "ps"); features.Add(c.GetString <CoreAnnotations.ShapeAnnotation>() + "cs"); features.Add(c2.GetString <CoreAnnotations.ShapeAnnotation>() + "c2s"); } if (flags.useShapeStrings3) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "pscsc2s"); } if (flags.useShapeStrings4) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "p2spscsc2s"); } if (flags.useShapeStrings5) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation >() + "p2spscsc2sc3s"); } } features.Add("cliqueC"); return(features); }