private static ICounter<string> GetFeatures(ClustererDataLoader.ClustererDoc doc, Clusterer.Cluster c1, Clusterer.Cluster c2, Clusterer.GlobalFeatures gf)
{
    // Check the feature cache first; features for a (c1, c2, currentIndex) merge are stored compressed.
    Clusterer.MergeKey key = new Clusterer.MergeKey(c1, c2, gf.currentIndex);
    CompressedFeatureVector cfv = featuresCache[key];
    ICounter<string> features = cfv == null ? null : compressor.Uncompress(cfv);
    if (features != null)
    {
        featuresCacheHits += isTraining;
        return features;
    }
    featuresCacheMisses += isTraining;

    // Document-level (global) features.
    features = new ClassicCounter<string>();
    if (gf.anaphorSeen)
    {
        features.IncrementCount("anaphorSeen");
    }
    features.IncrementCount("docSize", gf.docSize);
    features.IncrementCount("percentComplete", gf.currentIndex / (double)gf.size);
    features.IncrementCount("bias", 1.0);

    // Anaphoricity score of whichever cluster's earliest mention appears later in the document.
    int earliest1 = EarliestMention(c1, doc);
    int earliest2 = EarliestMention(c2, doc);
    if (doc.mentionIndices[earliest1] > doc.mentionIndices[earliest2])
    {
        int tmp = earliest1;
        earliest1 = earliest2;
        earliest2 = tmp;
    }
    features.IncrementCount("anaphoricity", doc.anaphoricityScores.GetCount(earliest2));

    if (c1.mentions.Count == 1 && c2.mentions.Count == 1)
    {
        // Singleton clusters: use the pairwise classification and ranking scores directly.
        Pair<int, int> mentionPair = new Pair<int, int>(c1.mentions[0], c2.mentions[0]);
        features.AddAll(AddSuffix(GetFeatures(doc, mentionPair, doc.classificationScores), "-classification"));
        features.AddAll(AddSuffix(GetFeatures(doc, mentionPair, doc.rankingScores), "-ranking"));
        features = AddSuffix(features, "-single");
    }
    else
    {
        // Larger clusters: aggregate scores over all cross-cluster mention pairs.
        IList<Pair<int, int>> between = new List<Pair<int, int>>();
        foreach (int m1 in c1.mentions)
        {
            foreach (int m2 in c2.mentions)
            {
                between.Add(new Pair<int, int>(m1, m2));
            }
        }
        features.AddAll(AddSuffix(GetFeatures(doc, between, doc.classificationScores), "-classification"));
        features.AddAll(AddSuffix(GetFeatures(doc, between, doc.rankingScores), "-ranking"));
    }

    featuresCache[key] = compressor.Compress(features);
    return features;
}
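// A minimal, self-contained sketch of the caching pattern used above: feature counters are
// compressed into parallel index/value arrays before being stored, and uncompressed on a
// cache hit, so repeated cluster-merge evaluations during training do not keep full
// string-keyed maps around. The SimpleCompressor and CachedVector types below are
// hypothetical stand-ins for the port's Compressor<string> and CompressedFeatureVector,
// not their actual API.
using System.Collections.Generic;

public sealed class CachedVector
{
    public int[] Keys;      // indices into the compressor's vocabulary
    public double[] Values; // parallel array of feature values
}

public sealed class SimpleCompressor
{
    private readonly Dictionary<string, int> _index = new Dictionary<string, int>();
    private readonly List<string> _inverse = new List<string>();

    public CachedVector Compress(IDictionary<string, double> features)
    {
        var keys = new int[features.Count];
        var values = new double[features.Count];
        int i = 0;
        foreach (var kv in features)
        {
            // Intern each feature name the first time it is seen.
            if (!_index.TryGetValue(kv.Key, out int id))
            {
                id = _inverse.Count;
                _index[kv.Key] = id;
                _inverse.Add(kv.Key);
            }
            keys[i] = id;
            values[i] = kv.Value;
            i++;
        }
        return new CachedVector { Keys = keys, Values = values };
    }

    public Dictionary<string, double> Uncompress(CachedVector cv)
    {
        var features = new Dictionary<string, double>(cv.Keys.Length);
        for (int i = 0; i < cv.Keys.Length; i++)
        {
            features[_inverse[cv.Keys[i]]] = cv.Values[i];
        }
        return features;
    }
}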
static ChineseQuantifiableEntityNormalizer()
{
    // Entity types that are quantifiable
    // used by money
    // used by money
    // Patterns we need
    // TODO (yuhao): here we are not considering 1) negative numbers, 2) Chinese traditional characters
    // This is the all-literal-number-characters sequence, excluding unit characters like 十 or 万
    // The decimal part of a float number should be exactly a literal number sequence without units
    // Used by quantity modifiers
    // All the tags we need
    // Static initialization of useful properties
    quantifiable = Generics.NewHashSet();
    quantifiable.Add(NumberTag);
    quantifiable.Add(DateTag);
    quantifiable.Add(TimeTag);
    quantifiable.Add(MoneyTag);
    quantifiable.Add(PercentTag);
    quantifiable.Add(OrdinalTag);

    quantityUnitToValues = new ClassicCounter<string>();
    quantityUnitToValues.SetCount("十", 10.0);
    quantityUnitToValues.SetCount("百", 100.0);
    quantityUnitToValues.SetCount("千", 1000.0);
    quantityUnitToValues.SetCount("万", 10000.0);
    quantityUnitToValues.SetCount("亿", 100000000.0);

    wordsToValues = new ClassicCounter<string>();
    wordsToValues.SetCount("零", 0.0);
    wordsToValues.SetCount("〇", 0.0);
    wordsToValues.SetCount("一", 1.0);
    wordsToValues.SetCount("二", 2.0);
    wordsToValues.SetCount("两", 2.0);
    wordsToValues.SetCount("三", 3.0);
    wordsToValues.SetCount("四", 4.0);
    wordsToValues.SetCount("五", 5.0);
    wordsToValues.SetCount("六", 6.0);
    wordsToValues.SetCount("七", 7.0);
    wordsToValues.SetCount("八", 8.0);
    wordsToValues.SetCount("九", 9.0);
    wordsToValues.AddAll(quantityUnitToValues); // all units are also quantifiable individually

    multiCharCurrencyWords = Generics.NewHashMap();
    multiCharCurrencyWords["美元"] = '$';
    multiCharCurrencyWords["美分"] = '$';
    multiCharCurrencyWords["英镑"] = '£';
    multiCharCurrencyWords["先令"] = '£';
    multiCharCurrencyWords["便士"] = '£';
    multiCharCurrencyWords["欧元"] = '€';
    multiCharCurrencyWords["日元"] = '¥';
    multiCharCurrencyWords["韩元"] = '₩';

    oneCharCurrencyWords = Generics.NewHashMap();
    oneCharCurrencyWords["刀"] = '$';
    oneCharCurrencyWords["镑"] = '£';
    oneCharCurrencyWords["元"] = '元';
    // We follow the tradition in English to use 元 instead of ¥ for RMB.
    // For all other currencies, we use the default currency symbol $.

    yearModifiers = Generics.NewHashMap();
    yearModifiers["前"] = -2;
    yearModifiers["去"] = -1;
    yearModifiers["上"] = -1;
    yearModifiers["今"] = 0;
    yearModifiers["同"] = 0;
    yearModifiers["此"] = 0;
    yearModifiers["该"] = 0;
    yearModifiers["本"] = 0;
    yearModifiers["明"] = 1;
    yearModifiers["来"] = 1;
    yearModifiers["下"] = 1;
    yearModifiers["后"] = 2;

    monthDayModifiers = Generics.NewHashMap();
    monthDayModifiers["昨"] = -1;
    monthDayModifiers["上"] = -1;
    monthDayModifiers["今"] = 0;
    monthDayModifiers["同"] = 0;
    monthDayModifiers["此"] = 0;
    monthDayModifiers["该"] = 0;
    monthDayModifiers["本"] = 0;
    monthDayModifiers["来"] = 1;
    monthDayModifiers["明"] = 1;
    monthDayModifiers["下"] = 1;

    // Map full-width digits to their half-width (ASCII) equivalents.
    fullDigitToHalfDigit = Generics.NewHashMap();
    fullDigitToHalfDigit["１"] = "1";
    fullDigitToHalfDigit["２"] = "2";
    fullDigitToHalfDigit["３"] = "3";
    fullDigitToHalfDigit["４"] = "4";
    fullDigitToHalfDigit["５"] = "5";
    fullDigitToHalfDigit["６"] = "6";
    fullDigitToHalfDigit["７"] = "7";
    fullDigitToHalfDigit["８"] = "8";
    fullDigitToHalfDigit["９"] = "9";
    fullDigitToHalfDigit["０"] = "0";
}
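// A simplified, hypothetical sketch (not the normalizer's actual logic) of how the
// wordsToValues and quantityUnitToValues tables above combine to turn a literal Chinese
// numeral such as "三千五百二十一" into 3521. It handles a single run of digit and unit
// characters; multi-section numbers that mix 亿 and 万 (e.g. "一亿两千万"), negatives, and
// decimal parts are out of scope here.
using System;
using System.Collections.Generic;

public static class ChineseNumeralSketch
{
    private static readonly Dictionary<char, double> Digits = new Dictionary<char, double>
    {
        ['零'] = 0, ['〇'] = 0, ['一'] = 1, ['二'] = 2, ['两'] = 2, ['三'] = 3, ['四'] = 4,
        ['五'] = 5, ['六'] = 6, ['七'] = 7, ['八'] = 8, ['九'] = 9
    };

    private static readonly Dictionary<char, double> Units = new Dictionary<char, double>
    {
        ['十'] = 10, ['百'] = 100, ['千'] = 1000, ['万'] = 10000, ['亿'] = 100000000
    };

    public static double Parse(string numeral)
    {
        double total = 0;   // value accumulated so far
        double current = 0; // pending digit waiting for its unit
        foreach (char c in numeral)
        {
            if (Digits.TryGetValue(c, out double digit))
            {
                current = digit;
            }
            else if (Units.TryGetValue(c, out double unit))
            {
                if (unit >= 10000)
                {
                    // 万 and 亿 scale everything accumulated so far (e.g. 十万 = 100000).
                    total = (total + current) * unit;
                }
                else
                {
                    // A bare unit counts as one of that unit (e.g. 十五 = 15).
                    total += (current == 0 ? 1 : current) * unit;
                }
                current = 0;
            }
            else
            {
                throw new ArgumentException($"Unexpected character '{c}'");
            }
        }
        return total + current; // a trailing digit is the ones place
    }
}
// Example: ChineseNumeralSketch.Parse("三千五百二十一") == 3521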
public virtual ICounter<string> GetFeatures(Example example, IDictionary<int, CompressedFeatureVector> mentionFeatures, Compressor<string> compressor)
{
    ICounter<string> features = new ClassicCounter<string>();
    ICounter<string> pairFeatures = new ClassicCounter<string>();
    ICounter<string> features1 = new ClassicCounter<string>();
    ICounter<string> features2 = compressor.Uncompress(mentionFeatures[example.mentionId2]);
    if (!example.IsNewLink())
    {
        // Coreference-link example: also uncompress the pairwise and antecedent-mention features.
        System.Diagnostics.Debug.Assert(!anaphoricityClassifier);
        pairFeatures = compressor.Uncompress(example.pairwiseFeatures);
        features1 = compressor.Uncompress(mentionFeatures[example.mentionId1]);
    }
    else
    {
        // New-link (anaphoricity) example: only the current mention's features plus a bias.
        features2.IncrementCount("bias");
    }
    if (!disallowedPrefixes.IsEmpty())
    {
        features1 = FilterOut(features1, disallowedPrefixes);
        features2 = FilterOut(features2, disallowedPrefixes);
        pairFeatures = FilterOut(pairFeatures, disallowedPrefixes);
    }
    IList<string> ids1 = example.IsNewLink() ? new List<string>() : Identifiers(features1, example.mentionType1);
    IList<string> ids2 = Identifiers(features2, example.mentionType2);
    features.AddAll(pairFeatures);
    foreach (string id1 in ids1)
    {
        foreach (string id2 in ids2)
        {
            if (pairConjunctions.Contains(MetaFeatureExtractor.PairConjunction.First))
            {
                features.AddAll(GetConjunction(pairFeatures, "_m1=" + id1));
            }
            if (pairConjunctions.Contains(MetaFeatureExtractor.PairConjunction.Last))
            {
                features.AddAll(GetConjunction(pairFeatures, "_m2=" + id2));
            }
            if (pairConjunctions.Contains(MetaFeatureExtractor.PairConjunction.Both))
            {
                features.AddAll(GetConjunction(pairFeatures, "_ms=" + id1 + "_" + id2));
            }
            if (singleConjunctions.Contains(MetaFeatureExtractor.SingleConjunction.Index))
            {
                features.AddAll(GetConjunction(features1, "_1"));
                features.AddAll(GetConjunction(features2, "_2"));
            }
            if (singleConjunctions.Contains(MetaFeatureExtractor.SingleConjunction.IndexCurrent))
            {
                features.AddAll(GetConjunction(features1, "_1" + "_m=" + id1));
                features.AddAll(GetConjunction(features2, "_2" + "_m=" + id2));
            }
            if (singleConjunctions.Contains(MetaFeatureExtractor.SingleConjunction.IndexLast))
            {
                features.AddAll(GetConjunction(features1, "_1" + "_m2=" + id2));
                features.AddAll(GetConjunction(features2, "_2" + "_m2=" + id2));
            }
            if (singleConjunctions.Contains(MetaFeatureExtractor.SingleConjunction.IndexOther))
            {
                features.AddAll(GetConjunction(features1, "_1" + "_m=" + id2));
                features.AddAll(GetConjunction(features2, "_2" + "_m=" + id1));
            }
            if (singleConjunctions.Contains(MetaFeatureExtractor.SingleConjunction.IndexBoth))
            {
                features.AddAll(GetConjunction(features1, "_1" + "_ms=" + id1 + "_" + id2));
                features.AddAll(GetConjunction(features2, "_2" + "_ms=" + id1 + "_" + id2));
            }
        }
    }
    if (example.IsNewLink())
    {
        // Anaphoricity examples get the raw mention features plus a copy conjoined with the
        // mention identifier, and every resulting feature is suffixed with "_NEW".
        features.AddAll(features2);
        features.AddAll(GetConjunction(features2, "_m=" + ids2[0]));
        ICounter<string> newFeatures = new ClassicCounter<string>();
        foreach (KeyValuePair<string, double> e in features.EntrySet())
        {
            newFeatures.IncrementCount(e.Key + "_NEW", e.Value);
        }
        features = newFeatures;
    }
    return features;
}
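// A minimal, self-contained sketch of the conjunction trick used above, assuming (as an
// illustration, not the port's actual GetConjunction) that conjoining simply appends a
// suffix to each feature name. Duplicating features with mention-type identifiers lets a
// downstream linear model learn type-specific weights for the same base feature. The
// feature names and mention types below are made up for the example.
using System;
using System.Collections.Generic;
using System.Linq;

public static class ConjunctionSketch
{
    // Returns a copy of `features` whose keys have `suffix` appended.
    public static Dictionary<string, double> GetConjunction(
        IReadOnlyDictionary<string, double> features, string suffix)
    {
        return features.ToDictionary(kv => kv.Key + suffix, kv => kv.Value);
    }

    public static void Main()
    {
        var pairFeatures = new Dictionary<string, double>
        {
            ["head-match"] = 1.0,
            ["sentence-distance"] = 3.0
        };

        // Conjoin with identifiers of the two mentions, analogous to the PairConjunction options.
        var conjoined = new Dictionary<string, double>(pairFeatures);
        foreach (var kv in GetConjunction(pairFeatures, "_m1=PROPER")) conjoined[kv.Key] = kv.Value;
        foreach (var kv in GetConjunction(pairFeatures, "_ms=PROPER_PRONOMINAL")) conjoined[kv.Key] = kv.Value;

        foreach (var kv in conjoined)
        {
            Console.WriteLine($"{kv.Key} = {kv.Value}");
        }
        // Prints e.g. "head-match = 1", "head-match_m1=PROPER = 1", ...
    }
}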