/// <exception cref="System.IO.IOException"/>
private void WriteObject(ObjectOutputStream stream)
{
    // log.info("\nBefore compression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    // Temporarily swap in compressed counters, dropping wildcard and unseen-word
    // entries, so that only informative dependencies are serialized.
    ClassicCounter<IntDependency> fullArgCounter = argCounter;
    argCounter = new ClassicCounter<IntDependency>();
    foreach (IntDependency dependency in fullArgCounter.KeySet())
    {
        if (dependency.head != wildTW && dependency.arg != wildTW && dependency.head.word != -1 && dependency.arg.word != -1)
        {
            argCounter.IncrementCount(dependency, fullArgCounter.GetCount(dependency));
        }
    }
    ClassicCounter<IntDependency> fullStopCounter = stopCounter;
    stopCounter = new ClassicCounter<IntDependency>();
    foreach (IntDependency dependency1 in fullStopCounter.KeySet())
    {
        if (dependency1.head.word != -1)
        {
            stopCounter.IncrementCount(dependency1, fullStopCounter.GetCount(dependency1));
        }
    }
    // log.info("After compression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    stream.DefaultWriteObject();
    // Restore the full counters for continued in-memory use.
    argCounter = fullArgCounter;
    stopCounter = fullStopCounter;
}
public virtual ClassicCounter<L> ScoresOf(RVFDatum<L, F> example)
{
    ClassicCounter<L> scores = new ClassicCounter<L>();
    Counters.AddInPlace(scores, priors);
    if (addZeroValued)
    {
        Counters.AddInPlace(scores, priorZero);
    }
    foreach (L l in labels)
    {
        double score = 0.0;
        ICounter<F> features = example.AsFeaturesCounter();
        foreach (F f in features.KeySet())
        {
            int value = (int)features.GetCount(f);
            // Pass the int value directly; the original int.Parse(value) does not
            // compile and looks like a mistranslation of Java's Integer.valueOf.
            score += Weight(l, f, value);
            if (addZeroValued)
            {
                score -= Weight(l, f, zero);
            }
        }
        scores.IncrementCount(l, score);
    }
    return scores;
}
public override ICounter<E> Score()
{
    ICounter<E> specificity = new ClassicCounter<E>();
    ICounter<E> sensitivity = new ClassicCounter<E>();
    if (p0Set.KeySet().Count == 0)
    {
        throw new Exception("how come p0set size is empty for " + p0 + "?");
    }
    foreach (KeyValuePair<E, ClassicCounter<CandidatePhrase>> en in patternsandWords4Label.EntrySet())
    {
        int common = CollectionUtils.Intersection(en.Value.KeySet(), p0Set.KeySet()).Count;
        if (common == 0)
        {
            continue;
        }
        if (en.Value.KeySet().Count == 0)
        {
            throw new Exception("how come counter for " + en.Key + " is empty?");
        }
        specificity.SetCount(en.Key, common / (double)en.Value.KeySet().Count);
        sensitivity.SetCount(en.Key, common / (double)p0Set.Size());
    }
    Counters.RetainNonZeros(specificity);
    Counters.RetainNonZeros(sensitivity);
    // F1-style combination: 2 * (product / sum) of sensitivity and specificity.
    ICounter<E> add = Counters.Add(sensitivity, specificity);
    ICounter<E> product = Counters.Product(sensitivity, specificity);
    Counters.RetainNonZeros(product);
    Counters.RetainKeys(product, add.KeySet());
    ICounter<E> finalPat = Counters.Scale(Counters.Division(product, add), 2);
    return finalPat;
}
public virtual void ComputeFinalValues()
{
    double denom = (double)numTrees;
    meanDepth = depth2 / denom;
    meanLength = length2 / denom;
    meanBreadth = breadth2 / denom;
    meanConstituents = phrasalBranchingNum2.TotalCount() / denom;
    meanBranchingFactor = phrasalBranching2.TotalCount() / phrasalBranchingNum2.TotalCount();
    // Compute *actual* stddev (we iterate over the whole population)
    foreach (int d in depths)
    {
        stddevDepth += Math.Pow(d - meanDepth, 2);
    }
    stddevDepth = Math.Sqrt(stddevDepth / denom);
    foreach (int l in lengths)
    {
        stddevLength += Math.Pow(l - meanLength, 2);
    }
    stddevLength = Math.Sqrt(stddevLength / denom);
    foreach (int b in breadths)
    {
        stddevBreadth += Math.Pow(b - meanBreadth, 2);
    }
    stddevBreadth = Math.Sqrt(stddevBreadth / denom);
    meanBranchingByLabel = new ClassicCounter<string>();
    foreach (string label in phrasalBranching2.KeySet())
    {
        double mean = phrasalBranching2.GetCount(label) / phrasalBranchingNum2.GetCount(label);
        meanBranchingByLabel.IncrementCount(label, mean);
    }
    oovWords = Generics.NewHashSet(words.KeySet());
    oovWords.RemoveAll(trainVocab);
    OOVRate = (double)oovWords.Count / (double)words.KeySet().Count;
}
/// <summary>Featurize a given sentence.</summary>
/// <param name="sentence">The sentence to featurize.</param>
/// <returns>A counter encoding the featurized sentence.</returns>
private static ICounter<string> Featurize(ICoreMap sentence)
{
    ClassicCounter<string> features = new ClassicCounter<string>();
    string lastLemma = "^";
    foreach (CoreLabel token in sentence.Get(typeof(CoreAnnotations.TokensAnnotation)))
    {
        string lemma = token.Lemma().ToLower();
        if (number.Matcher(lemma).Matches())
        {
            features.IncrementCount("**num**");
        }
        else
        {
            features.IncrementCount(lemma);
        }
        if (alpha.Matcher(lemma).Matches())
        {
            features.IncrementCount(lastLemma + "__" + lemma);
            lastLemma = lemma;
        }
    }
    features.IncrementCount(lastLemma + "__$");
    return features;
}
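// Worked example (an illustrative trace, not from the source; `number` and
// `alpha` are the class's regex patterns for numeric and alphabetic lemmas).
// For the lemma sequence ["the", "2", "cats"], the returned counter holds:
//   unigrams: "the", "**num**", "cats"
//   bigrams:  "^__the" (sentence start), "the__cats"
//             (a numeric token does not update lastLemma, so "the" bridges it)
//   boundary: "cats__$"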
// Records the number of times a word/tag pair was seen in the training data.
// Counts of each tag (stored as a Label) on unknown words.
// tag (Label) --> signature --> count
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
    seenCounter = new ClassicCounter<IntTaggedWord>();
    unSeenCounter = new ClassicCounter<IntTaggedWord>();
    tagHash = Generics.NewHashMap();
    tc = new ClassicCounter<ILabel>();
    c = Generics.NewHashMap();
    seenEnd = Generics.NewHashSet();
    useEnd = (op.lexOptions.unknownSuffixSize > 0 && op.lexOptions.useUnknownWordSignatures > 0);
    useFirstCap = op.lexOptions.useUnknownWordSignatures > 0;
    useGT = (op.lexOptions.useUnknownWordSignatures == 0);
    useFirst = false;
    if (useFirst)
    {
        log.Info("Including first letter for unknown words.");
    }
    if (useFirstCap)
    {
        log.Info("Including whether first letter is capitalized for unknown words.");
    }
    if (useEnd)
    {
        log.Info("Classing unknown words as the average of their equivalents by identity of the last " + op.lexOptions.unknownSuffixSize + " letters.");
    }
    if (useGT)
    {
        log.Info("Using Good-Turing smoothing for unknown words.");
    }
    this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);
    this.unknownGTTrainer = (useGT) ? new UnknownGTTrainer() : null;
    this.model = BuildUWM();
}
public override void Train(TaggedWord tw, int loc, double weight)
{
    if (useGT)
    {
        unknownGTTrainer.Train(tw, weight);
    }
    // scan data
    string word = tw.Word();
    string subString = model.GetSignature(word, loc);
    ILabel tag = new Tag(tw.Tag());
    if (!c.Contains(tag))
    {
        c[tag] = new ClassicCounter<string>();
    }
    c[tag].IncrementCount(subString, weight);
    tc.IncrementCount(tag, weight);
    seenEnd.Add(subString);
    string tagStr = tw.Tag();
    IntTaggedWord iW = new IntTaggedWord(word, IntTaggedWord.Any, wordIndex, tagIndex);
    seenCounter.IncrementCount(iW, weight);
    if (treesRead > indexToStartUnkCounting)
    {
        // Start tallying unseen-word statistics only once some way through the
        // trees; treesRead uses 1-based counting.
        if (seenCounter.GetCount(iW) < 2)
        {
            IntTaggedWord iT = new IntTaggedWord(IntTaggedWord.Any, tagStr, wordIndex, tagIndex);
            unSeenCounter.IncrementCount(iT, weight);
            unSeenCounter.IncrementCount(UnknownWordModelTrainerConstants.NullItw, weight);
        }
    }
}
/// <summary>Method to convert features from counts to L1-normalized TF-IDF based features.</summary>
/// <param name="datum">Datum with a collection of features.</param>
/// <param name="featureDocCounts">A counter of doc-count for each feature.</param>
/// <returns>RVFDatum with L1-normalized TF-IDF features.</returns>
public virtual RVFDatum<L, F> GetL1NormalizedTFIDFDatum(IDatum<L, F> datum, ICounter<F> featureDocCounts)
{
    ICounter<F> tfidfFeatures = new ClassicCounter<F>();
    foreach (F feature in datum.AsFeatures())
    {
        if (featureDocCounts.ContainsKey(feature))
        {
            tfidfFeatures.IncrementCount(feature, 1.0);
        }
    }
    double l1norm = 0;
    foreach (F feature1 in tfidfFeatures.KeySet())
    {
        // Smoothed IDF: log((N + 1) / (df + 0.5)), where N is the dataset size.
        double idf = Math.Log(((double)(this.Size() + 1)) / (featureDocCounts.GetCount(feature1) + 0.5));
        double tf = tfidfFeatures.GetCount(feature1);
        tfidfFeatures.SetCount(feature1, tf * idf);
        l1norm += tf * idf;
    }
    foreach (F feature2 in tfidfFeatures.KeySet())
    {
        double tfidf = tfidfFeatures.GetCount(feature2);
        tfidfFeatures.SetCount(feature2, tfidf / l1norm);
    }
    RVFDatum<L, F> rvfDatum = new RVFDatum<L, F>(tfidfFeatures, datum.Label());
    return rvfDatum;
}
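// A minimal usage sketch (hypothetical variables; assumes this method lives on
// an RVFDataset, as the surrounding code suggests). With dataset.Size() == 9
// and a document frequency of 4 for "fever", the IDF term is
// log(10 / 4.5) ≈ 0.80; tf here is the feature's multiplicity in AsFeatures().
// After L1 normalization the weights over the datum's features sum to 1.
//
// ClassicCounter<string> feats = new ClassicCounter<string>();
// feats.IncrementCount("fever", 3.0);
// RVFDatum<string, string> d = new RVFDatum<string, string>(feats, "flu");
// ICounter<string> docCounts = new ClassicCounter<string>();
// docCounts.SetCount("fever", 4.0);
// RVFDatum<string, string> tfidf = dataset.GetL1NormalizedTFIDFDatum(d, docCounts);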
public virtual ICounter<L> ProbabilityOf(IDatum<L, F> example)
{
    // calculate the feature indices and feature values
    int[] featureIndices = LogisticUtils.IndicesOf(example.AsFeatures(), featureIndex);
    double[] featureValues;
    if (example is RVFDatum<object, object>)
    {
        ICollection<double> featureValuesCollection = ((RVFDatum<object, object>)example).AsFeaturesCounter().Values();
        featureValues = LogisticUtils.ConvertToArray(featureValuesCollection);
    }
    else
    {
        // A plain Datum has binary features; treat each as having value 1.0.
        featureValues = new double[example.AsFeatures().Count];
        Arrays.Fill(featureValues, 1.0);
    }
    // calculate probability of each class
    ICounter<L> result = new ClassicCounter<L>();
    int numClasses = labelIndex.Size();
    double[] sigmoids = LogisticUtils.CalculateSigmoids(weights, featureIndices, featureValues);
    for (int c = 0; c < numClasses; c++)
    {
        L label = labelIndex.Get(c);
        result.IncrementCount(label, sigmoids[c]);
    }
    return result;
}
public virtual ICounter<string> GetTopSpeakers(IList<Sieve.MentionData> closestMentions, IList<Sieve.MentionData> closestMentionsBackward, Person.Gender gender, ICoreMap quote, bool overrideGender)
{
    ICounter<string> topSpeakerInRange = new ClassicCounter<string>();
    ICounter<string> topSpeakerInRangeIgnoreGender = new ClassicCounter<string>();
    ICollection<Sieve.MentionData> backwardsMentions = new HashSet<Sieve.MentionData>(closestMentionsBackward);
    foreach (Sieve.MentionData mention in closestMentions)
    {
        double weight = backwardsMentions.Contains(mention) ? BackwardWeight : ForwardWeight;
        if (mention.type.Equals(Name))
        {
            if (!characterMap.Keys.Contains(mention.text))
            {
                continue;
            }
            Person p = characterMap[mention.text][0];
            if ((gender == Person.Gender.Male && p.gender == Person.Gender.Male) || (gender == Person.Gender.Female && p.gender == Person.Gender.Female) || (gender == Person.Gender.Unk))
            {
                topSpeakerInRange.IncrementCount(p.name, weight);
            }
            topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
            if (closestMentions.Count == 128 && closestMentionsBackward.Count == 94)
            {
                // Leftover debugging output keyed to one particular document state.
                System.Console.Out.WriteLine(p.name + " " + weight + " name");
            }
        }
        else if (mention.type.Equals(Pronoun))
        {
            int charBeginKey = doc.Get(typeof(CoreAnnotations.TokensAnnotation))[mention.begin].BeginPosition();
            Person p = DoCoreference(charBeginKey, quote);
            if (p != null)
            {
                if ((gender == Person.Gender.Male && p.gender == Person.Gender.Male) || (gender == Person.Gender.Female && p.gender == Person.Gender.Female) || (gender == Person.Gender.Unk))
                {
                    topSpeakerInRange.IncrementCount(p.name, weight);
                }
                topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
                if (closestMentions.Count == 128 && closestMentionsBackward.Count == 94)
                {
                    // Leftover debugging output keyed to one particular document state.
                    System.Console.Out.WriteLine(p.name + " " + weight + " pronoun");
                }
            }
        }
    }
    if (topSpeakerInRange.Size() > 0)
    {
        return topSpeakerInRange;
    }
    if (gender != Person.Gender.Unk && !overrideGender)
    {
        // Deliberately return the empty gender-matched counter rather than
        // falling back to the gender-ignoring candidates.
        return topSpeakerInRange;
    }
    return topSpeakerInRangeIgnoreGender;
}
private NaiveBayesClassifier<L, F> TrainClassifier(int[][] data, int[] labels, int numFeatures, int numClasses, IIndex<L> labelIndex, IIndex<F> featureIndex)
{
    ICollection<L> labelSet = Generics.NewHashSet();
    NaiveBayesClassifierFactory.NBWeights nbWeights = TrainWeights(data, labels, numFeatures, numClasses);
    ICounter<L> priors = new ClassicCounter<L>();
    double[] pr = nbWeights.priors;
    for (int i = 0; i < pr.Length; i++)
    {
        priors.IncrementCount(labelIndex.Get(i), pr[i]);
        labelSet.Add(labelIndex.Get(i));
    }
    // Weights are keyed by ((label, feature), featureValue).
    ICounter<Pair<Pair<L, F>, Number>> weightsCounter = new ClassicCounter<Pair<Pair<L, F>, Number>>();
    double[][][] wts = nbWeights.weights;
    for (int c = 0; c < numClasses; c++)
    {
        L label = labelIndex.Get(c);
        for (int f = 0; f < numFeatures; f++)
        {
            F feature = featureIndex.Get(f);
            Pair<L, F> p = new Pair<L, F>(label, feature);
            for (int val = 0; val < wts[c][f].Length; val++)
            {
                // Box the int value directly; the original int.Parse(val) appears
                // to be a mistranslation of Java's Integer.valueOf(val).
                Pair<Pair<L, F>, Number> key = new Pair<Pair<L, F>, Number>(p, val);
                weightsCounter.IncrementCount(key, wts[c][f][val]);
            }
        }
    }
    return new NaiveBayesClassifier<L, F>(weightsCounter, priors, labelSet);
}
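// Illustrative key structure (hypothetical label/feature names, not from the
// source): the weight for label "spam", feature "exclamations", feature
// value 2 is stored under Pair(Pair("spam", "exclamations"), 2); presumably
// ScoresOf above looks these per-value weights up when scoring a datum.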
public virtual RVFDatum<L, F> ScaleDatumGaussian(RVFDatum<L, F> datum)
{
    // Scale this dataset before scaling the datum.
    if (means == null || stdevs == null)
    {
        ScaleFeaturesGaussian();
    }
    ICounter<F> scaledFeatures = new ClassicCounter<F>();
    foreach (F feature in datum.AsFeatures())
    {
        int fID = this.featureIndex.IndexOf(feature);
        if (fID >= 0)
        {
            double oldVal = datum.AsFeaturesCounter().GetCount(feature);
            double newVal;
            if (stdevs[fID] != 0)
            {
                // Standard z-score: (x - mean) / stdev.
                newVal = (oldVal - means[fID]) / stdevs[fID];
            }
            else
            {
                newVal = oldVal;
            }
            scaledFeatures.IncrementCount(feature, newVal);
        }
    }
    return new RVFDatum<L, F>(scaledFeatures, datum.Label());
}
public virtual void Train(TaggedWord tw, int loc, double weight)
{
    uwModelTrainer.Train(tw, loc, weight);
    // Count the word-tag pair along with its back-off projections:
    // the tag alone, the word alone, and the grand total.
    IntTaggedWord iTW = new IntTaggedWord(tw.Word(), tw.Tag(), wordIndex, tagIndex);
    seenCounter.IncrementCount(iTW, weight);
    IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
    seenCounter.IncrementCount(iT, weight);
    IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);
    seenCounter.IncrementCount(iW, weight);
    IntTaggedWord i = new IntTaggedWord(nullWord, nullTag);
    seenCounter.IncrementCount(i, weight);
    // rules.add(iTW);
    tags.Add(iT);
    words.Add(iW);
    string tag = tw.Tag();
    string baseTag = op.Langpack().BasicCategory(tag);
    ICounter<string> counts = baseTagCounts[baseTag];
    if (counts == null)
    {
        counts = new ClassicCounter<string>();
        baseTagCounts[baseTag] = counts;
    }
    counts.IncrementCount(tag, weight);
}
private static void ModifyUsingCoreNLPNER(Annotation doc)
{
    Properties ann = new Properties();
    ann.SetProperty("annotators", "pos, lemma, ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(ann, false);
    pipeline.Annotate(doc);
    foreach (ICoreMap sentence in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<EntityMention> entities = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
        if (entities != null)
        {
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            foreach (EntityMention en in entities)
            {
                // System.out.println("old ner tag for " + en.getExtentString() + " was " + en.getType());
                Span s = en.GetExtent();
                // Retag the mention with the most frequent NER tag over its span.
                ICounter<string> allNertagforSpan = new ClassicCounter<string>();
                for (int i = s.Start(); i < s.End(); i++)
                {
                    allNertagforSpan.IncrementCount(tokens[i].Ner());
                }
                string entityNertag = Counters.Argmax(allNertagforSpan);
                en.SetType(entityNertag);
            }
        }
    }
}
public static void Main(string[] args)
{
    Edu.Stanford.Nlp.Classify.RVFDataset<string, string> data = new Edu.Stanford.Nlp.Classify.RVFDataset<string, string>();
    ClassicCounter<string> c1 = new ClassicCounter<string>();
    c1.IncrementCount("fever", 3.5);
    c1.IncrementCount("cough", 1.1);
    c1.IncrementCount("congestion", 4.2);
    ClassicCounter<string> c2 = new ClassicCounter<string>();
    c2.IncrementCount("fever", 1.5);
    c2.IncrementCount("cough", 2.1);
    c2.IncrementCount("nausea", 3.2);
    ClassicCounter<string> c3 = new ClassicCounter<string>();
    c3.IncrementCount("cough", 2.5);
    c3.IncrementCount("congestion", 3.2);
    data.Add(new RVFDatum<string, string>(c1, "cold"));
    data.Add(new RVFDatum<string, string>(c2, "flu"));
    data.Add(new RVFDatum<string, string>(c3, "cold"));
    data.SummaryStatistics();
    LinearClassifierFactory<string, string> factory = new LinearClassifierFactory<string, string>();
    factory.UseQuasiNewton();
    LinearClassifier<string, string> c = factory.TrainClassifier(data);
    ClassicCounter<string> c4 = new ClassicCounter<string>();
    c4.IncrementCount("cough", 2.3);
    c4.IncrementCount("fever", 1.3);
    RVFDatum<string, string> datum = new RVFDatum<string, string>(c4);
    c.JustificationOf((IDatum<string, string>)datum);
}
/// <summary>
/// Calculate sister annotation statistics suitable for doing
/// selective sister splitting in the PCFGParser inside the
/// FactoredParser.
/// </summary>
/// <param name="args">One or two arguments: the path to the treebank, and optionally its character encoding.</param>
public static void Main(string[] args)
{
    // Quick sanity check of counter arithmetic: the KL divergence of a
    // distribution with itself should be 0.
    ClassicCounter<string> c = new ClassicCounter<string>();
    c.SetCount("A", 0);
    c.SetCount("B", 1);
    double d = Counters.KlDivergence(c, c);
    System.Console.Out.WriteLine("KL Divergence: " + d);
    string encoding = "UTF-8";
    if (args.Length > 1)
    {
        encoding = args[1];
    }
    if (args.Length < 1)
    {
        System.Console.Out.WriteLine("Usage: SisterAnnotationStats treebankPath");
    }
    else
    {
        SisterAnnotationStats pas = new SisterAnnotationStats();
        Treebank treebank = new DiskTreebank(null, encoding);
        treebank.LoadPath(args[0]);
        treebank.Apply(pas);
        pas.PrintStats();
    }
}
public virtual void RunCoref(Document document)
{
    IDictionary<Pair<int, int>, bool> mentionPairs = CorefUtils.GetUnlabeledMentionPairs(document);
    if (mentionPairs.Count == 0)
    {
        return;
    }
    Compressor<string> compressor = new Compressor<string>();
    DocumentExamples examples = extractor.Extract(0, document, mentionPairs, compressor);
    ICounter<Pair<int, int>> classificationScores = new ClassicCounter<Pair<int, int>>();
    ICounter<Pair<int, int>> rankingScores = new ClassicCounter<Pair<int, int>>();
    ICounter<int> anaphoricityScores = new ClassicCounter<int>();
    foreach (Example example in examples.examples)
    {
        CorefUtils.CheckForInterrupt();
        Pair<int, int> mentionPair = new Pair<int, int>(example.mentionId1, example.mentionId2);
        classificationScores.IncrementCount(mentionPair, classificationModel.Predict(example, examples.mentionFeatures, compressor));
        rankingScores.IncrementCount(mentionPair, rankingModel.Predict(example, examples.mentionFeatures, compressor));
        if (!anaphoricityScores.ContainsKey(example.mentionId2))
        {
            anaphoricityScores.IncrementCount(example.mentionId2, anaphoricityModel.Predict(new Example(example, false), examples.mentionFeatures, compressor));
        }
    }
    // Note: the Collectors.ToMap arguments below were lost in translation from
    // the original Java lambdas and are left as-is.
    ClustererDataLoader.ClustererDoc doc = new ClustererDataLoader.ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null, document.predictedMentionsByID.Stream().Collect(Collectors.ToMap(null, null)));
    foreach (Pair<int, int> mentionPair1 in clusterer.GetClusterMerges(doc))
    {
        CorefUtils.MergeCoreferenceClusters(mentionPair1, document);
    }
}
public virtual EntityMention MakeEntityMention(ICoreMap sentence, int start, int end, string label, string identifier)
{
    Span span = new Span(start, end);
    string type = null;
    string subtype = null;
    if (!label.StartsWith("B-") && !label.StartsWith("I-"))
    {
        type = label;
        subtype = null;
    }
    else
    {
        // Strip the BIO prefix from the label.
        type = Sharpen.Runtime.Substring(label, 2);
        subtype = null;
    }
    // TODO: add support for subtypes! (needed at least in ACE)
    EntityMention entity = entityMentionFactory.ConstructEntityMention(identifier, sentence, span, span, type, subtype, null);
    ICounter<string> probs = new ClassicCounter<string>();
    probs.SetCount(entity.GetType(), 1.0);
    entity.SetTypeProbabilities(probs);
    return entity;
}
public virtual void TestGetDistributionFromLogValues()
{
    ICounter<string> c1 = new ClassicCounter<string>();
    c1.SetCount("p", 1.0);
    c1.SetCount("q", 2.0);
    c1.SetCount("r", 3.0);
    c1.SetCount("s", 4.0);
    // take log
    Counters.LogInPlace(c1);
    // now call distribution
    Distribution<string> distribution = Distribution.GetDistributionFromLogValues(c1);
    // size
    NUnit.Framework.Assert.AreEqual(distribution.KeySet().Count, 4);
    // keys
    NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("p"), true);
    NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("q"), true);
    NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("r"), true);
    NUnit.Framework.Assert.AreEqual(distribution.ContainsKey("s"), true);
    // values: exponentiating and normalizing 1..4 (sum 10) gives 0.1 .. 0.4
    NUnit.Framework.Assert.AreEqual(distribution.GetCount("p"), 1.0E-1, 1E-10);
    NUnit.Framework.Assert.AreEqual(distribution.GetCount("q"), 2.0E-1, 1E-10);
    NUnit.Framework.Assert.AreEqual(distribution.GetCount("r"), 3.0E-1, 1E-10);
    NUnit.Framework.Assert.AreEqual(distribution.GetCount("s"), 4.0E-1, 1E-10);
}
public override Pair<double, double> GetScore(IList<IList<int>> clusters, IDictionary<int, IList<int>> mentionToGold)
{
    double num = 0;
    int dem = 0;
    foreach (IList<int> c in clusters)
    {
        if (c.Count == 1)
        {
            continue;
        }
        ICounter<IList<int>> goldCounts = new ClassicCounter<IList<int>>();
        double correct = 0;
        foreach (int m in c)
        {
            IList<int> goldCluster = mentionToGold[m];
            if (goldCluster != null)
            {
                goldCounts.IncrementCount(goldCluster);
            }
        }
        foreach (KeyValuePair<IList<int>, double> e in goldCounts.EntrySet())
        {
            if (e.Key.Count != 1)
            {
                correct += e.Value * e.Value;
            }
        }
        num += correct / c.Count;
        dem += c.Count;
    }
    return new Pair<double, double>(num, (double)dem);
}
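// Worked example (illustrative, not from the source): for a predicted cluster
// c = {m1, m2, m3} where m1 and m2 belong to gold cluster g1 (|g1| > 1) and
// m3 to a gold singleton, goldCounts is {g1: 2, singleton: 1}. The singleton
// is skipped, so correct = 2 * 2 = 4, and this cluster contributes 4/3 to
// num and 3 to dem.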
public override IUnknownWordModel FinishTraining()
{
    // Map<String,Float> unknownGT = null;
    if (useGT)
    {
        unknownGTTrainer.FinishTraining();
    }
    // unknownGT = unknownGTTrainer.unknownGT;
    foreach (ILabel tagLab in c.Keys)
    {
        // outer iteration is over tags as Labels
        ClassicCounter<string> wc = c[tagLab];
        // counts for words given a tag
        if (!tagHash.Contains(tagLab))
        {
            tagHash[tagLab] = new ClassicCounter<string>();
        }
        // The UNKNOWN first character is assumed to be seen once in each tag.
        // This is really sort of broken! (why??)
        tc.IncrementCount(tagLab);
        wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
        // inner iteration is over words as strings
        foreach (string first in wc.KeySet())
        {
            double prob = Math.Log(wc.GetCount(first) / tc.GetCount(tagLab));
            tagHash[tagLab].SetCount(first, prob);
        }
    }
    // if (Test.verbose)
    //   EncodingPrintWriter.out.println(tag + " rewrites as " + first + " first char with probability " + prob, encoding);
    return model;
}
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
private void ReadObject(ObjectInputStream stream)
{
    stream.DefaultReadObject();
    // log.info("Before decompression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    // Re-expand the compressed counters written by WriteObject.
    ClassicCounter<IntDependency> compressedArgC = argCounter;
    argCounter = new ClassicCounter<IntDependency>();
    ClassicCounter<IntDependency> compressedStopC = stopCounter;
    stopCounter = new ClassicCounter<IntDependency>();
    foreach (IntDependency d in compressedArgC.KeySet())
    {
        double count = compressedArgC.GetCount(d);
        ExpandArg(d, d.distance, count);
    }
    foreach (IntDependency d1 in compressedStopC.KeySet())
    {
        double count = compressedStopC.GetCount(d1);
        ExpandStop(d1, d1.distance, count, false);
    }
    // log.info("After decompression:");
    // log.info("arg size: " + argCounter.size() + " total: " + argCounter.totalCount());
    // log.info("stop size: " + stopCounter.size() + " total: " + stopCounter.totalCount());
    expandDependencyMap = null;
}
private static void LoadSignatures(string file, IDictionary<string, ICounter<string>> sigs)
{
    BufferedReader reader = null;
    try
    {
        reader = IOUtils.ReaderFromString(file);
        while (reader.Ready())
        {
            string[] split = reader.ReadLine().Split("\t");
            ICounter<string> cntr = new ClassicCounter<string>();
            sigs[split[0]] = cntr;
            // Fields after the word alternate between signature and count.
            for (int i = 1; i < split.Length; i = i + 2)
            {
                cntr.SetCount(split[i], double.Parse(split[i + 1]));
            }
        }
    }
    catch (IOException e)
    {
        // Wrap and rethrow; the original new Exception(e) has no matching
        // constructor in C#.
        throw new Exception(e.Message, e);
    }
    finally
    {
        IOUtils.CloseIgnoringExceptions(reader);
    }
}
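// Expected input format (inferred from the parsing loop above, not documented
// in the source; signature names and counts are illustrative): one word per
// line followed by alternating signature/count pairs, all tab-separated, e.g.
//
//   walked  UNK-ed  12.0  UNK-LC  3.0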
public override void PrintResults(PrintWriter pw, IList<ICoreMap> goldStandard, IList<ICoreMap> extractorOutput)
{
    ResultsPrinter.Align(goldStandard, extractorOutput);
    // the mention factory cannot be null here
    System.Diagnostics.Debug.Assert(relationMentionFactory != null, "ERROR: RelationExtractorResultsPrinter.relationMentionFactory cannot be null in printResults!");
    // Count predicted-actual relation type pairs
    ICounter<Pair<string, string>> results = new ClassicCounter<Pair<string, string>>();
    ClassicCounter<string> labelCount = new ClassicCounter<string>();
    // TODO: assumes binary relations
    for (int goldSentenceIndex = 0; goldSentenceIndex < goldStandard.Count; goldSentenceIndex++)
    {
        foreach (RelationMention goldRelation in AnnotationUtils.GetAllRelations(relationMentionFactory, goldStandard[goldSentenceIndex], createUnrelatedRelations))
        {
            ICoreMap extractorSentence = extractorOutput[goldSentenceIndex];
            IList<RelationMention> extractorRelations = AnnotationUtils.GetRelations(relationMentionFactory, extractorSentence, goldRelation.GetArg(0), goldRelation.GetArg(1));
            labelCount.IncrementCount(goldRelation.GetType());
            foreach (RelationMention extractorRelation in extractorRelations)
            {
                results.IncrementCount(new Pair<string, string>(extractorRelation.GetType(), goldRelation.GetType()));
            }
        }
    }
    PrintResultsInternal(pw, results, labelCount);
}
/// <summary>Destructively modifies the input and returns it as a convenience.</summary>
public virtual Pair<UnaryGrammar, BinaryGrammar> Apply(Pair<UnaryGrammar, BinaryGrammar> bgug)
{
    Alpha = trainOptions.ruleSmoothingAlpha;
    ICounter<string> symWeights = new ClassicCounter<string>();
    ICounter<string> symCounts = new ClassicCounter<string>();
    // Tally unary rules
    foreach (UnaryRule rule in bgug.First())
    {
        if (!tagIndex.Contains(rule.parent))
        {
            UpdateCounters(rule, symWeights, symCounts);
        }
    }
    // Tally binary rules
    foreach (BinaryRule rule1 in bgug.Second())
    {
        UpdateCounters(rule1, symWeights, symCounts);
    }
    // Compute smoothed rule scores, unary
    foreach (UnaryRule rule2 in bgug.First())
    {
        if (!tagIndex.Contains(rule2.parent))
        {
            rule2.score = SmoothRuleWeight(rule2, symWeights, symCounts);
        }
    }
    // Compute smoothed rule scores, binary
    foreach (BinaryRule rule3 in bgug.Second())
    {
        rule3.score = SmoothRuleWeight(rule3, symWeights, symCounts);
    }
    return bgug;
}
public override IUnknownWordModel FinishTraining()
{
    if (useGT)
    {
        unknownGTTrainer.FinishTraining();
    }
    foreach (KeyValuePair<ILabel, ClassicCounter<string>> entry in c)
    {
        /* outer iteration is over tags */
        ILabel key = entry.Key;
        ClassicCounter<string> wc = entry.Value;
        // counts for words given a tag
        if (!tagHash.Contains(key))
        {
            tagHash[key] = new ClassicCounter<string>();
        }
        /* the UNKNOWN sequence is assumed to be seen once in each tag */
        // This is sort of broken, but you can regard it as a Dirichlet prior.
        tc.IncrementCount(key);
        wc.SetCount(UnknownWordModelTrainerConstants.unknown, 1.0);
        /* inner iteration is over words */
        foreach (string end in wc.KeySet())
        {
            // p(sig|tag)
            double prob = Math.Log(wc.GetCount(end) / tc.GetCount(key));
            tagHash[key].SetCount(end, prob);
        }
    }
    // if (Test.verbose)
    //   EncodingPrintWriter.out.println(tag + " rewrites as " + end + " endchar with probability " + prob, encoding);
    return model;
}
private void PrintResultsInternal(PrintWriter pw, ICounter<Pair<string, string>> results, ClassicCounter<string> labelCount)
{
    ClassicCounter<string> correct = new ClassicCounter<string>();
    ClassicCounter<string> predictionCount = new ClassicCounter<string>();
    bool countGoldLabels = false;
    if (labelCount == null)
    {
        labelCount = new ClassicCounter<string>();
        countGoldLabels = true;
    }
    foreach (Pair<string, string> predictedActual in results.KeySet())
    {
        string predicted = predictedActual.first;
        string actual = predictedActual.second;
        if (predicted.Equals(actual))
        {
            correct.IncrementCount(actual, results.GetCount(predictedActual));
        }
        predictionCount.IncrementCount(predicted, results.GetCount(predictedActual));
        if (countGoldLabels)
        {
            labelCount.IncrementCount(actual, results.GetCount(predictedActual));
        }
    }
    DecimalFormat formatter = new DecimalFormat();
    formatter.SetMaximumFractionDigits(1);
    formatter.SetMinimumFractionDigits(1);
    double totalCount = 0;
    double totalCorrect = 0;
    double totalPredicted = 0;
    pw.Println("Label\tCorrect\tPredict\tActual\tPrecn\tRecall\tF");
    IList<string> labels = new List<string>(labelCount.KeySet());
    labels.Sort();
    foreach (string label in labels)
    {
        double numcorrect = correct.GetCount(label);
        double predicted = predictionCount.GetCount(label);
        double trueCount = labelCount.GetCount(label);
        double precision = (predicted > 0) ? (numcorrect / predicted) : 0;
        double recall = numcorrect / trueCount;
        double f = (precision + recall > 0) ? 2 * precision * recall / (precision + recall) : 0.0;
        pw.Println(StringUtils.PadOrTrim(label, MaxLabelLength) + "\t" + numcorrect + "\t" + predicted + "\t" + trueCount + "\t" + formatter.Format(100 * precision) + "\t" + formatter.Format(100 * recall) + "\t" + formatter.Format(100 * f));
        if (!RelationMention.IsUnrelatedLabel(label))
        {
            totalCount += trueCount;
            totalCorrect += numcorrect;
            totalPredicted += predicted;
        }
    }
    double totalPrecision = (totalPredicted > 0) ? (totalCorrect / totalPredicted) : 0;
    double totalRecall = totalCorrect / totalCount;
    double totalF = (totalPredicted > 0 && totalCorrect > 0) ? 2 * totalPrecision * totalRecall / (totalPrecision + totalRecall) : 0.0;
    pw.Println("Total\t" + totalCorrect + "\t" + totalPredicted + "\t" + totalCount + "\t" + formatter.Format(100 * totalPrecision) + "\t" + formatter.Format(100 * totalRecall) + "\t" + formatter.Format(100 * totalF));
}
protected internal override void Calculate(double[] x)
{
    classifier.SetWeights(To2D(x));
    if (derivative == null)
    {
        derivative = new double[x.Length];
    }
    else
    {
        Arrays.Fill(derivative, 0.0);
    }
    ICounter<Triple<int, int, int>> feature2classPairDerivatives = new ClassicCounter<Triple<int, int, int>>();
    value = 0.0;
    for (int n = 0; n < geFeatures.Count; n++)
    {
        // F feature = geFeatures.get(n);
        double[] modelDist = new double[numClasses];
        Arrays.Fill(modelDist, 0);
        // Go over the unlabeled active data to compute expectations.
        IList<int> activeData = geFeature2DatumList[n];
        foreach (int activeDatum in activeData)
        {
            IDatum<L, F> datum = unlabeledDataList[activeDatum];
            double[] probs = GetModelProbs(datum);
            for (int c = 0; c < numClasses; c++)
            {
                modelDist[c] += probs[c];
            }
            // Computes p(y_d) * (1 - p(y_d)) * f_d for all active features.
            UpdateDerivative(datum, probs, feature2classPairDerivatives);
        }
        // Now compute the value (KL divergence) and the final value of the derivative.
        if (activeData.Count > 0)
        {
            for (int c = 0; c < numClasses; c++)
            {
                modelDist[c] /= activeData.Count;
            }
            SmoothDistribution(modelDist);
            for (int c1 = 0; c1 < numClasses; c1++)
            {
                value += -geFeature2EmpiricalDist[n][c1] * Math.Log(modelDist[c1]);
            }
            for (int f = 0; f < labeledDataset.FeatureIndex().Size(); f++)
            {
                for (int c2 = 0; c2 < numClasses; c2++)
                {
                    int wtIndex = IndexOf(f, c2);
                    for (int cPrime = 0; cPrime < numClasses; cPrime++)
                    {
                        derivative[wtIndex] += feature2classPairDerivatives.GetCount(new Triple<int, int, int>(f, c2, cPrime)) * geFeature2EmpiricalDist[n][cPrime] / modelDist[cPrime];
                    }
                    derivative[wtIndex] /= activeData.Count;
                }
            }
        }
    }
}
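// Per GE feature n, the loop above accumulates the cross-entropy
//   value_n = -sum_c p_emp[n][c] * log p_model[c],
// which differs from KL(p_emp || p_model) only by the entropy of p_emp
// (a constant with respect to the weights), so minimizing it is equivalent
// to minimizing the KL divergence named in the comment.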
// true: DT JJ NN -> DT "JJ NN", false: DT "DT"
// = false;
/// <summary>
/// If this is set to true, then the binarizer will choose selectively whether or not to
/// split states based on how many counts the states had in a previous run.
/// </summary>
/// <remarks>
/// If this is set to true, then the binarizer will choose selectively whether or not to
/// split states based on how many counts the states had in a previous run. These counts are
/// stored in an internal counter, which will be added to when doSelectiveSplit is false.
/// If passed false, this will initialize (clear) the counts.
/// </remarks>
/// <param name="doSelectiveSplit">Record this value and reset the internal counter if false</param>
public virtual void SetDoSelectiveSplit(bool doSelectiveSplit)
{
    this.doSelectiveSplit = doSelectiveSplit;
    if (!doSelectiveSplit)
    {
        stateCounter = new ClassicCounter<string>();
    }
}
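// A minimal sketch of the intended two-pass protocol (hypothetical
// `binarizer` variable and training calls; not from the source):
//
// binarizer.SetDoSelectiveSplit(false);  // pass 1: reset, then accumulate state counts
// ... binarize the training treebank once ...
// binarizer.SetDoSelectiveSplit(true);   // pass 2: split states selectively using the counts
// ... binarize again with selective splitting in effect ...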
// boundary tag -- assumed not a real tag
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);
    indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);
    seenCounter = new ClassicCounter<IntTaggedWord>();
    unSeenCounter = new ClassicCounter<IntTaggedWord>();
    model = new FrenchUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter);
}