/// <summary>
/// Stores ten identical ("girl", "NN") pairs, registers a history over them in a
/// history table, and checks that the table returns the same index on lookup and
/// that a word extractor at offset 0 yields the stored word.
/// </summary>
public virtual void TestPairsHolder()
{
    PairsHolder pairsHolder = new PairsHolder();
    for (int i = 0; i < 10; i++)
    {
        pairsHolder.Add(new WordTag("girl", "NN"));
    }

    MaxentTagger maxentTagger = new MaxentTagger();
    maxentTagger.Init(null);

    History h = new History(0, 5, 3, pairsHolder, maxentTagger.extractors);
    TaggerExperiments te = new TaggerExperiments(maxentTagger);

    int x = te.GetHistoryTable().Add(h);
    int y = te.GetHistoryTable().GetIndex(h);
    // NUnit's argument order is (expected, actual, message); the message must come last.
    NUnit.Framework.Assert.AreEqual(x, y, "Failing to get same index for history");

    Extractor e = new Extractor(0, false);
    string k = e.Extract(h);
    NUnit.Framework.Assert.AreEqual("girl", k, "Extractor didn't find stored word");
}
/// <summary>
/// Builds a feature value from the tags at <c>leftPosition</c> and
/// <c>rightPosition</c>, joined by a single '!'.
/// </summary>
/// <param name="h">History the offsets are resolved against.</param>
/// <param name="pH">Holder the tags are read from.</param>
/// <returns>The left tag, '!', then the right tag.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    // Plain concatenation is kept on purpose: the original author measured it
    // to be cheaper than a StringBuilder for this two-part value.
    string leftTag = pH.GetTag(h, leftPosition);
    string rightTag = pH.GetTag(h, rightPosition);
    return string.Concat(leftTag, "!", rightTag);
}
/// <summary>
/// Joins with '!' the tags at every offset from <c>position</c> towards zero,
/// excluding offset zero itself.
/// </summary>
/// <param name="h">History the offsets are resolved against.</param>
/// <param name="pH">Holder the tags are read from.</param>
/// <returns>
/// The '!'-separated tags, starting with the one at <c>position</c>; an empty
/// string when <c>position</c> is zero.
/// </returns>
internal override string Extract(History h, PairsHolder pH)
{
    // Walk towards zero from whichever side position lies on.
    int step = position < 0 ? 1 : -1;
    StringBuilder joined = new StringBuilder();
    bool needSeparator = false;
    for (int offset = position; offset != 0; offset += step)
    {
        if (needSeparator)
        {
            joined.Append('!');
        }
        joined.Append(pH.GetTag(h, offset));
        needSeparator = true;
    }
    return joined.ToString();
}
/// <summary>
/// Runs the base extractor and maps its result through
/// <c>WordShapeClassifier.WordShape</c> using this extractor's <c>wordShaper</c>.
/// </summary>
/// <param name="h">History passed to the base extractor.</param>
/// <param name="pH">Holder passed to the base extractor.</param>
/// <returns>The word shape of the base extractor's value.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    return WordShapeClassifier.WordShape(base.Extract(h, pH), wordShaper);
}
/// <summary>
/// Creates a history that references the given pairs holder and extractors.
/// </summary>
/// <remarks>Only these two references are stored; no other state is set here.</remarks>
/// <param name="pairs">Holder stored in <c>this.pairs</c>.</param>
/// <param name="extractors">Extractors stored in <c>this.extractors</c>.</param>
internal History(PairsHolder pairs, Extractors extractors)
{
    this.extractors = extractors;
    this.pairs = pairs;
}
/// <summary>
/// Returns the current word when lower-casing changes it, and
/// <c>zeroSt</c> when the word is already equal to its lower-cased form.
/// </summary>
/// <param name="h">History the offset is resolved against.</param>
/// <param name="pH">Holder the word at offset 0 is read from.</param>
/// <returns>The word itself, or <c>zeroSt</c> if lower-casing leaves it unchanged.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    string current = pH.GetWord(h, 0);
    string lowered = current.ToLower(Locale.English);
    return lowered.Equals(current) ? zeroSt : current;
}
/// <summary>
/// Concatenates the word shapes of the words at offsets <c>left</c> through
/// <c>right</c> (inclusive), separated by '|'.
/// </summary>
/// <param name="h">History the offsets are resolved against.</param>
/// <param name="pH">Holder the words are read from.</param>
/// <returns>The '|'-separated shapes; empty when <c>left</c> exceeds <c>right</c>.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    StringBuilder shapes = new StringBuilder();
    int offset = left;
    while (offset <= right)
    {
        if (offset > left)
        {
            shapes.Append('|');
        }
        string word = pH.GetWord(h, offset);
        shapes.Append(WordShapeClassifier.WordShape(word, wordShaper));
        offset++;
    }
    return shapes.ToString();
}
/// <summary>
/// Concatenates the <c>lexicon</c> mappings of the words at offsets
/// <c>left</c> through <c>right</c> (inclusive), separated by '|'.
/// </summary>
/// <param name="h">History the offsets are resolved against.</param>
/// <param name="pH">Holder the words are read from.</param>
/// <returns>The '|'-separated mappings; empty when <c>left</c> exceeds <c>right</c>.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    StringBuilder mappings = new StringBuilder();
    int offset = left;
    while (offset <= right)
    {
        if (offset > left)
        {
            mappings.Append('|');
        }
        string token = pH.GetWord(h, offset);
        mappings.Append(lexicon.GetMapping(token));
        offset++;
    }
    return mappings.ToString();
}
/// <summary>
/// Loads every tagged file record described by <paramref name="config"/> and
/// hands the accumulated word/tag counts to the tagger's dictionary.
/// </summary>
/// <param name="config">Supplies the file specification passed to <c>TaggedFileRecord.CreateRecords</c>.</param>
/// <param name="maxentTagger">Tagger kept by this reader; its <c>dict</c> receives the counts.</param>
/// <param name="pairs">Pairs holder kept by this reader.</param>
protected internal ReadDataTagged(TaggerConfig config, MaxentTagger maxentTagger, PairsHolder pairs)
{
    //TODO: make a class DataHolder that holds the dict, tags, pairs, etc, for tagger and pass it around
    this.pairs = pairs;
    this.maxentTagger = maxentTagger;

    IList<TaggedFileRecord> records = TaggedFileRecord.CreateRecords(config, config.GetFile());
    IDictionary<string, IntCounter<string>> countsByWord = Generics.NewHashMap();
    foreach (TaggedFileRecord rec in records)
    {
        LoadFile(rec.Reader(), countsByWord);
    }

    // Counting first and filling the dictionary in one step afterwards means
    // nothing has to mutate the dictionary or its tag counts later on.
    maxentTagger.dict.FillWordTagCounts(countsByWord);
}
/// <summary>
/// Returns <c>oneSt</c> when a word matching <c>vbnWord</c> is found among the
/// inspected offsets before the current word, and <c>zeroSt</c> otherwise.
/// </summary>
/// <remarks>
/// The feature is declared inapplicable (<c>zeroSt</c>) up front when the
/// dictionary has no counts for the word and it ends in neither <c>edSuff</c>
/// nor <c>enSuff</c>, or when it has counts but the VBN and VBD counts together
/// are at most one hundredth of the total. The backward scan stops at the first
/// "NA" word, at the first <c>stopper</c> match, or when <c>bound</c> is exceeded.
/// </remarks>
/// <param name="h">History the offsets are resolved against.</param>
/// <param name="pH">Holder the words are read from.</param>
/// <returns><c>oneSt</c> or <c>zeroSt</c>.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    string cword = pH.GetWord(h, 0);
    int allCount = dict.Sum(cword);
    int vBNCount = dict.GetCount(cword, vbnTag);
    int vBDCount = dict.GetCount(cword, vbdTag);

    // Conditions for deciding inapplicable.
    // Suffix checks are ordinal so the result does not depend on the thread culture.
    if ((allCount == 0)
        && !(cword.EndsWith(edSuff, System.StringComparison.Ordinal)
             || cword.EndsWith(enSuff, System.StringComparison.Ordinal)))
    {
        return zeroSt;
    }
    if ((allCount > 0) && (vBNCount + vBDCount <= allCount / 100))
    {
        return zeroSt;
    }

    string lastverb = naWord;
    for (int index = -1; index >= -bound; index--)
    {
        string word2 = pH.GetWord(h, index);
        if ("NA".Equals(word2))
        {
            break;
        }
        if (stopper.Matcher(word2).Matches())
        {
            break;
        }
        if (vbnWord.Matcher(word2).Matches())
        {
            lastverb = word2;
            break;
        }
        // NOTE(review): together with the loop's own decrement this steps by two,
        // so only offsets -1, -3, -5, ... are inspected. It looks unintended, but is
        // kept as-is because changing it would change the feature values produced;
        // confirm against any existing trained models before altering.
        index--;
    }

    if (!lastverb.Equals(naWord))
    {
        log.Info("VBN: For " + cword + ", found preceding VBN cue " + lastverb);
        return oneSt;
    }
    return zeroSt;
}
/// <summary>This method gets feature statistics from a training file found in the TaggerConfig.</summary>
/// <remarks>
/// This method gets feature statistics from a training file found in the TaggerConfig.
/// It is the start of the training process: it reads the tagged data, records one
/// (history index, tag index) row per token in <c>vArray</c>, adds the regular and
/// rare templates for each token, and then publishes <c>xSize</c>/<c>ySize</c> to
/// the tagger.
/// </remarks>
/// <param name="config">Configuration passed to the <c>ReadDataTagged</c> reader.</param>
/// <param name="maxentTagger">Tagger being trained; receives <c>xSize</c> and <c>ySize</c>.</param>
/// <exception cref="System.IO.IOException"/>
protected internal TaggerExperiments(TaggerConfig config, MaxentTagger maxentTagger)
    : this(maxentTagger)
{
    log.Info("TaggerExperiments: adding word/tags");
    PairsHolder pairs = new PairsHolder();
    ReadDataTagged c = new ReadDataTagged(config, maxentTagger, pairs);

    // One row per data token. The previous empty jagged array had no rows, so
    // the first write inside the loop indexed past the end of the array.
    int tokenCount = c.GetSize();
    vArray = new int[tokenCount][];

    InitTemplatesNew();
    log.Info("Featurizing tagged data tokens...");
    for (int i = 0; i < tokenCount; i++)
    {
        DataWordTag d = c.Get(i);
        string yS = d.GetY();
        History h = d.GetHistory();
        int indX = tHistories.Add(h);
        int indY = d.GetYInd();
        AddTemplatesNew(h, yS);
        AddRareTemplatesNew(h, yS);
        vArray[i] = new int[] { indX, indY };
    }
    log.Info("Featurized " + c.GetSize() + " data tokens [done].");
    c.Release();
    Ptilde();
    maxentTagger.xSize = xSize;
    maxentTagger.ySize = ySize;
    log.Info("xSize [num Phi templates] = " + xSize + "; ySize [num classes] = " + ySize);
    HashHistories();

    // if we'll look at occurring tags only, we need the histories and pairs still
    if (!maxentTagger.occurringTagsOnly && !maxentTagger.possibleTagsOnly)
    {
        tHistories.Release();
        pairs.Clear();
    }
    GetFeaturesNew();
}
/// <summary>
/// Scans backwards from the position just before <c>h.current</c> and returns the
/// word at the first position whose tag starts with "VB".
/// </summary>
/// <remarks>
/// The scan does not go before <c>h.start</c> nor more than <paramref name="bound"/>
/// positions back, and it gives up at the first tag that starts with ",".
/// Prefix checks are ordinal so they do not depend on the thread culture.
/// </remarks>
/// <param name="h">History supplying <c>start</c> and <c>current</c>.</param>
/// <param name="pH">Holder the tags and words are read from by absolute index.</param>
/// <param name="bound">Maximum number of positions to look back.</param>
/// <returns>The matching word, or "NA" when none is found.</returns>
internal virtual string ExtractLV(History h, PairsHolder pH, int bound)
{
    int start = h.start;
    int current = h.current;
    for (int index = current - 1; index >= start && index >= current - bound; index--)
    {
        string tag = pH.GetTag(index);
        if (tag.StartsWith("VB", System.StringComparison.Ordinal))
        {
            return pH.GetWord(index);
        }
        if (tag.StartsWith(",", System.StringComparison.Ordinal))
        {
            break;
        }
    }
    return "NA";
}
/// <summary>
/// Bounded variant of extraction. This default implementation ignores
/// <paramref name="bound"/> and forwards to the two-argument overload;
/// subclasses may override it to make use of the bound.
/// </summary>
/// <param name="h">History forwarded unchanged.</param>
/// <param name="pH">Holder forwarded unchanged.</param>
/// <param name="bound">Unused here.</param>
/// <returns>Whatever the two-argument overload returns.</returns>
internal virtual string Extract(History h, PairsHolder pH, int bound)
{
    string extracted = Extract(h, pH);
    return extracted;
}
/// <summary>
/// Reads the tag (when <c>isTag</c> is set) or otherwise the word at
/// <c>position</c> relative to the given history.
/// </summary>
/// <param name="h">History the offset is resolved against.</param>
/// <param name="pH">Holder the value is read from.</param>
/// <returns>The tag or the word at <c>position</c>.</returns>
internal virtual string Extract(History h, PairsHolder pH)
{
    if (isTag)
    {
        return pH.GetTag(h, position);
    }
    return pH.GetWord(h, position);
}
/// <summary>
/// Runs the base extractor and returns the <c>lexicon</c> mapping of its result.
/// </summary>
/// <param name="h">History passed to the base extractor.</param>
/// <param name="pH">Holder passed to the base extractor.</param>
/// <returns>The value of <c>lexicon.GetMapping</c> for the base extractor's output.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    return lexicon.GetMapping(base.Extract(h, pH));
}
/// <summary>
/// Creates a history over the given pairs holder and extractors and initialises
/// it with the supplied start, end and current values.
/// </summary>
/// <param name="start">First argument passed to <c>Init</c>.</param>
/// <param name="end">Second argument passed to <c>Init</c>.</param>
/// <param name="current">Third argument passed to <c>Init</c>.</param>
/// <param name="pairs">Holder stored in <c>this.pairs</c>.</param>
/// <param name="extractors">Extractors stored in <c>this.extractors</c>.</param>
internal History(int start, int end, int current, PairsHolder pairs, Extractors extractors)
{
    this.extractors = extractors;
    this.pairs = pairs;
    // Both references are set before Init runs, as in the original ordering.
    Init(start, end, current);
}
/// <summary>
/// Returns "1" when the base extractor's value starts with "vs" and "0" otherwise.
/// </summary>
/// <remarks>
/// The prefix check is ordinal (exact code units, case-sensitive) so it does not
/// depend on the thread culture.
/// </remarks>
/// <param name="h">History passed to the base extractor.</param>
/// <param name="pH">Holder passed to the base extractor.</param>
/// <returns>"1" or "0".</returns>
internal override string Extract(History h, PairsHolder pH)
{
    string tag = base.Extract(h, pH);
    return tag.StartsWith("vs", System.StringComparison.Ordinal) ? "1" : "0";
}
/// <summary>
/// Returns the word at <c>position</c>, lower-cased with <c>Locale.English</c>.
/// </summary>
/// <param name="h">History the offset is resolved against.</param>
/// <param name="pH">Holder the word is read from.</param>
/// <returns>The lower-cased word.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    string word = pH.GetWord(h, position);
    string lowered = word.ToLower(Locale.English);
    return lowered;
}
/// <summary>
/// Builds a feature value from the tag at <c>position1</c>, the word at
/// <c>word</c> and the tag at <c>position2</c>, joined by '!'.
/// </summary>
/// <param name="h">History the offsets are resolved against.</param>
/// <param name="pH">Holder the tags and word are read from.</param>
/// <returns>The three parts in that order, separated by '!'.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    string firstTag = pH.GetTag(h, position1);
    string middleWord = pH.GetWord(h, word);
    string secondTag = pH.GetTag(h, position2);
    return string.Concat(firstTag, "!", middleWord, "!", secondTag);
}
/// <summary>
/// Builds a feature value from the word at <c>leftWord</c>, the tag at
/// <c>tag</c> and the word at <c>rightWord</c>, joined by '!'.
/// </summary>
/// <param name="h">History the offsets are resolved against.</param>
/// <param name="pH">Holder the words and tag are read from.</param>
/// <returns>The three parts in that order, separated by '!'.</returns>
internal override string Extract(History h, PairsHolder pH)
{
    string left = pH.GetWord(h, leftWord);
    string middleTag = pH.GetTag(h, tag);
    string right = pH.GetWord(h, rightWord);
    return string.Concat(left, "!", middleTag, "!", right);
}