/// <summary>
/// Runs the base extractor and maps its result to a word shape.
/// </summary>
/// <param name="h">History forwarded unchanged to the base extractor.</param>
/// <param name="pH">Pairs holder forwarded unchanged to the base extractor.</param>
/// <returns>
/// The value of <c>WordShapeClassifier.WordShape</c> for the base extraction,
/// using this extractor's <c>wordShaper</c> setting.
/// </returns>
internal override string Extract(History h, PairsHolder pH)
{
    string baseValue = base.Extract(h, pH);
    return WordShapeClassifier.WordShape(baseValue, wordShaper);
}
/// <summary>
/// Checks that <c>WordShapeClassifier.WordShape</c> produces the expected shape for
/// every input, and that it rejects a null word with an <see cref="ArgumentNullException"/>.
/// </summary>
/// <param name="wordshape">Word shaper identifier passed to the classifier.</param>
/// <param name="in">Input words.</param>
/// <param name="shape">Expected shapes; must have the same length as <paramref name="in"/>.</param>
/// <param name="knownLCWords">
/// Known lower-case words handed to the classifier as a set, or null to pass a null set.
/// </param>
public static void GenericCheck(int wordshape, string[] @in, string[] shape, string[] knownLCWords)
{
    // NOTE(review): the message-first argument order follows the JUnit convention;
    // confirm the Assert in scope accepts it in this position.
    NUnit.Framework.Assert.AreEqual("WordShapeClassifierTest is bung: array sizes differ", @in.Length, shape.Length);

    // Build the set from the array the caller actually passed in.
    ICollection<string> knownLCset = null;
    if (knownLCWords != null)
    {
        knownLCset = new HashSet<string>(knownLCWords);
    }

    for (int i = 0; i < @in.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual(
            "WordShape " + wordshape + " for " + @in[i] + " with " + (knownLCset == null ? "null" : "non-null") + " knownLCwords is not correct!",
            shape[i],
            WordShapeClassifier.WordShape(@in[i], wordshape, knownLCset));
    }

    // The "no exception" failure is raised outside the try block so that the
    // catch-all handler below cannot intercept it and report the wrong message.
    bool returnedNormally = false;
    try
    {
        WordShapeClassifier.WordShape(null, wordshape);
        returnedNormally = true;
    }
    catch (ArgumentNullException)
    {
        // this is the good answer
    }
    catch (Exception)
    {
        Fail("WordShapeClassifier didn't throw NullPointerException on null");
    }

    if (returnedNormally)
    {
        Fail("WordShapeClassifier threw no exception on null");
    }
}
/// <summary>
/// Returns the word shape for <paramref name="word"/>, consulting the cache exposed by
/// <c>constVars.GetWordShapeCache()</c> before computing it.
/// </summary>
/// <param name="word">The word to look up; also used as the cache key.</param>
/// <returns>
/// The cached shape when the cache lookup is non-null; otherwise the shape computed by
/// <c>WordShapeClassifier.WordShape</c> with <c>constVars.wordShaper</c>, which is
/// stored in the cache before being returned.
/// </returns>
public virtual string WordShape(string word)
{
    string cached = constVars.GetWordShapeCache()[word];
    if (cached != null)
    {
        return cached;
    }

    // Cache miss: compute once and remember the result for later calls.
    string computed = WordShapeClassifier.WordShape(word, constVars.wordShaper);
    constVars.GetWordShapeCache()[word] = computed;
    return computed;
}
/// <summary>
/// Builds a feature string from the word shapes of the positions <c>left</c> through
/// <c>right</c> (inclusive), separated by <c>'|'</c>.
/// </summary>
/// <param name="h">History passed to <c>pH.GetWord</c> for each position.</param>
/// <param name="pH">Pairs holder the words are read from.</param>
/// <returns>
/// The shapes joined with <c>'|'</c>; an empty string when <c>left</c> is greater than
/// <c>right</c>.
/// </returns>
internal override string Extract(History h, PairsHolder pH)
{
    StringBuilder shapes = new StringBuilder();
    for (int pos = left; pos <= right; pos++)
    {
        // Separator goes in front of every shape except the first one.
        if (pos > left)
        {
            shapes.Append('|');
        }

        string token = pH.GetWord(h, pos);
        shapes.Append(WordShapeClassifier.WordShape(token, wordShaper));
    }

    return shapes.ToString();
}
/// <summary>
/// Looks up the distributional-similarity class of a word in <c>lexicon</c>.
/// </summary>
/// <param name="word">The word to classify.</param>
/// <returns>
/// The lexicon entry for the normalized word, or <c>unknownWordClass</c> when the
/// lookup yields null.
/// </returns>
/// <remarks>
/// The word is normalized before the lookup: it is lower-cased when <c>cased</c> is
/// false, and then mapped through <c>WordShapeClassifier.WordShape</c> with
/// <c>WordShapeClassifier.Wordshapedigits</c> when <c>numberEquivalence</c> is true.
/// </remarks>
public virtual string DistSimClass(string word)
{
    string key = word;
    if (!cased)
    {
        key = key.ToLower();
    }

    if (numberEquivalence)
    {
        key = WordShapeClassifier.WordShape(key, WordShapeClassifier.Wordshapedigits);
    }

    string found = lexicon[key];
    return found ?? unknownWordClass;
}
/// <summary>
/// Loads a distributional-similarity lexicon from a file into <c>lexicon</c>.
/// </summary>
/// <param name="filename">File to read, one entry per line.</param>
/// <param name="format">
/// <c>"terryKoo"</c> selects tab-separated lines of the form class, word. Any other
/// value is read as whitespace-separated lines of the form word, class ("alexClark").
/// </param>
/// <param name="encoding">Encoding passed to <c>ObjectBank.GetLineIterator</c>.</param>
/// <param name="distSimMaxBits">
/// In the terryKoo format, classes longer than this are truncated to this many
/// characters; values of zero or less disable truncation.
/// </param>
/// <param name="cased">When false, words are lower-cased before being stored.</param>
/// <param name="numberEquivalence">
/// When true, words are mapped through <c>WordShapeClassifier.WordShape</c> with
/// <c>WordShapeClassifier.Wordshapedigits</c> before being stored.
/// </param>
/// <param name="unknownWordClass">Stored in the <c>unknownWordClass</c> field.</param>
public DistSimClassifier(string filename, string format, string encoding, int distSimMaxBits, bool cased, bool numberEquivalence, string unknownWordClass)
{
    this.cased = cased;
    this.numberEquivalence = numberEquivalence;
    this.unknownWordClass = unknownWordClass;
    Timing.StartDoing("Loading distsim lexicon from " + filename);
    lexicon = Generics.NewHashMap(1 << 15); // make a reasonable starting size
    bool terryKoo = "terryKoo".Equals(format);
    foreach (string line in ObjectBank.GetLineIterator(filename, encoding))
    {
        string word;
        string wordClass;
        if (terryKoo)
        {
            // The separators are regular expressions, so split with Regex.Split:
            // the BCL string.Split(string) overload would treat "\\t" as the literal
            // two-character text backslash + 't' instead of a tab.
            string[] bits = System.Text.RegularExpressions.Regex.Split(line, "\\t");
            word = bits[1];
            wordClass = bits[0];
            if (distSimMaxBits > 0 && wordClass.Length > distSimMaxBits)
            {
                wordClass = Sharpen.Runtime.Substring(wordClass, 0, distSimMaxBits);
            }
        }
        else
        {
            // "alexClark"
            string[] bits = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
            word = bits[0];
            wordClass = bits[1];
        }

        if (!cased)
        {
            word = word.ToLower();
        }

        if (numberEquivalence)
        {
            word = WordShapeClassifier.WordShape(word, WordShapeClassifier.Wordshapedigits);
        }

        lexicon[word] = wordClass;
    }

    Timing.EndDoing();
}
/// <summary>
/// Annotates every token of a document with its position, optionally its word shape,
/// and interned, normalized text (or character) values.
/// </summary>
/// <param name="doc">The tokens to annotate, in document order; modified in place.</param>
/// <remarks>
/// Lower-case-initial words seen while computing shapes are added to
/// <c>knownLCWords</c>, so the shape of a later token can depend on earlier tokens.
/// </remarks>
private void DoBasicStuff(IList <In> doc)
{
    int position = 0;
    // 'var' takes the element type from the parameter, so the loop does not depend on
    // a differently-cased spelling of the type parameter.
    foreach (var fl in doc)
    {
        // position in document
        fl.Set(typeof(CoreAnnotations.PositionAnnotation), (position++).ToString());

        // word shape
        if ((flags.wordShape > WordShapeClassifier.Nowordshape) && !flags.useShapeStrings)
        {
            // TODO: if we pass in a FeatureFactory, as suggested by an earlier comment,
            // we should use that FeatureFactory's getWord function
            string word = fl.Get(typeof(CoreAnnotations.TextAnnotation));
            if (flags.wordFunction != null)
            {
                word = flags.wordFunction.Apply(word);
            }

            // Remember words that start with a lower-case letter.
            if (word.Length > 0 && char.IsLower(word, 0))
            {
                knownLCWords.Add(word);
            }

            string s = Intern(WordShapeClassifier.WordShape(word, flags.wordShape, knownLCWords));
            fl.Set(typeof(CoreAnnotations.ShapeAnnotation), s);
        }

        // normalizing and interning was the following; should presumably now be:
        // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
        if (Sharpen.Runtime.EqualsIgnoreCase("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter", flags.readerAndWriter))
        {
            // for Chinese segmentation, "word" is no use and ignore goldAnswer for memory efficiency.
            fl.Set(typeof(CoreAnnotations.CharAnnotation), Intern(Fix(fl.Get(typeof(CoreAnnotations.CharAnnotation)))));
        }
        else
        {
            fl.Set(typeof(CoreAnnotations.TextAnnotation), Intern(Fix(fl.Get(typeof(CoreAnnotations.TextAnnotation)))));

            // only override GoldAnswer if not set - so that a DocumentReaderAndWriter can set it right in the first place.
            if (fl.Get(typeof(CoreAnnotations.GoldAnswerAnnotation)) == null)
            {
                fl.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), fl.Get(typeof(CoreAnnotations.AnswerAnnotation)));
            }
        }
    }
}
/// <summary>
/// Prints, for each input word, the shape produced by
/// <c>WordShapeClassifier.WordShape</c>, followed by the expected shape in parentheses
/// when one is available at the same index.
/// </summary>
/// <param name="wordshape">Word shaper identifier passed to the classifier and shown in the header.</param>
/// <param name="in">Input words.</param>
/// <param name="shape">Expected shapes; may be shorter than <paramref name="in"/>.</param>
/// <param name="knownLCWords">
/// Known lower-case words handed to the classifier as a set, or null to pass a null set.
/// </param>
public static void OutputResults(int wordshape, string[] @in, string[] shape, string[] knownLCWords)
{
    System.Console.Out.WriteLine("======================");
    System.Console.Out.WriteLine(" Classifier " + wordshape);
    System.Console.Out.WriteLine("======================");

    // Build the set from the array the caller actually passed in.
    ICollection<string> knownLCset = null;
    if (knownLCWords != null)
    {
        knownLCset = new HashSet<string>(knownLCWords);
    }

    for (int i = 0; i < @in.Length; ++i)
    {
        string result = WordShapeClassifier.WordShape(@in[i], wordshape, knownLCset);
        System.Console.Out.Write(" " + @in[i] + ": " + result);
        if (i < shape.Length)
        {
            System.Console.Out.Write(" (" + shape[i] + ")");
        }

        System.Console.Out.WriteLine();
    }
}