示例#1
0
        /// <summary>
        /// Produces the word-shape string for whatever the base extractor yields.
        /// </summary>
        /// <param name="h">History handed through unchanged to the base extractor.</param>
        /// <param name="pH">Pairs holder handed through unchanged to the base extractor.</param>
        /// <returns>
        /// The result of <c>WordShapeClassifier.WordShape</c> applied to the base
        /// extraction using this extractor's <c>wordShaper</c>.
        /// </returns>
        internal override string Extract(History h, PairsHolder pH)
        {
            return WordShapeClassifier.WordShape(base.Extract(h, pH), wordShaper);
        }
示例#2
0
        /// <summary>
        /// Asserts that every input word maps to its expected shape under the given
        /// word-shape setting, then asserts that a null word is rejected with an
        /// <see cref="ArgumentNullException"/>.
        /// </summary>
        /// <param name="wordshape">Word-shape setting passed to <c>WordShapeClassifier.WordShape</c>.</param>
        /// <param name="in">Input words.</param>
        /// <param name="shape">Expected shapes, index-aligned with <paramref name="in"/>.</param>
        /// <param name="knownLCWords">
        /// Known lower-case words handed to the classifier as a set, or null to pass no set.
        /// </param>
        public static void GenericCheck(int wordshape, string[] @in, string[] shape, string[] knownLCWords)
        {
            // NOTE(review): arguments are in (message, expected, actual) order; confirm that the
            // Assert implementation in scope accepts that order.
            NUnit.Framework.Assert.AreEqual("WordShapeClassifierTest is bung: array sizes differ", @in.Length, shape.Length);
            ICollection <string> knownLCset = null;

            if (knownLCWords != null)
            {
                // Build the set from the parameter that was just null-checked, so the words
                // supplied by the caller are the ones actually used.
                knownLCset = new HashSet <string>(knownLCWords);
            }
            for (int i = 0; i < @in.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual("WordShape " + wordshape + " for " + @in[i] + " with " + (knownLCset == null ? "null" : "non-null") + " knownLCwords is not correct!", shape[i], WordShapeClassifier.WordShape(@in[i], wordshape, knownLCset));
            }

            // Fail() is deliberately kept out of the try block: whatever it throws would
            // otherwise be caught by the general handler below and reported with the wrong message.
            bool returnedNormally = false;
            try
            {
                WordShapeClassifier.WordShape(null, wordshape);
                returnedNormally = true;
            }
            catch (ArgumentNullException)
            {
                // this is the good answer
            }
            catch (Exception)
            {
                Fail("WordShapeClassifier didn't throw NullPointerException on null");
            }
            if (returnedNormally)
            {
                Fail("WordShapeClassifier threw no exception on null");
            }
        }
示例#3
0
 /// <summary>
 /// Creates a word-shape extractor for the given position, using the shaper
 /// looked up from <paramref name="wsc"/>.
 /// </summary>
 /// <remarks>
 /// History (from horatio): a static shape cache used to live here. Profiling the
 /// "distsim" tagger on the training set showed about 1.24% of time spent here with
 /// the cache and 1.26% without it, a very small saving that would shrink further if
 /// the cache were made thread safe. As written it was not thread safe: it assumed
 /// only one wordshape classifier is ever used and its access was not synchronized.
 /// It was therefore removed. If it is ever brought back, make it a map from wsc to
 /// a per-classifier cache rather than a single shared cache.
 /// </remarks>
 /// <param name="position">Position forwarded to the base constructor and included in the name.</param>
 /// <param name="wsc">Word shape classifier identifier passed to <c>WordShapeClassifier.LookupShaper</c>.</param>
 internal ExtractorWordShapeClassifier(int position, string wsc)
     : base(position, false)
 {
     wordShaper = WordShapeClassifier.LookupShaper(wsc);
     name       = string.Format("ExtractorWordShapeClassifier({0},{1})", position, wsc);
 }
示例#4
0
 /// <summary>
 /// Creates an extractor over the inclusive position range
 /// <paramref name="left"/>..<paramref name="right"/>, using the shaper looked up
 /// from <paramref name="wsc"/>.
 /// </summary>
 /// <param name="left">First position of the range; stored in <c>this.left</c>.</param>
 /// <param name="right">Last position of the range; stored in <c>this.right</c>.</param>
 /// <param name="wsc">Word shape classifier identifier passed to <c>WordShapeClassifier.LookupShaper</c>.</param>
 internal ExtractorWordShapeConjunction(int left, int right, string wsc)
     : base()
 {
     this.left  = left;
     this.right = right;
     wordShaper = WordShapeClassifier.LookupShaper(wsc);
     name       = string.Format("ExtractorWordShapeConjunction({0},{1},{2})", left, right, wsc);
 }
        /// <summary>
        /// Returns the shape of <paramref name="word"/>, consulting the word-shape cache
        /// held by <c>constVars</c> first and filling it on a miss.
        /// </summary>
        /// <param name="word">Word to classify; also used as the cache key.</param>
        /// <returns>The cached shape, or a freshly computed one that has just been cached.</returns>
        public virtual string WordShape(string word)
        {
            // NOTE(review): this relies on the cache indexer yielding null for an unknown key;
            // confirm against the type returned by GetWordShapeCache().
            string cached = constVars.GetWordShapeCache()[word];
            if (cached != null)
            {
                return cached;
            }

            string computed = WordShapeClassifier.WordShape(word, constVars.wordShaper);
            constVars.GetWordShapeCache()[word] = computed;
            return computed;
        }
示例#6
0
        /// <summary>
        /// Concatenates the word shapes at positions <c>left</c> through <c>right</c>
        /// (inclusive), separated by <c>'|'</c>.
        /// </summary>
        /// <param name="h">History used to look up each word.</param>
        /// <param name="pH">Pairs holder queried via <c>GetWord</c> for each position.</param>
        /// <returns>The joined shapes; an empty string when <c>left</c> exceeds <c>right</c>.</returns>
        internal override string Extract(History h, PairsHolder pH)
        {
            StringBuilder joined = new StringBuilder();
            int pos = left;

            while (pos <= right)
            {
                // Separator goes in front of every shape except the first one.
                if (pos > left)
                {
                    joined.Append('|');
                }
                joined.Append(WordShapeClassifier.WordShape(pH.GetWord(h, pos), wordShaper));
                pos++;
            }
            return joined.ToString();
        }
        /// <summary>
        /// Looks up the distributional-similarity class recorded for a word.
        /// </summary>
        /// <remarks>
        /// The word is normalized before lookup: lower-cased when <c>cased</c> is false,
        /// then replaced by its <c>Wordshapedigits</c> shape when <c>numberEquivalence</c> is true.
        /// </remarks>
        /// <param name="word">Word to look up.</param>
        /// <returns>The class stored in <c>lexicon</c>, or <c>unknownWordClass</c> when the lookup yields null.</returns>
        public virtual string DistSimClass(string word)
        {
            string key = cased ? word : word.ToLower();

            if (numberEquivalence)
            {
                key = WordShapeClassifier.WordShape(key, WordShapeClassifier.Wordshapedigits);
            }

            // NOTE(review): this relies on the lexicon indexer yielding null for an unknown key;
            // confirm against the type returned by Generics.NewHashMap.
            string found = lexicon[key];
            return found ?? unknownWordClass;
        }
        /// <summary>
        /// Loads a distributional-similarity lexicon from a file, one entry per line.
        /// </summary>
        /// <remarks>
        /// Two line layouts are handled. When <paramref name="format"/> is "terryKoo", a line
        /// is split on tabs into class then word, and the class is truncated to
        /// <paramref name="distSimMaxBits"/> characters when that limit is positive. Any other
        /// format (the "alexClark" layout) is split on runs of whitespace into word then class.
        /// Words are normalized the same way <c>DistSimClass</c> normalizes its argument.
        /// </remarks>
        /// <param name="filename">File to read the lexicon from.</param>
        /// <param name="format">Line layout; "terryKoo" selects the tab-separated form.</param>
        /// <param name="encoding">Encoding passed to <c>ObjectBank.GetLineIterator</c>.</param>
        /// <param name="distSimMaxBits">Maximum class length for the terryKoo layout; values &lt;= 0 disable truncation.</param>
        /// <param name="cased">When false, words are lower-cased before being stored.</param>
        /// <param name="numberEquivalence">When true, words are replaced by their <c>Wordshapedigits</c> shape before being stored.</param>
        /// <param name="unknownWordClass">Class stored for later use when a word is not in the lexicon.</param>
        public DistSimClassifier(string filename, string format, string encoding, int distSimMaxBits, bool cased, bool numberEquivalence, string unknownWordClass)
        {
            this.cased             = cased;
            this.numberEquivalence = numberEquivalence;
            this.unknownWordClass  = unknownWordClass;
            Timing.StartDoing("Loading distsim lexicon from " + filename);
            lexicon = Generics.NewHashMap(1 << 15);
            // make a reasonable starting size
            bool terryKoo = "terryKoo".Equals(format);

            foreach (string line in ObjectBank.GetLineIterator(filename, encoding))
            {
                string word;
                string wordClass;
                if (terryKoo)
                {
                    // The separator is a regular-expression pattern, so it must go through
                    // Regex.Split; string.Split would look for the literal text backslash-t.
                    string[] bits = System.Text.RegularExpressions.Regex.Split(line, "\\t");
                    word      = bits[1];
                    wordClass = bits[0];
                    if (distSimMaxBits > 0 && wordClass.Length > distSimMaxBits)
                    {
                        wordClass = Sharpen.Runtime.Substring(wordClass, 0, distSimMaxBits);
                    }
                }
                else
                {
                    // "alexClark"
                    // Same reasoning as above: "\\s+" is a pattern, not a literal separator.
                    string[] bits = System.Text.RegularExpressions.Regex.Split(line, "\\s+");
                    word      = bits[0];
                    wordClass = bits[1];
                }
                if (!cased)
                {
                    word = word.ToLower();
                }
                if (numberEquivalence)
                {
                    word = WordShapeClassifier.WordShape(word, WordShapeClassifier.Wordshapedigits);
                }
                lexicon[word] = wordClass;
            }
            Timing.EndDoing();
        }
        /// <summary>
        /// Annotates every token of a document in place: its position, optionally its word
        /// shape, and an interned, fixed-up text (or character) value.
        /// </summary>
        /// <remarks>
        /// Word shapes are only computed when <c>flags.wordShape</c> is above
        /// <c>WordShapeClassifier.Nowordshape</c> and <c>flags.useShapeStrings</c> is false.
        /// Non-empty words whose first character is lower case are added to
        /// <c>knownLCWords</c> before the shape is computed.
        /// </remarks>
        /// <param name="doc">Tokens to annotate; each element is modified through <c>Set</c>.</param>
        // NOTE(review): the parameter is declared as IList<In> while the loop variable is typed IN;
        // confirm which spelling the enclosing class declares.
        private void DoBasicStuff(IList <In> doc)
        {
            int position = 0;

            foreach (IN fl in doc)
            {
                // position in document
                // ToString is an instance method on int; invariant culture keeps the stored
                // value independent of the machine's locale.
                fl.Set(typeof(CoreAnnotations.PositionAnnotation), (position++).ToString(System.Globalization.CultureInfo.InvariantCulture));
                // word shape
                if ((flags.wordShape > WordShapeClassifier.Nowordshape) && !flags.useShapeStrings)
                {
                    // TODO: if we pass in a FeatureFactory, as suggested by an earlier comment,
                    // we should use that FeatureFactory's getWord function
                    string word = fl.Get(typeof(CoreAnnotations.TextAnnotation));
                    if (flags.wordFunction != null)
                    {
                        word = flags.wordFunction.Apply(word);
                    }
                    // char.IsLower(string, int) is the framework's lower-case test for the
                    // character at a given index of a string.
                    if (word.Length > 0 && char.IsLower(word, 0))
                    {
                        knownLCWords.Add(word);
                    }
                    string s = Intern(WordShapeClassifier.WordShape(word, flags.wordShape, knownLCWords));
                    fl.Set(typeof(CoreAnnotations.ShapeAnnotation), s);
                }
                // normalizing and interning was the following; should presumably now be:
                // if ("CTBSegDocumentReader".equalsIgnoreCase(flags.documentReader)) {
                if (Sharpen.Runtime.EqualsIgnoreCase("edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter", flags.readerAndWriter))
                {
                    // for Chinese segmentation, "word" is no use and ignore goldAnswer for memory efficiency.
                    fl.Set(typeof(CoreAnnotations.CharAnnotation), Intern(Fix(fl.Get(typeof(CoreAnnotations.CharAnnotation)))));
                }
                else
                {
                    fl.Set(typeof(CoreAnnotations.TextAnnotation), Intern(Fix(fl.Get(typeof(CoreAnnotations.TextAnnotation)))));
                    // only override GoldAnswer if not set - so that a DocumentReaderAndWriter can set it right in the first place.
                    if (fl.Get(typeof(CoreAnnotations.GoldAnswerAnnotation)) == null)
                    {
                        fl.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), fl.Get(typeof(CoreAnnotations.AnswerAnnotation)));
                    }
                }
            }
        }
示例#10
0
        /// <summary>
        /// Prints, for each input word, the shape produced under the given word-shape
        /// setting, followed by the expected shape in parentheses when one is available.
        /// </summary>
        /// <param name="wordshape">Word-shape setting passed to <c>WordShapeClassifier.WordShape</c>.</param>
        /// <param name="in">Input words.</param>
        /// <param name="shape">Expected shapes; may be shorter than <paramref name="in"/>.</param>
        /// <param name="knownLCWords">
        /// Known lower-case words handed to the classifier as a set, or null to pass no set.
        /// </param>
        public static void OutputResults(int wordshape, string[] @in, string[] shape, string[] knownLCWords)
        {
            System.Console.Out.WriteLine("======================");
            System.Console.Out.WriteLine(" Classifier " + wordshape);
            System.Console.Out.WriteLine("======================");
            ICollection <string> knownLCset = null;

            if (knownLCWords != null)
            {
                // Build the set from the parameter that was just null-checked, so the words
                // supplied by the caller are the ones actually used.
                knownLCset = new HashSet <string>(knownLCWords);
            }
            for (int i = 0; i < @in.Length; ++i)
            {
                string result = WordShapeClassifier.WordShape(@in[i], wordshape, knownLCset);
                System.Console.Out.Write("  " + @in[i] + ": " + result);
                // Expected shapes are optional per index: print one only when it exists.
                if (i < shape.Length)
                {
                    System.Console.Out.Write("  (" + shape[i] + ")");
                }
                System.Console.Out.WriteLine();
            }
        }