コード例 #1
0
 /// <summary>Configures morpho-syntactic annotations for POS tags.</summary>
 /// <param name="activeFeats">
 /// A comma-separated list of feature values with names according
 /// to MorphoFeatureType.
 /// </param>
 private string SetupMorphoFeatures(string activeFeats)
 {
     string[] feats = activeFeats.Split(",");
     morphoSpec = tlp.MorphFeatureSpec();
     foreach (string feat in feats)
     {
         MorphoFeatureSpecification.MorphoFeatureType fType = MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feat.Trim());
         morphoSpec.Activate(fType);
     }
     return(morphoSpec.ToString());
 }
コード例 #2
0
 public override void Setup(File path, params string[] options)
 {
     //Setup the Bies tag mapping
     base.Setup(path, new string[0]);
     foreach (string opt in options)
     {
         string[] optToks = opt.Split(":");
         if (optToks[0].Equals("UniversalMap") && optToks.Length == 2)
         {
             LoadUniversalMap(optToks[1]);
         }
         else
         {
             //Maybe it's a morphological feature
             //Both of these calls will throw exceptions if the feature is illegal/invalid
             MorphoFeatureSpecification.MorphoFeatureType feat = MorphoFeatureSpecification.MorphoFeatureType.ValueOf(optToks[0]);
             IList <string> featVals = morphoSpec.GetValues(feat);
             morphoSpec.Activate(feat);
         }
     }
 }
コード例 #3
0
        //  private static String stripTag(String tag) {
        //    if (tag.startsWith("DT")) {
        //      String newTag = tag.substring(2, tag.length());
        //      return newTag.length() > 0 ? newTag : tag;
        //    }
        //    return tag;
        //  }
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
                System.Environment.Exit(-1);
            }
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;

            if (language.Equals(Language.Arabic))
            {
                string[] options = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            else
            {
                string[] options = new string[] { "-frenchFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            Treebank tb = tlpp.DiskTreebank();

            tb.LoadPath(args[1]);
            MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();

            string[] features = args[2].Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            // Counters
            ICounter <string> wordTagCounter  = new ClassicCounter <string>(30000);
            ICounter <string> morphTagCounter = new ClassicCounter <string>(500);
            //    Counter<String> signatureTagCounter = new ClassicCounter<String>();
            ICounter <string> morphCounter           = new ClassicCounter <string>(500);
            ICounter <string> wordCounter            = new ClassicCounter <string>(30000);
            ICounter <string> tagCounter             = new ClassicCounter <string>(300);
            ICounter <string> lemmaCounter           = new ClassicCounter <string>(25000);
            ICounter <string> lemmaTagCounter        = new ClassicCounter <string>(25000);
            ICounter <string> richTagCounter         = new ClassicCounter <string>(1000);
            ICounter <string> reducedTagCounter      = new ClassicCounter <string>(500);
            ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500);
            IDictionary <string, ICollection <string> > wordLemmaMap           = Generics.NewHashMap();
            TwoDimensionalIntCounter <string, string>   lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000);
            TwoDimensionalIntCounter <string, string>   reducedTagTagCounter   = new TwoDimensionalIntCounter <string, string>(500);
            TwoDimensionalIntCounter <string, string>   tagReducedTagCounter   = new TwoDimensionalIntCounter <string, string>(300);
            int numTrees = 0;

            foreach (Tree tree in tb)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                IList <ILabel> pretermList = tree.PreTerminalYield();
                IList <ILabel> yield       = tree.Yield();
                System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag   = pretermList[i].Value();
                    string word  = yield[i].Value();
                    string morph = ((CoreLabel)yield[i]).OriginalText();
                    // Note: if there is no lemma, then we use the surface form.
                    Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
                    string lemma   = lemmaTag.First();
                    string richTag = lemmaTag.Second();
                    // WSGDEBUG
                    if (tag.Contains("MW"))
                    {
                        lemma += "-MWE";
                    }
                    lemmaCounter.IncrementCount(lemma);
                    lemmaTagCounter.IncrementCount(lemma + tag);
                    richTagCounter.IncrementCount(richTag);
                    string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
                    reducedTagCounter.IncrementCount(reducedTag);
                    reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
                    wordTagCounter.IncrementCount(word + tag);
                    morphTagCounter.IncrementCount(morph + tag);
                    morphCounter.IncrementCount(morph);
                    wordCounter.IncrementCount(word);
                    tagCounter.IncrementCount(tag);
                    reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;
                    if (wordLemmaMap.Contains(word))
                    {
                        wordLemmaMap[word].Add(lemma);
                    }
                    else
                    {
                        ICollection <string> lemmas = Generics.NewHashSet(1);
                        wordLemmaMap[word] = lemmas;
                    }
                    lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
                    reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
                    tagReducedTagCounter.IncrementCount(tag, reducedTag);
                }
                ++numTrees;
            }
            // Barf...
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.Printf("#trees:\t%d%n", numTrees);
            System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
            System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
            System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
            System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
            // Extra
            System.Console.Out.WriteLine("==================");
            StringBuilder sbNoLemma    = new StringBuilder();
            StringBuilder sbMultLemmas = new StringBuilder();

            foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap)
            {
                string word = wordLemmas.Key;
                ICollection <string> lemmas = wordLemmas.Value;
                if (lemmas.Count == 0)
                {
                    sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
                    continue;
                }
                if (lemmas.Count > 1)
                {
                    sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
                    continue;
                }
                string lemma = lemmas.GetEnumerator().Current;
                ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
                if (reducedTags.Count > 1)
                {
                    System.Console.Out.Printf("%s --> %s%n", word, lemma);
                    foreach (string reducedTag in reducedTags)
                    {
                        int    count   = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                        string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                        System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
                    }
                    System.Console.Out.WriteLine();
                }
            }
            System.Console.Out.WriteLine("==================");
            System.Console.Out.WriteLine(sbNoLemma.ToString());
            System.Console.Out.WriteLine(sbMultLemmas.ToString());
            System.Console.Out.WriteLine("==================");
            IList <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet());

            tags.Sort();
            foreach (string tag_1 in tags)
            {
                System.Console.Out.WriteLine(tag_1);
                ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
                foreach (string reducedTag in reducedTags)
                {
                    int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
                    //        reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
                    System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
                }
                System.Console.Out.WriteLine();
            }
            System.Console.Out.WriteLine("==================");
        }
コード例 #4
0
 public override int SetOptionFlag(string[] args, int i)
 {
     if (annotations.Contains(args[i]))
     {
         AddFeature(args[i]);
         i++;
     }
     else
     {
         if (args[i].Equals("-collinizerRetainsPunctuation"))
         {
             optionsString.Append("Collinizer retains punctuation.\n");
             collinizerRetainsPunctuation = true;
             i++;
         }
         else
         {
             if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-headFinder") && (i + 1 < args.Length))
             {
                 try
                 {
                     IHeadFinder hf = (IHeadFinder)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                     SetHeadFinder(hf);
                     optionsString.Append("HeadFinder: " + args[i + 1] + "\n");
                 }
                 catch (Exception e)
                 {
                     log.Info(e);
                     log.Info(this.GetType().FullName + ": Could not load head finder " + args[i + 1]);
                 }
                 i += 2;
             }
             else
             {
                 if (args[i].Equals("-xmlFormat"))
                 {
                     optionsString.Append("Reading trees in XML format.\n");
                     readPennFormat = false;
                     SetInputEncoding(tlp.GetEncoding());
                     i++;
                 }
                 else
                 {
                     if (args[i].Equals("-frenchFactored"))
                     {
                         foreach (string feature in factoredFeatures)
                         {
                             AddFeature(feature);
                         }
                         i++;
                     }
                     else
                     {
                         if (args[i].Equals("-frenchMWMap"))
                         {
                             LoadMWMap(args[i + 1]);
                             i += 2;
                         }
                         else
                         {
                             if (args[i].Equals("-tsg"))
                             {
                                 //wsg2011: These features should be removed for TSG extraction.
                                 //If they are retained, the resulting grammar seems to be too brittle....
                                 optionsString.Append("Removing baseline features: -markVN, -coord1");
                                 RemoveFeature("-markVN");
                                 optionsString.Append(" (removed -markVN)");
                                 RemoveFeature("-coord1");
                                 optionsString.Append(" (removed -coord1)\n");
                                 i++;
                             }
                             else
                             {
                                 if (args[i].Equals("-factlex") && (i + 1 < args.Length))
                                 {
                                     string activeFeats = SetupMorphoFeatures(args[i + 1]);
                                     optionsString.Append("Factored Lexicon: active features: ").Append(activeFeats);
                                     // WSGDEBUG Maybe add -mweTag in place of -tagPAFr?
                                     RemoveFeature("-tagPAFr");
                                     optionsString.Append(" (removed -tagPAFr)\n");
                                     // Add -mweTag
                                     string[] option = new string[] { "-mweTag" };
                                     SetOptionFlag(option, 0);
                                     i += 2;
                                 }
                                 else
                                 {
                                     if (args[i].Equals("-noFeatures"))
                                     {
                                         foreach (string feature in annotations.Keys)
                                         {
                                             RemoveFeature(feature);
                                         }
                                         optionsString.Append("Removed all manual features.\n");
                                         i++;
                                     }
                                     else
                                     {
                                         if (args[i].Equals("-ccTagsetAnnotations"))
                                         {
                                             tagSpec = new FrenchMorphoFeatureSpecification();
                                             tagSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Other);
                                             optionsString.Append("Adding CC tagset as POS state splits.\n");
                                             ++i;
                                         }
                                     }
                                 }
                             }
                         }
                     }
                 }
             }
         }
     }
     return(i);
 }
コード例 #5
0
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
                System.Environment.Exit(-1);
            }
            // Command line options
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;
            Treebank trainTreebank         = tlpp.DiskTreebank();

            trainTreebank.LoadPath(args[2]);
            Treebank devTreebank = tlpp.DiskTreebank();

            devTreebank.LoadPath(args[3]);
            MorphoFeatureSpecification morphoSpec;
            Options options = GetOptions(language);

            if (language.Equals(Language.Arabic))
            {
                morphoSpec = new ArabicMorphoFeatureSpecification();
                string[] languageOptions = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(languageOptions, 0);
            }
            else
            {
                if (language.Equals(Language.French))
                {
                    morphoSpec = new FrenchMorphoFeatureSpecification();
                    string[] languageOptions = new string[] { "-frenchFactored" };
                    tlpp.SetOptionFlag(languageOptions, 0);
                }
                else
                {
                    throw new NotSupportedException();
                }
            }
            string featureList = args[1];

            string[] features = featureList.Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.WriteLine("Features: " + args[1]);
            // Create word and tag indices
            // Save trees in a collection since the interface requires that....
            System.Console.Out.Write("Loading training trees...");
            IList <Tree>    trainTrees = new List <Tree>(19000);
            IIndex <string> wordIndex  = new HashIndex <string>();
            IIndex <string> tagIndex   = new HashIndex <string>();

            foreach (Tree tree in trainTreebank)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                trainTrees.Add(tree);
            }
            System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);
            // Setup and train the lexicon.
            System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
            lexicon.InitializeTraining(trainTrees.Count);
            lexicon.Train(trainTrees, null);
            lexicon.FinishTraining();
            System.Console.Out.WriteLine("Done!");
            trainTrees = null;
            // Load the tuning set
            System.Console.Out.Write("Loading tuning set...");
            IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);

            System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);
            // Print the probabilities that we obtain
            // TODO(spenceg): Implement tagging accuracy with FactLex
            int nCorrect             = 0;
            ICounter <string> errors = new ClassicCounter <string>();

            foreach (FactoredLexiconEvent @event in tuningSet)
            {
                IEnumerator <IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr());
                ICounter <int> logScores        = new ClassicCounter <int>();
                bool           noRules          = true;
                int            goldTagId        = -1;
                while (itr.MoveNext())
                {
                    noRules = false;
                    IntTaggedWord iTW = itr.Current;
                    if (iTW.Tag() == @event.TagId())
                    {
                        log.Info("GOLD-");
                        goldTagId = iTW.Tag();
                    }
                    float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr());
                    logScores.IncrementCount(iTW.Tag(), tagScore);
                }
                if (noRules)
                {
                    System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr());
                }
                else
                {
                    // Score the tagging
                    int hypTagId = Counters.Argmax(logScores);
                    if (hypTagId == goldTagId)
                    {
                        ++nCorrect;
                    }
                    else
                    {
                        string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId);
                        errors.IncrementCount(goldTag);
                    }
                }
                log.Info();
            }
            // Output accuracy
            double acc = (double)nCorrect / (double)tuningSet.Count;

            System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
            log.Info("% of errors by type:");
            IList <string> biggestKeys = new List <string>(errors.KeySet());

            biggestKeys.Sort(Counters.ToComparator(errors, false, true));
            Counters.Normalize(errors);
            foreach (string key in biggestKeys)
            {
                System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0);
            }
        }