MorphoFeatureSpecification C# (CSharp)のコード例

コード例 #1

0

ファイルを表示

ファイル: FactoredLexicon.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Scores a word/tag pairing as the sum of the log word-tag probability and
        /// the log morphology-tag probability.
        /// </summary>
        /// <param name="iTW">Word id / tag id pair being scored.</param>
        /// <param name="loc">Position of the word; passed through to <c>ProbWordTag</c>.</param>
        /// <param name="word">Surface form of the word.</param>
        /// <param name="featureSpec">Morphological analysis string attached to the word.</param>
        /// <returns>
        /// 0 for the boundary word carrying the boundary tag; otherwise the combined
        /// log score, or negative infinity when that score is not greater than -100.
        /// </returns>
        public override float Score(IntTaggedWord iTW, int loc, string word, string featureSpec)
        {
            int wordId = iTW.Word();
            int tagId  = iTW.Tag();
            // Force 1-best path to go through the boundary symbol
            // (deterministic tagging)
            int boundaryId    = wordIndex.IndexOf(LexiconConstants.Boundary);
            int boundaryTagId = tagIndex.IndexOf(LexiconConstants.BoundaryTag);

            if (wordId == boundaryId && tagId == boundaryTagId)
            {
                return 0.0f;
            }

            // Morphological features: only the morphology half of the split is
            // needed because the lemma factor below is disabled.
            Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureSpec);
            string richMorphTag    = lemmaMorph.Second();
            string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();

            if (reducedMorphTag.Length == 0)
            {
                // No active feature survived the reduction.
                reducedMorphTag = NoMorphAnalysis;
            }
            // NOTE(review): AddToIndex presumably registers a tag that was not seen
            // in training, so scoring can grow morphIndex -- confirm this is intended.
            int morphId = morphIndex.AddToIndex(reducedMorphTag);

            // Score the factors and create the rule score p_W_T
            double p_W_Tf = Math.Log(ProbWordTag(word, loc, wordId, tagId));
            // The lemma/tag factor is switched off; it contributes log(1) = 0.
            double p_L_T = 0.0;
            double p_M_T = Math.Log(ProbMorphTag(tagId, morphId));
            double p_W_T = p_W_Tf + p_L_T + p_M_T;

            // Filter low probability taggings
            return p_W_T > -100.0 ? (float)p_W_T : float.NegativeInfinity;
        }

コード例 #2

0

ファイルを表示

ファイル: FrenchTreebankParserParams.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Applies the base class transformation and then, when a tag specification
        /// is configured and <paramref name="t"/> is a preterminal, rewrites the
        /// node's category using the morphological analysis stored on its child.
        /// </summary>
        /// <param name="t">Subtree to transform.</param>
        /// <param name="root">Root of the tree containing <paramref name="t"/>.</param>
        /// <returns>The transformed subtree with its value (and tag, if any) updated.</returns>
        /// <exception cref="Exception">
        /// The preterminal's child label is not a <c>CoreLabel</c> or has no original text.
        /// </exception>
        public override Tree TransformTree(Tree t, Tree root)
        {
            // Perform tregex-powered annotations
            t = base.TransformTree(t, root);
            string cat = t.Value();

            // Add morphosyntactic features if this is a POS tag
            if (t.IsPreTerminal() && tagSpec != null)
            {
                CoreLabel childLabel = t.FirstChild().Label() as CoreLabel;
                string    morphoStr  = childLabel == null ? null : childLabel.OriginalText();
                if (morphoStr == null)
                {
                    // .NET composite formatting uses {n} placeholders; the Java-style
                    // "%s" specifiers would be emitted literally and drop both arguments.
                    throw new Exception(string.Format("{0}: Term lacks morpho analysis: {1}", this.GetType().FullName, t.ToString()));
                }
                // Only the morphology half of the split is used, so no word is supplied.
                Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(string.Empty, morphoStr);
                MorphoFeatures        feats      = tagSpec.StrToFeatures(lemmaMorph.Second());
                cat = feats.GetTag(cat);
            }
            // Update the label(s)
            t.SetValue(cat);
            if (t.IsPreTerminal() && t.Label() is IHasTag)
            {
                ((IHasTag)t.Label()).SetTag(cat);
            }
            return t;
        }

コード例 #3

0

ファイルを表示

 /// <summary>
 /// Creates the mapper, passing <c>false</c> to the base constructor and
 /// initialising the tag map and the Arabic morphological feature specification.
 /// </summary>
 public UniversalPOSMapper()
     : base(false)
 {
     // Don't add the determiner split
     morphoSpec   = new ArabicMorphoFeatureSpecification();
     universalMap = Generics.NewHashMap();
 }

コード例 #4

0

ファイルを表示

ファイル: FrenchTreebankParserParams.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

 /// <summary>
 /// Fetches the language pack's morphological feature specification and
 /// activates every feature type named in <paramref name="activeFeats"/>.
 /// </summary>
 /// <param name="activeFeats">
 /// Comma-separated feature names; each is trimmed and resolved through
 /// <c>MorphoFeatureType.ValueOf</c>.
 /// </param>
 /// <returns>The string form of the configured specification.</returns>
 private string SetupMorphoFeatures(string activeFeats)
 {
     string[] featureNames = activeFeats.Split(",");

     morphoSpec = tlp.MorphFeatureSpec();
     for (int k = 0; k < featureNames.Length; k++)
     {
         string featureName = featureNames[k].Trim();
         morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(featureName));
     }
     string description = morphoSpec.ToString();
     return description;
 }

コード例 #5

0

ファイルを表示

ファイル: TreeToMorfette.cs プロジェクト: awesomedotnetcore/Stanford.CoreNLP.NET

        /// <summary>
        /// Reads the trees in a file and prints one "word lemma morph" line per
        /// token, with a blank line after each tree.
        /// </summary>
        /// <param name="args">A single element: the path of the tree file.</param>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
                System.Environment.Exit(-1);
            }
            string             treeFile = args[0];
            ITreeReaderFactory trf      = new FrenchTreeReaderFactory();

            try
            {
                ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
                try
                {
                    for (Tree tree1; (tree1 = tr.ReadTree()) != null;)
                    {
                        IList <ILabel> pretermYield = tree1.PreTerminalYield();
                        IList <ILabel> yield        = tree1.Yield();
                        int            yieldLen     = yield.Count;
                        for (int i = 0; i < yieldLen; ++i)
                        {
                            CoreLabel             rawToken   = (CoreLabel)yield[i];
                            string                word       = rawToken.Value();
                            string                morphStr   = rawToken.OriginalText();
                            Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, morphStr);
                            string                lemma      = lemmaMorph.First();
                            string                morph      = lemmaMorph.Second();
                            // Fall back to the POS tag when there is no usable analysis.
                            if (morph == null || morph.Equals(string.Empty) || morph.Equals("XXX"))
                            {
                                morph = ((CoreLabel)pretermYield[i]).Value();
                            }
                            System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                        }
                        System.Console.Out.WriteLine();
                    }
                }
                finally
                {
                    // Close the reader even when reading or printing fails; it sits
                    // inside the outer try so a failure in Close is still reported below.
                    tr.Close();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }

コード例 #6

0

ファイルを表示

ファイル: FrenchTreeNormalizer.cs プロジェクト: awesomedotnetcore/Stanford.CoreNLP.NET

        /// <summary>
        /// Replaces the value and tag of a preterminal's label with the alternate tag
        /// derived from the morphological analysis of its first child, when that
        /// alternate tag is non-empty.
        /// </summary>
        /// <param name="t">Preterminal node whose label and first child's label are <c>CoreLabel</c>s.</param>
        /// <param name="morpho">Specification used to parse the morphological string.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="t"/> is not a preterminal, or one of the labels is not a <c>CoreLabel</c>.
        /// </exception>
        private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
        {
            if (!t.IsPreTerminal())
            {
                throw new ArgumentException("Can only operate on preterminals");
            }

            CoreLabel label = t.Label() as CoreLabel;
            if (label == null)
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }

            Tree      firstChild = t.Children()[0];
            CoreLabel childLabel = firstChild.Label() as CoreLabel;
            if (childLabel == null)
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }

            // Morphological Analysis
            string morphStr = childLabel.OriginalText();
            if (string.IsNullOrEmpty(morphStr))
            {
                // No analysis on the token: build one from the POS tag and, when
                // present, the POS subcategory.
                string posTag = label.Value();
                string subCat = childLabel.Category();
                string suffix = string.IsNullOrEmpty(subCat) ? "---" : "-" + subCat + "--";
                morphStr = posTag + suffix;
            }

            string altTag = morpho.StrToFeatures(morphStr).GetAltTag();
            if (string.IsNullOrEmpty(altTag))
            {
                return;
            }
            label.SetValue(altTag);
            label.SetTag(altTag);
        }

コード例 #7

0

ファイルを表示

ファイル: FactoredLexicon.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Flattens a treebank into one <c>FactoredLexiconEvent</c> per token so the
        /// optimizer can iterate over them quickly.
        /// </summary>
        /// <param name="treebank">Trees to convert.</param>
        /// <param name="lexicon">Lexicon whose word, tag and morph indices supply the ids.</param>
        /// <returns>
        /// The events in tree and token order; tokens whose tag is not in the
        /// lexicon's tag index are logged and left out.
        /// </returns>
        private static IList <FactoredLexiconEvent> TreebankToLexiconEvents(IList <Tree> treebank, Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon)
        {
            IList <FactoredLexiconEvent> result = new List <FactoredLexiconEvent>(70000);

            foreach (Tree sentence in treebank)
            {
                IList <ILabel> tokens       = sentence.Yield();
                IList <ILabel> preterminals = sentence.PreTerminalYield();
                System.Diagnostics.Debug.Assert(tokens.Count == preterminals.Count);

                for (int pos = 0, numTokens = tokens.Count; pos < numTokens; ++pos)
                {
                    string tag    = preterminals[pos].Value();
                    int    tagId  = lexicon.tagIndex.IndexOf(tag);
                    string word   = tokens[pos].Value();
                    int    wordId = lexicon.wordIndex.IndexOf(word);

                    if (tagId >= 0)
                    {
                        string featureStr = ((CoreLabel)tokens[pos]).OriginalText();
                        Pair <string, string> split = MorphoFeatureSpecification.SplitMorphString(word, featureStr);
                        string lemma      = split.First();
                        string reducedTag = lexicon.morphoSpec.StrToFeatures(split.Second()).ToString();
                        if (reducedTag.Length == 0)
                        {
                            reducedTag = NoMorphAnalysis;
                        }
                        int lemmaId = lexicon.wordIndex.IndexOf(lemma);
                        int morphId = lexicon.morphIndex.IndexOf(reducedTag);
                        result.Add(new FactoredLexiconEvent(wordId, tagId, lemmaId, morphId, pos, word, featureStr));
                    }
                    else
                    {
                        // The tag is not in the lexicon's tag index: drop the example.
                        log.Info("Discarding training example: " + word + " " + tag);
                    }
                }
            }
            return result;
        }

コード例 #8

0

ファイルを表示

        //  private static String stripTag(String tag) {
        //    if (tag.startsWith("DT")) {
        //      String newTag = tag.substring(2, tag.length());
        //      return newTag.length() > 0 ? newTag : tag;
        //    }
        //    return tag;
        //  }
        /// <summary>
        /// Loads a treebank, counts words, tags, lemmas and rich/reduced
        /// morphological tags over it, and prints summary statistics.
        /// </summary>
        /// <param name="args">
        /// Three elements: the language name, the treebank path, and a
        /// comma-separated list of morphological feature types to activate.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 3)
            {
                System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
                System.Environment.Exit(-1);
            }
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;

            if (language.Equals(Language.Arabic))
            {
                string[] options = new string[] { "-arabicFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            else
            {
                string[] options = new string[] { "-frenchFactored" };
                tlpp.SetOptionFlag(options, 0);
            }
            Treebank tb = tlpp.DiskTreebank();

            tb.LoadPath(args[1]);
            MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification();

            string[] features = args[2].Trim().Split(",");
            foreach (string feature in features)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
            }
            // Counters
            ICounter <string> wordTagCounter  = new ClassicCounter <string>(30000);
            ICounter <string> morphTagCounter = new ClassicCounter <string>(500);
            ICounter <string> morphCounter           = new ClassicCounter <string>(500);
            ICounter <string> wordCounter            = new ClassicCounter <string>(30000);
            ICounter <string> tagCounter             = new ClassicCounter <string>(300);
            ICounter <string> lemmaCounter           = new ClassicCounter <string>(25000);
            ICounter <string> lemmaTagCounter        = new ClassicCounter <string>(25000);
            ICounter <string> richTagCounter         = new ClassicCounter <string>(1000);
            ICounter <string> reducedTagCounter      = new ClassicCounter <string>(500);
            ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500);
            IDictionary <string, ICollection <string> > wordLemmaMap           = Generics.NewHashMap();
            TwoDimensionalIntCounter <string, string>   lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000);
            TwoDimensionalIntCounter <string, string>   reducedTagTagCounter   = new TwoDimensionalIntCounter <string, string>(500);
            TwoDimensionalIntCounter <string, string>   tagReducedTagCounter   = new TwoDimensionalIntCounter <string, string>(300);
            int numTrees = 0;

            foreach (Tree tree in tb)
            {
                foreach (Tree subTree in tree)
                {
                    if (!subTree.IsLeaf())
                    {
                        tlpp.TransformTree(subTree, tree);
                    }
                }
                IList <ILabel> pretermList = tree.PreTerminalYield();
                IList <ILabel> yield       = tree.Yield();
                System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    string tag   = pretermList[i].Value();
                    string word  = yield[i].Value();
                    string morph = ((CoreLabel)yield[i]).OriginalText();
                    // Note: if there is no lemma, then we use the surface form.
                    Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
                    string lemma   = lemmaTag.First();
                    string richTag = lemmaTag.Second();
                    // WSGDEBUG
                    if (tag.Contains("MW"))
                    {
                        lemma += "-MWE";
                    }
                    lemmaCounter.IncrementCount(lemma);
                    lemmaTagCounter.IncrementCount(lemma + tag);
                    richTagCounter.IncrementCount(richTag);
                    string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
                    reducedTagCounter.IncrementCount(reducedTag);
                    reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
                    wordTagCounter.IncrementCount(word + tag);
                    morphTagCounter.IncrementCount(morph + tag);
                    morphCounter.IncrementCount(morph);
                    wordCounter.IncrementCount(word);
                    tagCounter.IncrementCount(tag);
                    // The counters above use the raw (possibly empty) reduced tag;
                    // the two-dimensional counters below use the "NONE" placeholder.
                    reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;

                    // Record the lemma for this word. The lemma must also be added the
                    // first time the word is seen, otherwise every word that occurs once
                    // ends up with an empty lemma set.
                    ICollection <string> knownLemmas;
                    if (!wordLemmaMap.TryGetValue(word, out knownLemmas))
                    {
                        knownLemmas = Generics.NewHashSet(1);
                        wordLemmaMap[word] = knownLemmas;
                    }
                    knownLemmas.Add(lemma);

                    lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
                    reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
                    tagReducedTagCounter.IncrementCount(tag, reducedTag);
                }
                ++numTrees;
            }
            // Barf...
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.Printf("#trees:\t%d%n", numTrees);
            System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
            System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
            System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
            System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
            System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
            System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
            System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);
            // Extra
            System.Console.Out.WriteLine("==================");
            StringBuilder sbNoLemma    = new StringBuilder();
            StringBuilder sbMultLemmas = new StringBuilder();

            foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap)
            {
                string word = wordLemmas.Key;
                ICollection <string> lemmas = wordLemmas.Value;
                if (lemmas.Count == 0)
                {
                    sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
                    continue;
                }
                if (lemmas.Count > 1)
                {
                    sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
                    continue;
                }
                // Exactly one lemma remains. An enumerator's Current is only valid
                // after MoveNext, so take the element by iterating instead.
                string lemma = null;
                foreach (string onlyLemma in lemmas)
                {
                    lemma = onlyLemma;
                    break;
                }
                ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
                if (reducedTags.Count > 1)
                {
                    System.Console.Out.Printf("%s --> %s%n", word, lemma);
                    foreach (string reducedTag in reducedTags)
                    {
                        int    count   = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                        string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                        System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
                    }
                    System.Console.Out.WriteLine();
                }
            }
            System.Console.Out.WriteLine("==================");
            System.Console.Out.WriteLine(sbNoLemma.ToString());
            System.Console.Out.WriteLine(sbMultLemmas.ToString());
            System.Console.Out.WriteLine("==================");
            IList <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet());

            tags.Sort();
            foreach (string tag_1 in tags)
            {
                System.Console.Out.WriteLine(tag_1);
                ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
                foreach (string reducedTag in reducedTags)
                {
                    int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
                    System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
                }
                System.Console.Out.WriteLine();
            }
            System.Console.Out.WriteLine("==================");
        }

コード例 #9

0

ファイルを表示

ファイル: FrenchTreebankParserParams.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

 /// <summary>
 /// Processes the option at <c>args[i]</c> and returns the index of the next
 /// unprocessed argument.
 /// </summary>
 /// <param name="args">Option array.</param>
 /// <param name="i">Index of the option to examine.</param>
 /// <returns>
 /// <paramref name="i"/> advanced past the option and its value when the option
 /// was handled; <paramref name="i"/> unchanged when it was not recognised or
 /// when an option that needs a value is the last element.
 /// </returns>
 public override int SetOptionFlag(string[] args, int i)
 {
     if (annotations.Contains(args[i]))
     {
         AddFeature(args[i]);
         i++;
     }
     else if (args[i].Equals("-collinizerRetainsPunctuation"))
     {
         optionsString.Append("Collinizer retains punctuation.\n");
         collinizerRetainsPunctuation = true;
         i++;
     }
     else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-headFinder") && (i + 1 < args.Length))
     {
         try
         {
             IHeadFinder hf = (IHeadFinder)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
             SetHeadFinder(hf);
             optionsString.Append("HeadFinder: " + args[i + 1] + "\n");
         }
         catch (Exception e)
         {
             // A head finder that cannot be loaded is logged, and the option
             // and its value are still consumed below.
             log.Info(e);
             log.Info(this.GetType().FullName + ": Could not load head finder " + args[i + 1]);
         }
         i += 2;
     }
     else if (args[i].Equals("-xmlFormat"))
     {
         optionsString.Append("Reading trees in XML format.\n");
         readPennFormat = false;
         SetInputEncoding(tlp.GetEncoding());
         i++;
     }
     else if (args[i].Equals("-frenchFactored"))
     {
         foreach (string feature in factoredFeatures)
         {
             AddFeature(feature);
         }
         i++;
     }
     else if (args[i].Equals("-frenchMWMap") && (i + 1 < args.Length))
     {
         // Bounds-checked like -headFinder and -factlex, so a trailing
         // "-frenchMWMap" is left unconsumed instead of indexing past the array.
         LoadMWMap(args[i + 1]);
         i += 2;
     }
     else if (args[i].Equals("-tsg"))
     {
         //wsg2011: These features should be removed for TSG extraction.
         //If they are retained, the resulting grammar seems to be too brittle....
         optionsString.Append("Removing baseline features: -markVN, -coord1");
         RemoveFeature("-markVN");
         optionsString.Append(" (removed -markVN)");
         RemoveFeature("-coord1");
         optionsString.Append(" (removed -coord1)\n");
         i++;
     }
     else if (args[i].Equals("-factlex") && (i + 1 < args.Length))
     {
         string activeFeats = SetupMorphoFeatures(args[i + 1]);
         optionsString.Append("Factored Lexicon: active features: ").Append(activeFeats);
         // WSGDEBUG Maybe add -mweTag in place of -tagPAFr?
         RemoveFeature("-tagPAFr");
         optionsString.Append(" (removed -tagPAFr)\n");
         // Add -mweTag
         string[] option = new string[] { "-mweTag" };
         SetOptionFlag(option, 0);
         i += 2;
     }
     else if (args[i].Equals("-noFeatures"))
     {
         foreach (string feature in annotations.Keys)
         {
             RemoveFeature(feature);
         }
         optionsString.Append("Removed all manual features.\n");
         i++;
     }
     else if (args[i].Equals("-ccTagsetAnnotations"))
     {
         tagSpec = new FrenchMorphoFeatureSpecification();
         tagSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.Other);
         optionsString.Append("Adding CC tagset as POS state splits.\n");
         ++i;
     }
     return i;
 }

コード例 #10

0

ファイルを表示

ファイル: FactoredLexicon.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

 /// <summary>
 /// Creates a factored lexicon, forwarding the options and the word and tag
 /// indices to the base class and storing the morphological specification.
 /// </summary>
 /// <param name="op">Options passed to the base constructor.</param>
 /// <param name="morphoSpec">Morphological feature specification kept by this lexicon.</param>
 /// <param name="wordIndex">Word index passed to the base constructor.</param>
 /// <param name="tagIndex">Tag index passed to the base constructor.</param>
 public FactoredLexicon(Options op, MorphoFeatureSpecification morphoSpec, IIndex <string> wordIndex, IIndex <string> tagIndex)
     : base(op, wordIndex, tagIndex)
 {
     this.morphoSpec = morphoSpec;
 }

コード例 #11

0

ファイルを表示

ファイル: FactoredLexicon.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Trains a factored lexicon on a training treebank, tags every event of a
        /// tuning set with the best-scoring rule, and prints the accuracy together
        /// with the share of errors per gold tag.
        /// </summary>
        /// <param name="args">
        /// Four elements: the language name, a comma-separated feature list, the
        /// training treebank path and the development treebank path.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
                System.Environment.Exit(-1);
            }

            // Command line options
            Language language = Language.ValueOf(args[0]);
            ITreebankLangParserParams tlpp = language.@params;

            Treebank trainTreebank = tlpp.DiskTreebank();
            trainTreebank.LoadPath(args[2]);

            Treebank devTreebank = tlpp.DiskTreebank();
            devTreebank.LoadPath(args[3]);

            Options options = GetOptions(language);

            // Only Arabic and French have a factored configuration.
            MorphoFeatureSpecification morphoSpec;
            string languageFlag;
            if (language.Equals(Language.Arabic))
            {
                morphoSpec   = new ArabicMorphoFeatureSpecification();
                languageFlag = "-arabicFactored";
            }
            else if (language.Equals(Language.French))
            {
                morphoSpec   = new FrenchMorphoFeatureSpecification();
                languageFlag = "-frenchFactored";
            }
            else
            {
                throw new NotSupportedException();
            }
            tlpp.SetOptionFlag(new string[] { languageFlag }, 0);

            string[] featureNames = args[1].Trim().Split(",");
            for (int f = 0; f < featureNames.Length; f++)
            {
                morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(featureNames[f]));
            }
            System.Console.Out.WriteLine("Language: " + language.ToString());
            System.Console.Out.WriteLine("Features: " + args[1]);

            // Create word and tag indices
            // Save trees in a collection since the interface requires that....
            System.Console.Out.Write("Loading training trees...");
            IList <Tree>    trainTrees = new List <Tree>(19000);
            IIndex <string> wordIndex  = new HashIndex <string>();
            IIndex <string> tagIndex   = new HashIndex <string>();

            foreach (Tree trainTree in trainTreebank)
            {
                foreach (Tree node in trainTree)
                {
                    if (node.IsLeaf())
                    {
                        continue;
                    }
                    tlpp.TransformTree(node, trainTree);
                }
                trainTrees.Add(trainTree);
            }
            System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);

            // Setup and train the lexicon.
            System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
            Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
            lexicon.InitializeTraining(trainTrees.Count);
            lexicon.Train(trainTrees, null);
            lexicon.FinishTraining();
            System.Console.Out.WriteLine("Done!");
            // Drop the reference to the training trees; they are not used again.
            trainTrees = null;

            // Load the tuning set
            System.Console.Out.Write("Loading tuning set...");
            IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
            System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);

            // Print the probabilities that we obtain
            // TODO(spenceg): Implement tagging accuracy with FactLex
            int numCorrect = 0;
            ICounter <string> errors = new ClassicCounter <string>();

            foreach (FactoredLexiconEvent tuningEvent in tuningSet)
            {
                IEnumerator <IntTaggedWord> rules = lexicon.RuleIteratorByWord(tuningEvent.Word(), tuningEvent.GetLoc(), tuningEvent.FeatureStr());
                ICounter <int> logScores = new ClassicCounter <int>();
                bool sawRule   = false;
                int  goldTagId = -1;

                while (rules.MoveNext())
                {
                    sawRule = true;
                    IntTaggedWord candidate = rules.Current;
                    if (candidate.Tag() == tuningEvent.TagId())
                    {
                        log.Info("GOLD-");
                        goldTagId = candidate.Tag();
                    }
                    float tagScore = lexicon.Score(candidate, tuningEvent.GetLoc(), tuningEvent.Word(), tuningEvent.FeatureStr());
                    logScores.IncrementCount(candidate.Tag(), tagScore);
                }

                if (sawRule)
                {
                    // Score the tagging
                    int hypTagId = Counters.Argmax(logScores);
                    if (hypTagId == goldTagId)
                    {
                        ++numCorrect;
                    }
                    else
                    {
                        // A gold tag that was never proposed is reported as UNSEEN.
                        string goldTag;
                        if (goldTagId < 0)
                        {
                            goldTag = "UNSEEN";
                        }
                        else
                        {
                            goldTag = lexicon.tagIndex.Get(goldTagId);
                        }
                        errors.IncrementCount(goldTag);
                    }
                }
                else
                {
                    System.Console.Error.Printf("NO TAGGINGS: %s %s%n", tuningEvent.Word(), tuningEvent.FeatureStr());
                }
                log.Info();
            }

            // Output accuracy
            double acc = numCorrect / (double)tuningSet.Count;
            System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
            log.Info("% of errors by type:");

            // Sort the keys on the raw counts first, then normalize for printing.
            IList <string> biggestKeys = new List <string>(errors.KeySet());
            biggestKeys.Sort(Counters.ToComparator(errors, false, true));
            Counters.Normalize(errors);
            foreach (string errorTag in biggestKeys)
            {
                System.Console.Error.Printf("%s\t%.2f%n", errorTag, errors.GetCount(errorTag) * 100.0);
            }
        }

コード例 #12

0

ファイルを表示

ファイル: FactoredLexicon.cs プロジェクト: zerouid/Stanford.CoreNLP.NET

        /// <summary>
        /// Trains the factored lexicon from a treebank.
        /// This method should populate wordIndex, tagIndex, and morphIndex.
        /// </summary>
        /// <param name="trees">
        /// Annotated trees. Their preterminal yield supplies the tags, and, when
        /// <paramref name="rawTrees"/> is null, their yield supplies the words as well.
        /// </param>
        /// <param name="rawTrees">
        /// Optional trees enumerated in step with <paramref name="trees"/>. When non-null,
        /// the yield of the matching raw tree supplies the words and morphological analyses.
        /// </param>
        /// <exception cref="System.InvalidOperationException">
        /// <paramref name="rawTrees"/> is non-null but runs out before <paramref name="trees"/> does.
        /// </exception>
        public override void Train(ICollection <Tree> trees, ICollection <Tree> rawTrees)
        {
            double weight = 1.0;

            // Train uw model on words
            uwModelTrainer.Train(trees, weight);
            double numTrees = trees.Count;
            // Train factored lexicon on lemmas and morph tags
            int treeId = 0;

            // A null resource is allowed in a using statement, so this also covers rawTrees == null.
            using (IEnumerator <Tree> rawTreesItr = rawTrees == null ? null : rawTrees.GetEnumerator())
            {
                foreach (Tree tree in trees)
                {
                    // CoreLabels, with morph analysis in the originalText annotation
                    IList <ILabel> wordYield;
                    if (rawTreesItr == null)
                    {
                        wordYield = tree.Yield();
                    }
                    else
                    {
                        // An enumerator starts positioned before its first element, so it must
                        // be advanced before Current is read.
                        if (!rawTreesItr.MoveNext())
                        {
                            throw new System.InvalidOperationException("rawTrees contains fewer trees than trees.");
                        }
                        wordYield = rawTreesItr.Current.Yield();
                    }

                    // Annotated, binarized tree for the tags (labels are usually CategoryWordTag)
                    IList <ILabel> pretermYield = tree.PreTerminalYield();
                    int            yieldLen     = wordYield.Count;
                    for (int i = 0; i < yieldLen; ++i)
                    {
                        string word   = wordYield[i].Value();
                        int    wordId = wordIndex.AddToIndex(word);
                        string tag    = pretermYield[i].Value();
                        int    tagId  = tagIndex.AddToIndex(tag);
                        // Use the word as backup if there is no lemma
                        string featureStr = ((CoreLabel)wordYield[i]).OriginalText();
                        Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, featureStr);
                        string lemma           = lemmaMorph.First();
                        int    lemmaId         = wordIndex.AddToIndex(lemma);
                        string richMorphTag    = lemmaMorph.Second();
                        string reducedMorphTag = morphoSpec.StrToFeatures(richMorphTag).ToString().Trim();
                        // An empty reduced tag is recorded under the NoMorphAnalysis placeholder.
                        reducedMorphTag = reducedMorphTag.Length == 0 ? NoMorphAnalysis : reducedMorphTag;
                        int morphId = morphIndex.AddToIndex(reducedMorphTag);
                        // Seen event counts
                        wordTag.IncrementCount(wordId, tagId);
                        lemmaTag.IncrementCount(lemmaId, tagId);
                        morphTag.IncrementCount(morphId, tagId);
                        tagCounter.IncrementCount(tagId);
                        // Unseen event counts: only collected once the configured fraction of
                        // the trees has been processed.
                        if (treeId > op.trainOptions.fractionBeforeUnseenCounting * numTrees)
                        {
                            if (!wordTag.FirstKeySet().Contains(wordId) || wordTag.GetCounter(wordId).TotalCount() < 2)
                            {
                                wordTagUnseen.IncrementCount(tagId);
                            }
                            if (!lemmaTag.FirstKeySet().Contains(lemmaId) || lemmaTag.GetCounter(lemmaId).TotalCount() < 2)
                            {
                                lemmaTagUnseen.IncrementCount(tagId);
                            }
                            if (!morphTag.FirstKeySet().Contains(morphId) || morphTag.GetCounter(morphId).TotalCount() < 2)
                            {
                                morphTagUnseen.IncrementCount(tagId);
                            }
                        }
                    }
                    ++treeId;
                    // Progress output when debugging: a marker every 100 trees, a log line every 10000.
                    if (Debug && (treeId % 100) == 0)
                    {
                        System.Console.Error.Printf("[%d]", treeId);
                    }
                    if (Debug && (treeId % 10000) == 0)
                    {
                        log.Info();
                    }
                }
            }
        }

コード例 #13

0

ファイルを表示

ファイル: ArabicTreeNormalizer.cs プロジェクト: awesomedotnetcore/Stanford.CoreNLP.NET

 /// <summary>
 /// Normalizes a whole tree. The tree is pruned with the empty filter and spliced with the
 /// A-over-A filter; morphological analyses are moved out of leaf values into the
 /// CoreLabel original text; preterminal values are copied into tags; leaves attached
 /// directly to phrasal nodes get a DUMMYTAG preterminal; the optional PRD-verb and
 /// NP-subject rewrites are applied; and the result is wrapped under the root label.
 /// </summary>
 /// <param name="tree">The tree to normalize.</param>
 /// <param name="tf">Factory used for pruning, splicing and creating new nodes.</param>
 /// <returns>
 /// The normalized tree, or null when stripping empty-valued enclosing nodes leaves nothing.
 /// </returns>
 public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
 {
     tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);
     foreach (Tree t in tree)
     {
         if (t.IsLeaf())
         {
             //Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
             //specified by HasContext.
             if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
             {
                 string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                 if (toks.Length != 2)
                 {
                     // .NET composite formatting needs {n} placeholders; Java-style %s is printed literally.
                     log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", this.GetType().FullName, t.Value()));
                 }
                 else
                 {
                     if (t.Label() is CoreLabel)
                     {
                         CoreLabel cl = (CoreLabel)t.Label();
                         cl.SetValue(string.Intern(toks[0].Trim()));
                         cl.SetWord(string.Intern(toks[0].Trim()));
                         Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                         string lemma         = lemmaMorph.First();
                         string morphAnalysis = lemmaMorph.Second();
                         if (lemma.Equals(toks[0]))
                         {
                             // The lemma is just the word itself: keep the analysis as written.
                             cl.SetOriginalText(string.Intern(toks[1].Trim()));
                         }
                         else
                         {
                             // TODO(spenceg): Does this help?
                             string newLemma = lexMapper.Map(null, lemma);
                             if (newLemma == null || newLemma.Trim().IsEmpty())
                             {
                                 newLemma = lemma;
                             }
                             string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                             cl.SetOriginalText(string.Intern(newMorphAnalysis));
                         }
                     }
                     else
                     {
                         log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", this.GetType().FullName, t.Label().GetType().FullName));
                     }
                 }
             }
         }
         else
         {
             if (t.IsPreTerminal())
             {
                 if (t.Value() == null || t.Value().IsEmpty())
                 {
                     log.Warn(string.Format("{0}: missing tag for {1}", this.GetType().FullName, t.PennString()));
                 }
                 else
                 {
                     if (t.Label() is IHasTag)
                     {
                         ((IHasTag)t.Label()).SetTag(t.Value());
                     }
                 }
             }
             else
             {
                 //Phrasal nodes
                 // there are some nodes "/" missing preterminals.  We'll splice in a tag for these.
                 int          nk      = t.NumChildren();
                 IList <Tree> newKids = new List <Tree>(nk);
                 for (int j = 0; j < nk; j++)
                 {
                     Tree child = t.GetChild(j);
                     if (child.IsLeaf())
                     {
                         log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", this.GetType().FullName, t.ToString()));
                         newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                     }
                     else
                     {
                         newKids.Add(child);
                     }
                 }
                 t.SetChildren(newKids);
             }
         }
     }
     //Every node in the tree has now been processed
     //
     // Additional processing for specific phrasal annotations
     //
     // special global coding for moving PRD annotation from constituent to verb tag.
     if (markPRDverb)
     {
         TregexMatcher m     = prdVerbPattern.Matcher(tree);
         Tree          match = null;
         while (m.Find())
         {
             // Skip repeated hits on the same node so the suffix is appended only once.
             if (m.GetMatch() != match)
             {
                 match = m.GetMatch();
                 match.Label().SetValue(match.Label().Value() + "-PRDverb");
                 Tree prd = m.GetNode("prd");
                 prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
             }
         }
     }
     //Mark *only* subjects in verb-initial clauses
     if (retainNPSbj)
     {
         TregexMatcher m = npSbjPattern.Matcher(tree);
         while (m.Find())
         {
             Tree match = m.GetMatch();
             match.Label().SetValue("NP");
         }
     }
     if (tree.IsPreTerminal())
     {
         // The whole tree is a bare tag: bad!
         string val = tree.Label().Value();
         if (val.Equals("CC") || val.StartsWith("PUNC") || val.Equals("CONJ"))
         {
             log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", this.GetType().FullName, tree.PennString()));
             tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
         }
         else
         {
             log.Warn(string.Format("{0}: Bare tagged word {1}", this.GetType().FullName, tree.PennString()));
         }
     }
     //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
     //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
     //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
     while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
     {
         tree = tree.FirstChild();
     }
     // The loop above can stop at a node whose value is still null (one with several
     // children), so compare with the static, null-safe string.Equals.
     if (tree != null && !string.Equals(tree.Value(), rootLabel))
     {
         tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
     }
     return(tree);
 }

C# (CSharp) MorphoFeatureSpecificationの例