/// <summary>Formats the value of <c>GetFMeasure(label)</c> for display.</summary>
/// <param name="numDigits">Maximum number of fraction digits in the result.</param>
/// <param name="label">Label whose F-measure is formatted.</param>
/// <returns>The formatted F-measure.</returns>
public virtual string GetF1Description(int numDigits, L label)
        {
            NumberFormat formatter = NumberFormat.GetNumberInstance();
            formatter.SetMaximumFractionDigits(numDigits);

            var fMeasure = GetFMeasure(label);
            return formatter.Format(fMeasure);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a multi-line accuracy report. When <c>saveFile</c> is set, this also writes
        /// the accuracy tables to two files and advances <c>saveIndex</c>.
        /// </summary>
        /// <param name="numDigits">Maximum number of fraction digits for the formatted scores.</param>
        /// <returns>The report text, one statistic per line.</returns>
        public virtual string GetDescription(int numDigits)
        {
            NumberFormat formatter = NumberFormat.GetNumberInstance();
            formatter.SetMaximumFractionDigits(numDigits);

            StringBuilder report = new StringBuilder();

            report.Append("--- Accuracy Stats ---");
            report.Append('\n');

            report.Append("accuracy: ");
            report.Append(formatter.Format(accuracy));
            report.Append('\n');

            report.Append("optimal fn accuracy: ");
            report.Append(formatter.Format(optAccuracy));
            report.Append('\n');

            report.Append("confidence weighted accuracy :");
            report.Append(formatter.Format(confWeightedAccuracy));
            report.Append('\n');

            report.Append("optimal confidence weighted accuracy: ");
            report.Append(formatter.Format(optConfWeightedAccuracy));
            report.Append('\n');

            // The log-likelihood is appended as-is; it does not go through the formatter.
            report.Append("log-likelihood: ");
            report.Append(logLikelihood);
            report.Append('\n');

            if (saveFile == null)
            {
                return report.ToString();
            }

            // Each call gets its own file stem: "<saveFile>-<saveIndex>".
            string stem = saveFile + '-' + saveIndex;

            report.Append("saving accuracy info to ");
            report.Append(stem);
            report.Append(".accuracy\n");
            StringUtils.PrintToFile(stem + ".accuracy", ToStringArr(accrecall));

            report.Append("saving optimal accuracy info to ");
            report.Append(stem);
            report.Append(".optimal_accuracy\n");
            StringUtils.PrintToFile(stem + ".optimal_accuracy", ToStringArr(optaccrecall));

            saveIndex++;

            return report.ToString();
        }
Ejemplo n.º 3
0
        /// <summary>Returns a String summarizing recall that will print nicely.</summary>
        /// <param name="numDigits">Maximum number of fraction digits for the recall value.</param>
        /// <returns>The formatted recall followed by "(tpCount/(tpCount + fnCount))".</returns>
        public virtual string GetRecallDescription(int numDigits)
        {
            NumberFormat formatter = NumberFormat.GetNumberInstance();
            formatter.SetMaximumFractionDigits(numDigits);

            string recallText = formatter.Format(GetRecall());
            var denominator = tpCount + fnCount;

            return recallText + "  (" + tpCount + "/" + denominator + ")";
        }
        /// <summary>Returns a printable recall summary for a single label.</summary>
        /// <param name="numDigits">Maximum number of fraction digits for the recall value.</param>
        /// <param name="label">Label passed to <c>GetRecallInfo</c>.</param>
        /// <returns>
        /// The formatted first element of the recall triple followed by
        /// "(second/(second + third))".
        /// </returns>
        public virtual string GetRecallDescription(int numDigits, L label)
        {
            NumberFormat formatter = NumberFormat.GetNumberInstance();
            formatter.SetMaximumFractionDigits(numDigits);

            Triple<double, int, int> info = GetRecallInfo(label);
            string valueText   = formatter.Format(info.First());
            int    numerator   = info.Second();
            int    denominator = numerator + info.Third();

            return valueText + "  (" + numerator + "/" + denominator + ")";
        }
        /// <summary>Returns a printable precision summary for a single label.</summary>
        /// <param name="numDigits">Maximum number of fraction digits for the precision value.</param>
        /// <param name="label">Label passed to <c>GetPrecisionInfo</c>.</param>
        /// <returns>
        /// The formatted first element of the precision triple followed by
        /// "(second/(second + third))".
        /// </returns>
        public virtual string GetPrecisionDescription(int numDigits, L label)
        {
            NumberFormat formatter = NumberFormat.GetNumberInstance();
            formatter.SetMaximumFractionDigits(numDigits);

            Triple<double, int, int> info = GetPrecisionInfo(label);
            string valueText   = formatter.Format(info.First());
            int    numerator   = info.Second();
            int    denominator = numerator + info.Third();

            return valueText + "  (" + numerator + "/" + denominator + ")";
        }
Ejemplo n.º 6
0
        /// <summary>Returns a String summarizing overall accuracy that will print nicely.</summary>
        /// <param name="numDigits">Maximum number of fraction digits for the accuracy value.</param>
        /// <returns>
        /// The formatted first element of the accuracy triple followed by
        /// "(second/(second + third))".
        /// </returns>
        public virtual string GetAccuracyDescription(int numDigits)
        {
            NumberFormat formatter = NumberFormat.GetNumberInstance();
            formatter.SetMaximumFractionDigits(numDigits);

            Triple<double, int, int> info = GetAccuracyInfo();
            string valueText   = formatter.Format(info.First());
            int    numerator   = info.Second();
            int    denominator = numerator + info.Third();

            return valueText + "  (" + numerator + "/" + denominator + ")";
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Creates the <c>NumberFormat</c> that corresponds to the format type, digits, case,
        /// pattern and format provider carried by <paramref name="type"/>.
        /// </summary>
        /// <param name="type">Description of the requested format.</param>
        /// <returns>
        /// A hex, scientific, decimal or currency format when the format type asks for one;
        /// otherwise the locale's number instance, with the entry's pattern applied when that
        /// instance is a <c>DecimalFormat</c>.
        /// </returns>
        private static NumberFormat CreateFormat(Entry type)
        {
            // Resolved before any branch is taken, including the hex branch that does not use it.
            var locale = type.FormatProvider.ToLocale();
            var kind   = type.FormatType;

            if (kind == FormatHex)
            {
                return new HexFormat(type.Digits, type.IsUpperCase);
            }

            if (kind == FormatScientific)
            {
                // Without an explicit pattern, derive one from the digit count and pick the
                // exponent marker's case from the entry.
                string scientificPattern = type.Pattern
                                           ?? (GetPatternByDigits(type.Digits) + (type.IsUpperCase ? "E0" : "e0"));
                return new ScientificFormat(scientificPattern, locale);
            }

            if (kind == FormatDecimal)
            {
                DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
                symbols.Infinity = "Infinity";

                string decimalPattern = type.Pattern;
                if (decimalPattern == null)
                {
                    decimalPattern = GetPatternByDigits(type.Digits);
                }
                return new DecimalFormat(decimalPattern, symbols);
            }

            if (kind == FormatCurrency)
            {
                return NumberFormat.GetCurrencyInstance(locale);
            }

            // Fallback: the locale's generic number format, optionally customised by the pattern.
            var fallback = NumberFormat.GetNumberInstance(locale);

            DecimalFormat patternable = (type.Pattern != null) ? fallback as DecimalFormat : null;
            if (patternable != null)
            {
                patternable.ApplyLocalizedPattern(type.Pattern);
            }

            return fallback;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Initialises the number, decimal, currency and percent formats for
        /// <paramref name="locale"/>.
        /// </summary>
        /// <param name="locale">Locale the formats and symbols are created for.</param>
        internal NumberFormatInfo(Locale locale)
        {
            _locale  = locale;
            _numbers = NumberFormat.GetNumberInstance(locale);

            DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
            symbols.Infinity = "Infinity";
            _symbols = symbols;

            // Use the locale's number instance when it is a DecimalFormat; otherwise start
            // from a default DecimalFormat. Either way it receives the symbols built above.
            DecimalFormat decimals = _numbers as DecimalFormat;
            if (decimals == null)
            {
                decimals = new DecimalFormat();
            }
            decimals.DecimalFormatSymbols = _symbols;
            _decimals = decimals;

            // Currency and percent fall back to the decimal format when the locale's instance
            // is not a DecimalFormat.
            DecimalFormat currency = NumberFormat.GetCurrencyInstance(locale) as DecimalFormat;
            _currency = (currency != null) ? currency : _decimals;

            DecimalFormat percent = NumberFormat.GetPercentInstance(locale) as DecimalFormat;
            _percent = (percent != null) ? percent : _decimals;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Works out whether the model expectations match the empirical
        /// expectations.
        /// </summary>
        /// <remarks>
        /// Only a constraint whose empirical and expected values differ by more than 0.001
        /// makes the result false. Oversized lambdas and conditional distributions that do
        /// not sum to one are logged but do not affect the return value.
        /// </remarks>
        /// <returns>Whether the model is correct</returns>
        public override bool CheckCorrectness()
        {
            log.Info("Checking model correctness; x size " + p.data.xSize + ' ' + ", ysize " + p.data.ySize);

            NumberFormat fmt = NumberFormat.GetNumberInstance();
            fmt.SetMaximumFractionDigits(4);

            // Diagnostic pass: report weights whose magnitude exceeds 100.
            for (int k = 0; k < lambda.Length; k++)
            {
                double weight = lambda[k];
                if (Math.Abs(weight) > 100)
                {
                    log.Info(" Lambda too big " + lambda[k]);
                    log.Info(" empirical " + ftildeArr[k] + " expected " + FExpected(p.functions.Get(k)));
                }
            }

            // Constraint pass: the only pass that can make the result false.
            bool allSatisfied = true;
            for (int c = 0; c < ftildeArr.Length; c++)
            {
                double gap = Math.Abs(ftildeArr[c] - FExpected(p.functions.Get(c)));
                if (gap > 0.001)
                {
                    allSatisfied = false;

                    string empirical = fmt.Format(ftildeArr[c]);
                    string expected  = fmt.Format(FExpected(p.functions.Get(c)));
                    string diff      = fmt.Format(gap);
                    string weight    = fmt.Format(lambda[c]);
                    log.Info("Constraint " + c + " not satisfied emp " + empirical + " exp " + expected + " diff " + diff + " lambda " + weight);
                }
            }

            // Normalisation pass: log every row of probConds that does not sum to one.
            for (int row = 0; row < p.data.xSize; row++)
            {
                double total = 0.0;
                for (int col = 0; col < p.data.ySize; col++)
                {
                    total += probConds[row][col];
                }

                if (Math.Abs(total - 1) > 0.0001)
                {
                    for (int col = 0; col < p.data.ySize; col++)
                    {
                        log.Info(col + " : " + probConds[row][col]);
                    }
                    log.Info("probabilities do not sum to one " + row + ' ' + (float)total);
                }
            }

            return allSatisfied;
        }
        /// <summary>
        /// Returns a short summary: the unqualified type name followed by the tag-bin and
        /// word-token counts.
        /// </summary>
        public override string ToString()
        {
            // Only the disabled per-dependency dump below uses this formatter.
            NumberFormat nf = NumberFormat.GetNumberInstance();
            nf.SetMaximumFractionDigits(2);

            // Keep only the part of the full type name after the last '.'.
            string fullName  = GetType().FullName;
            int    nameStart = fullName.LastIndexOf('.') + 1;
            string shortName = Sharpen.Runtime.Substring(fullName, nameStart);

            StringBuilder text = new StringBuilder(2000);
            text.Append(shortName);
            text.Append("[tagbins=");
            text.Append(numTagBins);
            text.Append(",wordTokens=");
            text.Append(numWordTokens);
            text.Append("; head -> arg\n");

            // Disabled dump of every core dependency and its count (original Java):
            //    for (Iterator dI = coreDependencies.keySet().iterator(); dI.hasNext();) {
            //      IntDependency d = (IntDependency) dI.next();
            //      double count = coreDependencies.getCount(d);
            //      sb.append(d + " count " + nf.format(count));
            //      if (dI.hasNext()) {
            //        sb.append(",");
            //      }
            //      sb.append("\n");
            //    }

            text.Append("]");
            return text.ToString();
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Prints KL-divergence statistics for parent and grandparent annotation, then emits
        /// generated Java source listing the splits whose score clears each cutoff.
        /// </summary>
        /// <remarks>
        /// For every node, each parent-annotated rule set whose first element equals the node
        /// is scored as KL(annotated || node) * support(annotated). The same is then done for
        /// grandparent-annotated sets against their parent-annotated set, skipping
        /// parent-annotated sets whose support is below <c>Suppcutoff</c>. Everything is
        /// written to standard output.
        /// </remarks>
        public virtual void PrintStats()
        {
            NumberFormat nf = NumberFormat.GetNumberInstance();
            nf.SetMaximumFractionDigits(2);

            // One buffer of generated Java source per cutoff level.
            StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
            for (int i = 0; i < Cutoffs.Length; i++)
            {
                javaSB[i] = new StringBuilder("  private static String[] splitters" + (i + 1) + " = new String[] {");
            }
            ClassicCounter<IList<string>> allScores = new ClassicCounter<IList<string>>();

            // do value of parent
            foreach (string node in nodeRules.Keys)
            {
                List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();
                ClassicCounter<IList<string>> cntr = nodeRules[node];
                double support = cntr.TotalCount();
                System.Console.Out.WriteLine("Node " + node + " support is " + support);
                foreach (IList<string> key in pRules.Keys)
                {
                    // only do it if they match
                    if (!key[0].Equals(node))
                    {
                        continue;
                    }
                    ClassicCounter<IList<string>> cntr2 = pRules[key];
                    double support2 = cntr2.TotalCount();
                    double kl       = Counters.KlDivergence(cntr2, cntr);
                    System.Console.Out.WriteLine("KL(" + key + "||" + node + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                    double score = kl * support2;
                    answers.Add(new Pair<IList<string>, double>(key, score));
                    allScores.SetCount(key, score);
                }
                System.Console.Out.WriteLine("----");
                System.Console.Out.WriteLine("Sorted descending support * KL");
                // Highest score first, as the header above promises.
                answers.Sort((a, b) => b.Second().CompareTo(a.Second()));
                foreach (Pair<IList<string>, double> answer in answers)
                {
                    IList<string> rule = answer.First();
                    double        psd  = answer.Second();
                    System.Console.Out.WriteLine(rule + ": " + nf.Format(psd));
                    // Only the first cutoff is checked before the per-cutoff loop.
                    if (psd >= Cutoffs[0])
                    {
                        string nd  = rule[0];
                        string par = rule[1];
                        for (int j = 0; j < Cutoffs.Length; j++)
                        {
                            if (psd >= Cutoffs[j])
                            {
                                javaSB[j].Append("\"").Append(nd).Append("^");
                                javaSB[j].Append(par).Append("\", ");
                            }
                        }
                    }
                }
                System.Console.Out.WriteLine();
            }

            // NOTE: the Java original carried a commented-out, unfinished variant of the
            // parent pass here that was meant to use information gain
            // ("do value of parent with info gain -- yet to finish this").

            // do value of grandparent
            foreach (IList<string> node_1 in pRules.Keys)
            {
                List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();
                ClassicCounter<IList<string>> cntr = pRules[node_1];
                double support = cntr.TotalCount();
                if (support < Suppcutoff)
                {
                    continue;
                }
                System.Console.Out.WriteLine("Node " + node_1 + " support is " + support);
                foreach (IList<string> key in gPRules.Keys)
                {
                    // only do it if node and parent both match
                    if (!(key[0].Equals(node_1[0]) && key[1].Equals(node_1[1])))
                    {
                        continue;
                    }
                    ClassicCounter<IList<string>> cntr2 = gPRules[key];
                    double support2 = cntr2.TotalCount();
                    double kl       = Counters.KlDivergence(cntr2, cntr);
                    System.Console.Out.WriteLine("KL(" + key + "||" + node_1 + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                    double score = kl * support2;
                    answers.Add(new Pair<IList<string>, double>(key, score));
                    allScores.SetCount(key, score);
                }
                System.Console.Out.WriteLine("----");
                System.Console.Out.WriteLine("Sorted descending support * KL");
                // Highest score first, as the header above promises.
                answers.Sort((a, b) => b.Second().CompareTo(a.Second()));
                foreach (Pair<IList<string>, double> answer in answers)
                {
                    IList<string> rule = answer.First();
                    double        psd  = answer.Second();
                    System.Console.Out.WriteLine(rule + ": " + nf.Format(psd));
                    if (psd >= Cutoffs[0])
                    {
                        string nd   = rule[0];
                        string par  = rule[1];
                        string gpar = rule[2];
                        for (int j = 0; j < Cutoffs.Length; j++)
                        {
                            if (psd >= Cutoffs[j])
                            {
                                javaSB[j].Append("\"").Append(nd).Append("^");
                                javaSB[j].Append(par).Append("~");
                                javaSB[j].Append(gpar).Append("\", ");
                            }
                        }
                    }
                }
                System.Console.Out.WriteLine();
            }

            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("All scores:");
            IPriorityQueue<IList<string>> pq = Counters.ToPriorityQueue(allScores);

            while (!pq.IsEmpty())
            {
                IList<string> key   = pq.GetFirst();
                double        score = pq.GetPriority(key);
                pq.RemoveFirst();
                System.Console.Out.WriteLine(key + "\t" + score);
            }

            System.Console.Out.WriteLine("  // Automatically generated by ParentAnnotationStats -- preferably don't edit");
            for (int i_1 = 0; i_1 < Cutoffs.Length; i_1++)
            {
                StringBuilder code = javaSB[i_1];
                int           len  = code.Length;
                // Drop the ", " left by the last entry before closing the initializer. With
                // no entries there is nothing to drop and the opening brace must survive.
                if (len >= 2 && code[len - 2] == ',' && code[len - 1] == ' ')
                {
                    code.Length = len - 2;
                }
                code.Append("};");
                System.Console.Out.WriteLine(code.ToString());
            }

            System.Console.Out.Write("  public static HashSet splitters = new HashSet(Arrays.asList(");
            for (int i_2 = Cutoffs.Length; i_2 > 0; i_2--)
            {
                if (i_2 == 1)
                {
                    System.Console.Out.Write("splitters1");
                }
                else
                {
                    System.Console.Out.Write("selectiveSplit" + i_2 + " ? splitters" + i_2 + " : (");
                }
            }
            // need to print extra one to close other things open
            for (int i_3 = Cutoffs.Length; i_3 >= 0; i_3--)
            {
                System.Console.Out.Write(")");
            }
            System.Console.Out.WriteLine(";");
        }
Ejemplo n.º 12
0
        /// <summary>Print some statistics about this lexicon.</summary>
        /// <remarks>
        /// Writes to standard output: the overall sizes, a histogram of how many taggings
        /// each word has (capped at <c>StatsBins - 1</c>), the unseen counter, and, for
        /// lexicons with fewer than 50 words and fewer than 10 tags, a table of scores for
        /// every word/tag pair.
        /// </remarks>
        public virtual void PrintLexStats()
        {
            System.Console.Out.WriteLine("BaseLexicon statistics");
            System.Console.Out.WriteLine("unknownLevel is " + GetUnknownWordModel().GetUnknownLevel());
            // System.out.println("Rules size: " + rules.size());
            System.Console.Out.WriteLine("Sum of rulesWithWord: " + NumRules());
            System.Console.Out.WriteLine("Tags size: " + tags.Count);
            int wsize = words.Count;

            System.Console.Out.WriteLine("Words size: " + wsize);
            // System.out.println("Unseen Sigs size: " + sigs.size() +
            // " [number of unknown equivalence classes]");
            System.Console.Out.WriteLine("rulesWithWord length: " + rulesWithWord.Length + " [should be sum of words + unknown sigs]");

            // lengths[k] counts the words with k taggings; wArr[k] remembers which words.
            int[]          lengths = new int[StatsBins];
            List<string>[] wArr    = new List<string>[StatsBins];
            for (int j = 0; j < StatsBins; j++)
            {
                wArr[j] = new List<string>();
            }
            for (int i = 0; i < rulesWithWord.Length; i++)
            {
                // Everything at or beyond the last bin is lumped into the last bin.
                int num = rulesWithWord[i].Count;
                if (num > StatsBins - 1)
                {
                    num = StatsBins - 1;
                }
                lengths[num]++;
                // Words are only remembered for small lexicons or the upper half of the bins.
                if (wsize <= 20 || num >= StatsBins / 2)
                {
                    wArr[num].Add(wordIndex.Get(i));
                }
            }
            System.Console.Out.WriteLine("Stats on how many taggings for how many words");
            for (int j_1 = 0; j_1 < StatsBins; j_1++)
            {
                System.Console.Out.Write(j_1 + " taggings: " + lengths[j_1] + " words ");
                if (wsize <= 20 || j_1 >= StatsBins / 2)
                {
                    // Print the words themselves; writing the List directly would only
                    // print its type name.
                    System.Console.Out.Write("[" + string.Join(", ", wArr[j_1]) + "]");
                }
                System.Console.Out.WriteLine();
            }
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(0);
            System.Console.Out.WriteLine("Unseen counter: " + Counters.ToString(uwModel.UnSeenCounter(), nf));
            if (wsize < 50 && tags.Count < 10)
            {
                nf.SetMaximumFractionDigits(3);
                StringWriter sw = new StringWriter();
                PrintWriter  pw = new PrintWriter(sw);
                pw.Println("Tagging probabilities log P(word|tag)");
                // Header row: one column per tag.
                for (int t = 0; t < tags.Count; t++)
                {
                    pw.Print('\t');
                    pw.Print(tagIndex.Get(t));
                }
                pw.Println();
                // One row per word, tab-separated, ending the line after the last tag.
                for (int w = 0; w < wsize; w++)
                {
                    pw.Print(wordIndex.Get(w));
                    pw.Print('\t');
                    for (int t_1 = 0; t_1 < tags.Count; t_1++)
                    {
                        IntTaggedWord iTW = new IntTaggedWord(w, t_1);
                        pw.Print(nf.Format(Score(iTW, 1, wordIndex.Get(w), null)));
                        if (t_1 == tags.Count - 1)
                        {
                            pw.Println();
                        }
                        else
                        {
                            pw.Print('\t');
                        }
                    }
                }
                pw.Close();
                System.Console.Out.WriteLine(sw.ToString());
            }
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Provides some testing and opportunities for exploration of the
        /// probabilities of a BaseLexicon.
        /// </summary>
        /// <remarks>
        /// What's here currently probably only works for the English Penn Treebank, as it
        /// uses default constructors. Of the words given to test on,
        /// the first is treated as sentence initial, and the rest as not
        /// sentence initial.
        /// </remarks>
        /// <param name="args">
        /// The command line arguments:
        /// java BaseLexicon treebankPath fileRange unknownWordModel words
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length < 3)
            {
                log.Info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
                return;
            }
            System.Console.Out.Write("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
            Treebank tb = new DiskTreebank();

            tb.LoadPath(args[0], new NumberRangesFileFilter(args[1], true));
            // TODO: change this interface so the lexicon creates its own indices?
            IIndex<string> wordIndex = new HashIndex<string>();
            IIndex<string> tagIndex  = new HashIndex<string>();
            Options        op        = new Options();

            op.lexOptions.useUnknownWordSignatures = System.Convert.ToInt32(args[2]);
            Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon(op, wordIndex, tagIndex);
            lex.InitializeTraining(tb.Count);
            lex.Train(tb);
            lex.FinishTraining();
            System.Console.Out.WriteLine("done.");
            System.Console.Out.WriteLine();
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(4);
            List<string> impos = new List<string>();

            // Words start at args[3]; i - 3 is the word's position in the test "sentence".
            for (int i = 3; i < args.Length; i++)
            {
                if (lex.IsKnown(args[i]))
                {
                    System.Console.Out.WriteLine(args[i] + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
                    using (IEnumerator<IntTaggedWord> it = lex.RuleIteratorByWord(wordIndex.AddToIndex(args[i]), i - 3, null))
                    {
                        while (it.MoveNext())
                        {
                            IntTaggedWord iTW = it.Current;
                            System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(lex.Score(iTW, i - 3, wordIndex.Get(iTW.word), null)));
                        }
                    }
                }
                else
                {
                    string sig = lex.GetUnknownWordModel().GetSignature(args[i], i - 3);
                    System.Console.Out.WriteLine(args[i] + " is an unknown word.  Signature with uwm " + lex.GetUnknownWordModel().GetUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);
                    impos.Clear();
                    // Ordinal order matches the natural String ordering of the Java original
                    // and does not vary with the current culture.
                    List<string> lis = new List<string>(tagIndex.ObjectsList());
                    lis.Sort(string.CompareOrdinal);
                    foreach (string tStr in lis)
                    {
                        IntTaggedWord iTW   = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
                        double        score = lex.Score(iTW, 1, args[i], null);
                        // Tags scored as negative infinity are collected and reported together below.
                        if (double.IsNegativeInfinity(score))
                        {
                            impos.Add(tStr);
                        }
                        else
                        {
                            System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(score));
                        }
                    }
                    if (impos.Count > 0)
                    {
                        // Print the tags themselves; concatenating the List would only
                        // print its type name.
                        System.Console.Out.WriteLine(args[i] + " impossible tags: [" + string.Join(", ", impos) + "]");
                    }
                }
                System.Console.Out.WriteLine();
            }
        }
        /// <summary>Renders this graph as Graphviz DOT source.</summary>
        /// <remarks>
        /// Each node is emitted with its label, followed by every arc leaving it. An arc's
        /// label is "input : output". A numeric output is also turned into a <c>weight</c>
        /// attribute (inverted as 20 - value when <c>dotWeightInverted</c> is set) and
        /// optionally a bold style. The page size is a heuristic based on the arc count.
        /// </remarks>
        /// <returns>The DOT text of the graph.</returns>
        public virtual string AsDOTString()
        {
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(3);
            nf.SetMinimumFractionDigits(1);
            StringBuilder result = new StringBuilder();
            ISet          nodes  = GetNodes();

            result.Append("digraph G {\n");
            //    result.append("page = \"8.5,11\";\n");
            //    result.append("margin = \"0.25\";\n");
            // Heuristic number of pages: add height each time the arc count doubles past 250,
            // and width each time it quadruples past 500.
            int sz  = arcs.Count;
            int ht  = 105;
            int mag = 250;

            while (sz > mag)
            {
                ht  += 105;
                mag *= 2;
            }
            int wd = 8;

            mag = 500;
            while (sz > mag)
            {
                wd  += 8;
                mag *= 4;
            }
            double htd = ht / 10.0;

            // The height must use '.' as its decimal separator in every culture; a decimal
            // comma would read as an extra field in the "width,height" size value.
            string htdText = htd.ToString(System.Globalization.CultureInfo.InvariantCulture);

            result.Append("size = \"" + wd + "," + htdText + "\";\n");
            result.Append("graph [rankdir = \"LR\"];\n");
            result.Append("graph [ranksep = \"0.2\"];\n");
            foreach (object node in nodes)
            {
                string cleanString = StringUtils.FileNameClean(node.ToString());
                result.Append(cleanString);
                result.Append(" [ ");
                //      if (getEndNodes().contains(node)) {
                //        result.append("label=\"" + node.toString() + "\", style=filled, ");
                //      } else
                result.Append("label=\"" + node.ToString() + "\"");
                // Separate the label from the size attributes.
                result.Append(", height=\"0.3\", width=\"0.3\"");
                result.Append(" ];\n");
                foreach (TransducerGraph.Arc arc in GetArcsBySource(node))
                {
                    result.Append(StringUtils.FileNameClean(arc.GetSourceNode().ToString()));
                    result.Append(" -> ");
                    result.Append(StringUtils.FileNameClean(arc.GetTargetNode().ToString()));
                    result.Append(" [ ");
                    result.Append("label=\"");
                    result.Append(arc.GetInput());
                    result.Append(" : ");
                    // result.append(arc.getOutput());
                    object output = arc.GetOutput();
                    string wt     = string.Empty;
                    if (output is Number)
                    {
                        double dd = ((Number)output);
                        // Zero of either sign is printed as plain 0.0.
                        if (dd == -0.0d)
                        {
                            result.Append(nf.Format(0.0d));
                        }
                        else
                        {
                            result.Append(nf.Format(output));
                        }
                        int weight;
                        if (dotWeightInverted)
                        {
                            weight = (int)(20.0 - dd);
                        }
                        else
                        {
                            weight = (int)dd;
                        }
                        // Only positive weights are emitted.
                        if (weight > 0)
                        {
                            wt = ", weight = \"" + weight + "\"";
                        }
                        // NOTE(review): a second style attribute (dashed/solid) is appended
                        // below; confirm how Graphviz combines it with this bold style.
                        if (dotWeightInverted && dd <= 2.0 || (!dotWeightInverted) && dd >= 20.0)
                        {
                            wt += ", style=bold";
                        }
                    }
                    else
                    {
                        result.Append(output);
                    }
                    result.Append("\"");
                    result.Append(wt);
                    // result.append("fontsize = 14 ");
                    if (arc.GetInput().ToString().Equals("EPSILON"))
                    {
                        result.Append(", style = \"dashed\" ");
                    }
                    else
                    {
                        result.Append(", style = \"solid\" ");
                    }
                    // result.append(", weight = \"" + arc.getOutput() + "\" ");
                    result.Append("];\n");
                }
            }
            result.Append("}\n");
            return result.ToString();
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Trains a classifier by writing <paramref name="dataset"/> to a temporary file in
        /// SVM-light format, running an external learner on it and reading the resulting
        /// model file back in.
        /// </summary>
        /// <remarks>
        /// The learner command is <c>svmStructLearn</c> when the dataset has more than two
        /// classes, otherwise <c>svmPerfLearn</c> or <c>svmLightLearn</c> depending on
        /// <c>useSVMPerf</c>. When <c>useAlphaFile</c> is set, a fresh temporary alpha file is
        /// passed with <c>-a</c>, the previously stored <c>alphaFile</c> (if any) with
        /// <c>-y</c>, and the <c>alphaFile</c> field is then pointed at the new file.
        /// When <c>doEval</c> is set, the external classifier is additionally run on the
        /// training data and the scores of the trained classifier are written to a second
        /// temporary file; neither file is read back by this method.
        /// </remarks>
        /// <param name="dataset">The training data; its label and feature indices are used to map the model weights.</param>
        /// <returns>The trained classifier, with a sigmoid fitted when <c>useSigmoid</c> is set.</returns>
        /// <exception cref="Exception">Wraps any exception raised while writing files, running the external tools or reading the model.</exception>
        public virtual SVMLightClassifier <L, F> TrainClassifierBasic(GeneralDataset <L, F> dataset)
        {
            IIndex <L> labelIndex   = dataset.LabelIndex();
            IIndex <F> featureIndex = dataset.featureIndex;
            bool       multiclass   = (dataset.NumClasses() > 2);

            try
            {
                // this is the file that the model will be saved to
                File modelFile = File.CreateTempFile("svm-", ".model");
                if (deleteTempFilesOnExit)
                {
                    modelFile.DeleteOnExit();
                }
                // this is the file that the svm light formatted dataset
                // will be printed to
                File dataFile = File.CreateTempFile("svm-", ".data");
                if (deleteTempFilesOnExit)
                {
                    dataFile.DeleteOnExit();
                }
                // print the dataset; close the writer even if printing fails so the
                // temp file handle is not leaked
                PrintWriter pw = new PrintWriter(new FileWriter(dataFile));
                try
                {
                    dataset.PrintSVMLightFormat(pw);
                }
                finally
                {
                    pw.Close();
                }
                // -v 0 makes it not verbose
                // -m 400 gives it a larger cache, for faster training
                string cmd = (multiclass ? svmStructLearn : (useSVMPerf ? svmPerfLearn : svmLightLearn)) + " -v " + svmLightVerbosity + " -m 400 ";
                // set the value of C if we have one specified
                if (C > 0.0)
                {
                    // Format with the invariant culture: the value goes onto a command line,
                    // so a culture-specific decimal comma would corrupt the argument.
                    cmd = cmd + " -c " + C.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ";
                }
                else if (useSVMPerf)
                {
                    // It's required to specify this parameter for SVM perf
                    cmd = cmd + " -c 0.01 ";
                }
                // Alpha File
                if (useAlphaFile)
                {
                    File newAlphaFile = File.CreateTempFile("svm-", ".alphas");
                    if (deleteTempFilesOnExit)
                    {
                        newAlphaFile.DeleteOnExit();
                    }
                    cmd = cmd + " -a " + newAlphaFile.GetAbsolutePath();
                    if (alphaFile != null)
                    {
                        cmd = cmd + " -y " + alphaFile.GetAbsolutePath();
                    }
                    // remember the new file so the next call can hand it over with -y
                    alphaFile = newAlphaFile;
                }
                // File and Model Data
                cmd = cmd + " " + dataFile.GetAbsolutePath() + " " + modelFile.GetAbsolutePath();
                if (verbose)
                {
                    logger.Info("<< " + cmd + " >>");
                }
                // NOTE(review): the command is tokenised on whitespace, so a temp directory
                // whose path contains spaces would be split into several arguments - confirm
                // whether that can occur in the deployment environment.
                SystemUtils.Run(new ProcessBuilder(whitespacePattern.Split(cmd)), new PrintWriter(System.Console.Error), new PrintWriter(System.Console.Error));
                if (doEval)
                {
                    File predictFile = File.CreateTempFile("svm-", ".pred");
                    if (deleteTempFilesOnExit)
                    {
                        predictFile.DeleteOnExit();
                    }
                    string evalCmd = (multiclass ? svmStructClassify : (useSVMPerf ? svmPerfClassify : svmLightClassify)) + " " + dataFile.GetAbsolutePath() + " " + modelFile.GetAbsolutePath() + " " + predictFile.GetAbsolutePath();
                    if (verbose)
                    {
                        logger.Info("<< " + evalCmd + " >>");
                    }
                    SystemUtils.Run(new ProcessBuilder(whitespacePattern.Split(evalCmd)), new PrintWriter(System.Console.Error), new PrintWriter(System.Console.Error));
                }
                // read in the model file
                Pair <double, ClassicCounter <int> > weightsAndThresh = ReadModel(modelFile, multiclass);
                double threshold = weightsAndThresh.First();
                ClassicCounter <Pair <F, L> > weights    = ConvertWeights(weightsAndThresh.Second(), featureIndex, labelIndex, multiclass);
                ClassicCounter <L>            thresholds = new ClassicCounter <L>();
                if (!multiclass)
                {
                    // binary case: the single model threshold is applied with opposite signs
                    // to the labels at index 0 and index 1
                    thresholds.SetCount(labelIndex.Get(0), -threshold);
                    thresholds.SetCount(labelIndex.Get(1), threshold);
                }
                SVMLightClassifier <L, F> classifier = new SVMLightClassifier <L, F>(weights, thresholds);
                if (doEval)
                {
                    // dump the scores of the freshly built classifier for every training datum
                    File predictFile = File.CreateTempFile("svm-", ".pred2");
                    if (deleteTempFilesOnExit)
                    {
                        predictFile.DeleteOnExit();
                    }
                    PrintWriter pw2 = new PrintWriter(predictFile);
                    try
                    {
                        NumberFormat nf = NumberFormat.GetNumberInstance();
                        nf.SetMaximumFractionDigits(5);
                        foreach (IDatum <L, F> datum in dataset)
                        {
                            ICounter <L> scores = classifier.ScoresOf(datum);
                            pw2.Println(Counters.ToString(scores, nf));
                        }
                    }
                    finally
                    {
                        pw2.Close();
                    }
                }
                if (useSigmoid)
                {
                    if (verbose)
                    {
                        System.Console.Out.Write("fitting sigmoid...");
                    }
                    classifier.SetPlatt(FitSigmoid(classifier, dataset));
                    if (verbose)
                    {
                        System.Console.Out.WriteLine("done");
                    }
                }
                return(classifier);
            }
            catch (Exception e)
            {
                // keep the original failure as the inner exception so its stack trace survives
                throw new Exception(e.Message, e);
            }
        }
        /// <summary>
        /// Prints sister-annotation statistics to standard output and then emits Java source
        /// declaring one <c>sisterSplit</c> string array per entry of <c>Cutoffs</c>.
        /// </summary>
        /// <remarks>
        /// For every label in <c>nodeRules</c>, each left and right sister context is scored as
        /// the KL divergence between the context's counter and the label's own counter,
        /// multiplied by the context's total count. Scores are listed per label and then for
        /// all labels together. Finally the annotated labels are distributed over the
        /// generated arrays according to <c>Cutoffs</c>.
        /// </remarks>
        public virtual void PrintStats()
        {
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(2);
            // One buffer of generated Java code per cutoff.
            StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
            for (int i = 0; i < Cutoffs.Length; i++)
            {
                javaSB[i] = new StringBuilder("  private static String[] sisterSplit" + (i + 1) + " = new String[] {");
            }
            ArrayList topScores = new ArrayList();

            foreach (object o in nodeRules.Keys)
            {
                ArrayList      answers = new ArrayList();
                string         label   = (string)o;
                ClassicCounter cntr    = (ClassicCounter)nodeRules[label];
                double         support = cntr.TotalCount();
                System.Console.Out.WriteLine("Node " + label + " support is " + support);
                // Left sisters first, then right sisters, so the output order is stable.
                ScoreSisterContexts((Hashtable)leftRules[label], label, "=l=", cntr, nf, answers, topScores);
                ScoreSisterContexts((Hashtable)rightRules[label], label, "=r=", cntr, nf, answers, topScores);
                System.Console.Out.WriteLine("----");
                System.Console.Out.WriteLine("Sorted descending support * KL");
                // NOTE(review): Sort(null) uses the elements' default ordering, i.e. whatever
                // comparison Pair defines; the heading above claims descending score - confirm.
                answers.Sort(null);
                foreach (object answer in answers)
                {
                    Pair   p   = (Pair)answer;
                    double psd = (double)p.Second();
                    System.Console.Out.WriteLine(p.First() + ": " + nf.Format(psd));
                }
                System.Console.Out.WriteLine();
            }
            // NOTE(review): same default-ordering caveat as above; the bucket loop below
            // presumably expects descending scores and ascending Cutoffs - confirm.
            topScores.Sort(null);

            foreach (object topScore in topScores)
            {
                Pair   p   = (Pair)topScore;
                double psd = (double)p.Second();
                System.Console.Out.WriteLine(p.First() + ": " + nf.Format(psd));
            }
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("  // Automatically generated by SisterAnnotationStats -- preferably don't edit");

            // Walk the scores once, starting with the last cutoff. When a score falls below
            // the current cutoff, step down to the previous cutoff and re-test the same entry.
            int k = Cutoffs.Length - 1;
            int j = 0;
            while (j < topScores.Count)
            {
                Pair   p   = (Pair)topScores[j];
                double psd = (double)p.Second();
                if (psd < Cutoffs[k])
                {
                    if (k == 0)
                    {
                        // below the lowest cutoff: nothing further is emitted
                        break;
                    }
                    k--;
                    continue;
                }
                javaSB[k].Append("\"").Append(p.First());
                javaSB[k].Append("\",");
                j++;
            }
            foreach (StringBuilder sb in javaSB)
            {
                // Drop only the trailing comma (if any entry was written) and close the
                // initializer. Cutting two characters would also remove the closing quote of
                // the last entry, or the opening brace of an empty array.
                if (sb.Length > 0 && sb[sb.Length - 1] == ',')
                {
                    sb.Length -= 1;
                }
                sb.Append("};");
                System.Console.Out.WriteLine(sb);
            }
            System.Console.Out.Write("  public static String[] sisterSplit = ");
            for (int level = Cutoffs.Length; level > 0; level--)
            {
                if (level == 1)
                {
                    System.Console.Out.Write("sisterSplit1");
                }
                else
                {
                    System.Console.Out.Write("selectiveSisterSplit" + level + " ? sisterSplit" + level + " : (");
                }
            }
            // Close exactly the parentheses opened above: one per cutoff except the first.
            for (int open = 1; open < Cutoffs.Length; open++)
            {
                System.Console.Out.Write(")");
            }
            System.Console.Out.WriteLine(";");
        }

        /// <summary>
        /// Scores every sister context in <paramref name="contexts"/> against the counter of
        /// the unannotated label, prints one line per context and records the score
        /// (KL divergence times the context's total count) in both result lists.
        /// </summary>
        /// <param name="contexts">Table from sister name to the counter observed with that sister.</param>
        /// <param name="label">The unannotated node label.</param>
        /// <param name="marker">Separator placed between label and sister (<c>"=l="</c> or <c>"=r="</c>).</param>
        /// <param name="parentCounts">Counter of the unannotated label.</param>
        /// <param name="nf">Formatter used for the printed divergence.</param>
        /// <param name="answers">Per-label result list to append to.</param>
        /// <param name="topScores">Global result list to append to.</param>
        private static void ScoreSisterContexts(Hashtable contexts, string label, string marker, ClassicCounter parentCounts, NumberFormat nf, ArrayList answers, ArrayList topScores)
        {
            foreach (object key in contexts.Keys)
            {
                string         sis           = (string)key;
                ClassicCounter sisterCounts  = (ClassicCounter)contexts[sis];
                double         sisterSupport = sisterCounts.TotalCount();
                // The score uses the full distribution of the unannotated label as reference.
                double kl             = Counters.KlDivergence(sisterCounts, parentCounts);
                string annotatedLabel = label + marker + sis;
                System.Console.Out.WriteLine("KL(" + annotatedLabel + "||" + label + ") = " + nf.Format(kl) + "\t" + "support(" + sis + ") = " + sisterSupport);
                answers.Add(new Pair(annotatedLabel, kl * sisterSupport));
                topScores.Add(new Pair(annotatedLabel, kl * sisterSupport));
            }
        }
        /// <summary>
        /// Return various statistics about the treebank (number of sentences,
        /// words, tag set, etc.).
        /// </summary>
        /// <remarks>
        /// As a side effect, any subtree whose label is null, or whose label value is null,
        /// is given an empty-string label while the trees are being scanned.
        /// </remarks>
        /// <param name="tlp">
        /// The TreebankLanguagePack used to determine punctuation tags; may be null,
        /// in which case no punctuation statistics are collected.
        /// </param>
        /// <returns>A big string for human consumption describing the treebank</returns>
        /// <exception cref="InvalidOperationException">
        /// A subtree is neither a leaf, a preterminal nor phrasal.
        /// </exception>
        public virtual string TextualSummary(ITreebankLanguagePack tlp)
        {
            int  numTrees         = 0;
            int  numTreesLE40     = 0;
            int  numNonUnaryRoots = 0;
            Tree nonUnaryEg       = null;
            ClassicCounter <Tree>   nonUnaries = new ClassicCounter <Tree>();
            ClassicCounter <string> roots      = new ClassicCounter <string>();
            ClassicCounter <string> starts     = new ClassicCounter <string>();
            ClassicCounter <string> puncts     = new ClassicCounter <string>();
            int numUnenclosedLeaves            = 0;
            int numLeaves     = 0;
            int numNonPhrasal = 0;
            int numPreTerminalWithMultipleChildren = 0;
            int numWords                       = 0;
            int numTags                        = 0;
            int shortestSentence               = int.MaxValue;
            int longestSentence                = 0;
            int numNullLabel                   = 0;
            ICollection <string>    words      = Generics.NewHashSet();
            ClassicCounter <string> tags       = new ClassicCounter <string>();
            ClassicCounter <string> cats       = new ClassicCounter <string>();
            Tree leafEg                        = null;
            Tree preTerminalMultipleChildrenEg = null;
            Tree nullLabelEg                   = null;
            Tree rootRewritesAsTaggedWordEg    = null;

            foreach (Tree t in this)
            {
                // ---- per-tree (root level) statistics ----
                roots.IncrementCount(t.Value());
                numTrees++;
                int leng = t.Yield().Count;
                if (leng <= 40)
                {
                    numTreesLE40++;
                }
                if (leng < shortestSentence)
                {
                    shortestSentence = leng;
                }
                if (leng > longestSentence)
                {
                    longestSentence = leng;
                }
                if (t.NumChildren() > 1)
                {
                    if (numNonUnaryRoots == 0)
                    {
                        nonUnaryEg = t;
                    }
                    // only the first 100 non-unary rewrites are tallied
                    if (numNonUnaryRoots < 100)
                    {
                        nonUnaries.IncrementCount(t.LocalTree());
                    }
                    numNonUnaryRoots++;
                }
                else if (t.IsLeaf())
                {
                    numUnenclosedLeaves++;
                }
                else
                {
                    Tree t2 = t.FirstChild();
                    if (t2.IsLeaf())
                    {
                        numLeaves++;
                        leafEg = t;
                    }
                    else if (t2.IsPreTerminal())
                    {
                        if (numNonPhrasal == 0)
                        {
                            rootRewritesAsTaggedWordEg = t;
                        }
                        numNonPhrasal++;
                    }
                    starts.IncrementCount(t2.Value());
                }
                // ---- per-node statistics ----
                foreach (Tree subtree in t)
                {
                    ILabel lab = subtree.Label();
                    if (lab == null || lab.Value() == null || lab.Value().IsEmpty())
                    {
                        if (numNullLabel == 0)
                        {
                            nullLabelEg = subtree;
                        }
                        numNullLabel++;
                        // normalise null labels / null values to the empty string so the
                        // Value() calls below do not yield null
                        if (lab == null)
                        {
                            subtree.SetLabel(new StringLabel(string.Empty));
                        }
                        else if (lab.Value() == null)
                        {
                            subtree.Label().SetValue(string.Empty);
                        }
                    }
                    if (subtree.IsLeaf())
                    {
                        numWords++;
                        words.Add(subtree.Value());
                    }
                    else if (subtree.IsPreTerminal())
                    {
                        numTags++;
                        tags.IncrementCount(subtree.Value());
                        if (tlp != null && tlp.IsPunctuationTag(subtree.Value()))
                        {
                            puncts.IncrementCount(subtree.FirstChild().Value());
                        }
                    }
                    else if (subtree.IsPhrasal())
                    {
                        bool hasLeafChild = false;
                        foreach (Tree kt in subtree.Children())
                        {
                            if (kt.IsLeaf())
                            {
                                hasLeafChild = true;
                                // one leaf child is enough; no need to scan the rest
                                break;
                            }
                        }
                        if (hasLeafChild)
                        {
                            numPreTerminalWithMultipleChildren++;
                            if (preTerminalMultipleChildrenEg == null)
                            {
                                preTerminalMultipleChildrenEg = subtree;
                            }
                        }
                        cats.IncrementCount(subtree.Value());
                    }
                    else
                    {
                        throw new InvalidOperationException("Treebank: Bad tree in treebank!: " + subtree);
                    }
                }
            }
            StringWriter sw = new StringWriter(2000);
            PrintWriter  pw = new PrintWriter(sw);
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(0);
            pw.Println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
            if (numTrees > 0)
            {
                if (numTags != numWords)
                {
                    pw.Println("  Warning! numTags differs and is " + numTags);
                }
                if (roots.Size() == 1)
                {
                    string root = (string)Sharpen.Collections.ToArray(roots.KeySet())[0];
                    pw.Println("  The root category is: " + root);
                }
                else
                {
                    pw.Println("  Warning! " + roots.Size() + " different roots in treebank: " + Counters.ToString(roots, nf));
                }
                if (numNonUnaryRoots > 0)
                {
                    pw.Print("  Warning! " + numNonUnaryRoots + " trees without unary initial rewrite.  ");
                    if (numNonUnaryRoots > 100)
                    {
                        pw.Print("First 100 ");
                    }
                    pw.Println("Rewrites: " + Counters.ToString(nonUnaries, nf));
                    pw.Println("    Example: " + nonUnaryEg);
                }
                if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0)
                {
                    pw.Println("  Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
                    if (numLeaves > 0)
                    {
                        pw.Println("  Example bad root rewrites as leaf: " + leafEg);
                    }
                    if (numNonPhrasal > 0)
                    {
                        pw.Println("  Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
                    }
                }
                if (numNullLabel > 0)
                {
                    pw.Println("  Warning!  " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
                    pw.Println("    " + nullLabelEg);
                }
                if (numPreTerminalWithMultipleChildren > 0)
                {
                    pw.Println("  Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
                    pw.Println("    Example: " + preTerminalMultipleChildrenEg);
                }
                // Average truncated to two decimals. The multiplication is done in 64-bit
                // arithmetic because numWords * 100 wraps around in 32-bit once the
                // treebank exceeds roughly 21 million tokens.
                double averageLength = (((long)numWords * 100) / numTrees) / 100.0;
                pw.Println("  Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + averageLength + " words.");
                pw.Println("  " + cats.Size() + " phrasal category types, " + tags.Size() + " tag types, and " + words.Count + " word types");
                string[] empties = new string[] { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
                // What a dopey choice using 0 as an empty element name!!
                // The problem with the below is that words aren't turned into a basic
                // category, but empties commonly are indexed....  Would need to look
                // for them with a suffix of -[0-9]+
                ICollection <string> knownEmpties        = Generics.NewHashSet(Arrays.AsList(empties));
                ICollection <string> emptiesIntersection = Sets.Intersection(words, knownEmpties);
                if (!emptiesIntersection.IsEmpty())
                {
                    pw.Println("  Caution! " + emptiesIntersection.Count + " word types are known empty elements: " + emptiesIntersection);
                }
                ICollection <string> joint = Sets.Intersection(cats.KeySet(), tags.KeySet());
                if (!joint.IsEmpty())
                {
                    pw.Println("  Warning! " + joint.Count + " items are tags and categories: " + joint);
                }
                // Only the first offending category / tag is reported.
                foreach (string cat in cats.KeySet())
                {
                    if (cat != null && cat.Contains("@"))
                    {
                        pw.Println("  Warning!!  Stanford Parser does not work with categories containing '@' like: " + cat);
                        break;
                    }
                }
                foreach (string cat_1 in tags.KeySet())
                {
                    if (cat_1 != null && cat_1.Contains("@"))
                    {
                        pw.Println("  Warning!!  Stanford Parser does not work with tags containing '@' like: " + cat_1);
                        break;
                    }
                }
                pw.Println("    Cats: " + Counters.ToString(cats, nf));
                pw.Println("    Tags: " + Counters.ToString(tags, nf));
                pw.Println("    " + starts.Size() + " start categories: " + Counters.ToString(starts, nf));
                if (!puncts.IsEmpty())
                {
                    pw.Println("    Puncts: " + Counters.ToString(puncts, nf));
                }
            }
            return(sw.ToString());
        }