Пример #1
0
        /// <summary>
        /// Finds the best sequence by collecting numSamples samples, scoring them, and then choosing
        /// the highest scoring sample.
        /// </summary>
        /// <param name="model">Sequence model used both to generate and to score samples.</param>
        /// <param name="numSamples">Number of samples to collect.</param>
        /// <param name="sampleInterval">Interval between collected samples.</param>
        /// <param name="initialSequence">Starting sequence for the sampler.</param>
        /// <returns>the array of type int representing the highest scoring sequence</returns>
        public virtual int[] FindBestUsingSampling(ISequenceModel model, int numSamples, int sampleInterval, int[] initialSequence)
        {
            IList samples = CollectSamples(model, numSamples, sampleInterval, initialSequence);

            // Scan every collected sample, keeping the highest-scoring one seen so far.
            double topScore    = double.NegativeInfinity;
            int[]  topSequence = null;

            foreach (object candidate in samples)
            {
                int[]  seq       = (int[])candidate;
                double seqScore  = model.ScoreOf(seq);
                if (seqScore > topScore)
                {
                    topScore    = seqScore;
                    topSequence = seq;
                    log.Info("found new best (" + topScore + ")");
                    log.Info(ArrayMath.ToString(topSequence));
                }
            }
            return topSequence;
        }
Пример #2
0
        /// <summary>Execute with no arguments for usage.</summary>
        /// <param name="args">Command-line arguments; checked by ValidateCommandLine.</param>
        public static void Main(string[] args)
        {
            if (!ValidateCommandLine(args))
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            // FIX: use the current wall-clock time. `new DateTime()` is 0001-01-01T00:00,
            // whereas the Java original's `new Date()` means "now"; the printed start time
            // was meaningless before (elapsed time was unaffected since both ends shifted).
            DateTime startTime = DateTime.Now;

            System.Console.Out.WriteLine("##################################");
            System.Console.Out.WriteLine("# Stanford Treebank Preprocessor #");
            System.Console.Out.WriteLine("##################################");
            System.Console.Out.Printf("Start time: %s%n", startTime);
            System.Console.Out.Printf("Configuration: %s%n%n", configFile);
            ConfigParser cp = new ConfigParser(configFile);

            cp.Parse();
            DistributionPackage distrib = new DistributionPackage();

            // Each Properties object yielded by the parser describes one dataset to build.
            foreach (Properties dsParams in cp)
            {
                string nameOfDataset = PropertiesUtils.HasProperty(dsParams, ConfigParser.paramName) ? dsParams.GetProperty(ConfigParser.paramName) : "UN-NAMED";
                if (outputPath != null)
                {
                    dsParams.SetProperty(ConfigParser.paramOutputPath, outputPath);
                }
                IDataset ds = GetDatasetClass(dsParams);
                if (ds == null)
                {
                    System.Console.Out.Printf("Unable to instantiate TYPE for dataset %s. Check the javadocs%n", nameOfDataset);
                    continue;
                }
                // FIX: `bool.ParseBoolean` does not exist in .NET (Java's Boolean.parseBoolean
                // was mis-converted); Boolean.Parse is the correct API.
                bool shouldDistribute = dsParams.Contains(ConfigParser.paramDistrib) && bool.Parse(dsParams.GetProperty(ConfigParser.paramDistrib));
                dsParams.Remove(ConfigParser.paramDistrib);
                bool lacksRequiredOptions = !(ds.SetOptions(dsParams));
                if (lacksRequiredOptions)
                {
                    System.Console.Out.Printf("Skipping dataset %s as it lacks required parameters. Check the javadocs%n", nameOfDataset);
                    continue;
                }
                ds.Build();
                if (shouldDistribute)
                {
                    distrib.AddFiles(ds.GetFilenames());
                }
                if (Verbose)
                {
                    System.Console.Out.Printf("%s%n", ds.ToString());
                }
            }
            if (MakeDistrib)
            {
                distrib.Make(distribName);
            }
            if (Verbose)
            {
                System.Console.Out.WriteLine("-->configuration details");
                System.Console.Out.WriteLine(cp.ToString());
                if (MakeDistrib)
                {
                    System.Console.Out.WriteLine("-->distribution package details");
                    System.Console.Out.WriteLine(distrib.ToString());
                }
            }
            DateTime stopTime    = DateTime.Now;
            long     elapsedTime = stopTime.GetTime() - startTime.GetTime();

            System.Console.Out.Printf("Completed processing at %s%n", stopTime);
            System.Console.Out.Printf("Elapsed time: %d seconds%n", (int)(elapsedTime / 1000F));
        }
Пример #3
0
 /// <summary>Prints a usage summary for each supported SemgrexPattern argument.</summary>
 public static void Help()
 {
     // One usage line per flag, emitted in a fixed order.
     string[] usageLines =
     {
         "Possible arguments for SemgrexPattern:",
         Pattern + ": what pattern to use for matching",
         TreeFile + ": a file of trees to process",
         ConlluFile + ": a CoNLL-U file of dependency trees to process",
         Mode + ": what mode for dependencies.  basic, collapsed, or ccprocessed.  To get 'noncollapsed', use basic with extras",
         Extras + ": whether or not to use extras",
         OutputFormatOption + ": output format of matches. list or offset. 'list' prints the graph as a list of dependencies, " + "'offset' prints the filename and the line offset in the ConLL-U file."
     };
     foreach (string line in usageLines)
     {
         log.Info(line);
     }
     log.Info();
     log.Info(Pattern + " is required");
 }
 /// <summary>Checks whether a POS tag is one a VBN-style transform may apply to.</summary>
 /// <param name="tag">the POS tag under test</param>
 /// <returns>true when the tag equals the VBN, VBD, or JJ tag</returns>
 public override bool Precondition(string tag)
 {
     // Evaluate once and reuse for both the log line and the return value.
     bool satisfied = tag.Equals(vbnTag) || tag.Equals(vbdTag) || tag.Equals(jjTag);
     log.Info("VBN: Testing precondition on " + tag + ": " + satisfied);
     return satisfied;
 }
 /// <summary>This method allows you to show the results of timing according to another class' logger.</summary>
 /// <remarks>
 /// This method allows you to show the results of timing according to another class' logger.
 /// E.g.,
 /// <c>timing.done(logger, "Loading lexicon")</c>
 /// .
 /// </remarks>
 /// <param name="logger">Logger to log a timed operation with</param>
 /// <param name="msg">Message to report.</param>
 public virtual void Done(Redwood.RedwoodChannels logger, StringBuilder msg)
 {
     // Append the elapsed-time suffix to the caller's message, then emit it.
     msg.Append("... done [");
     msg.Append(ToSecondsString());
     msg.Append(" sec].");
     logger.Info(msg.ToString());
 }
        /// <summary>
        /// Runs the phrase-suggestion pipeline on test data: loads a saved patterns model,
        /// optionally continues learning, then extracts candidate phrases from Data.sents
        /// and returns them as JSON.
        /// </summary>
        /// <param name="testProps">Properties controlling the test run (merged over the model properties).</param>
        /// <param name="modelPropertiesFile">Properties file the model was trained with.</param>
        /// <param name="stopWordsFile">File of stop/common words used to filter candidates.</param>
        /// <returns>JSON rendering of the extracted words per label.</returns>
        /// <exception cref="System.MemberAccessException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Java.Lang.InstantiationException"/>
        /// <exception cref="System.MissingMethodException"/>
        /// <exception cref="System.Reflection.TargetInvocationException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="Java.Sql.SQLException"/>
        public virtual string SuggestPhrasesTest(Properties testProps, string modelPropertiesFile, string stopWordsFile)
        {
            logger.Info("Suggesting phrases in test");
            logger.Info("test properties are " + testProps);
            Properties runProps = StringUtils.ArgsToPropertiesWithResolve(new string[] { "-props", modelPropertiesFile });

            // Strip training-only properties that must not leak into the test run.
            string[] removeProperties = new string[] { "allPatternsDir", "storePatsForEachToken", "invertedIndexClass", "savePatternsWordsDir", "batchProcessSents", "outDir", "saveInvertedIndex", "removeOverLappingLabels", "numThreads" };
            foreach (string s in removeProperties)
            {
                if (runProps.Contains(s))
                {
                    runProps.Remove(s);
                }
            }
            runProps.SetProperty("stopWordsPatternFiles", stopWordsFile);
            runProps.SetProperty("englishWordsFiles", stopWordsFile);
            runProps.SetProperty("commonWordsPatternFiles", stopWordsFile);
            // Layer the properties: model defaults < this.props < testProps, then fold back into props.
            runProps.PutAll(props);
            runProps.PutAll(testProps);
            props.PutAll(runProps);
            ProcessText(false);
            GetPatternsFromDataMultiClass <SurfacePattern> model = new GetPatternsFromDataMultiClass <SurfacePattern>(runProps, Data.sents, seedWords, true, humanLabelClasses);

            ArgumentParser.FillOptions(model, runProps);
            GetPatternsFromDataMultiClass.LoadFromSavedPatternsWordsDir(model, runProps);
            // Record the last completed learning iteration for each label.
            IDictionary <string, int> alreadyLearnedIters = new Dictionary <string, int>();

            foreach (string label in model.constVars.GetLabels())
            {
                alreadyLearnedIters[label] = model.constVars.GetLearnedWordsEachIter()[label].LastEntry().Key;
            }
            if (model.constVars.learn)
            {
                //      Map<String, E> p0 = new HashMap<String, SurfacePattern>();
                //      Map<String, Counter<CandidatePhrase>> p0Set = new HashMap<String, Counter<CandidatePhrase>>();
                //      Map<String, Set<E>> ignorePatterns = new HashMap<String, Set<E>>();
                model.IterateExtractApply(null, null, null);
            }
            IDictionary <string, ICounter <CandidatePhrase> > allExtractions = new Dictionary <string, ICounter <CandidatePhrase> >();
            //Only for one label right now!
            // FIX: Java's labels.iterator().next() was mis-converted to GetEnumerator().Current;
            // in .NET, Current is undefined until MoveNext() has been called, so the original
            // never actually fetched the first label.
            var labelEnum = model.constVars.GetLabels().GetEnumerator();
            labelEnum.MoveNext();
            string label_1 = (string)labelEnum.Current;

            allExtractions[label_1] = new ClassicCounter <CandidatePhrase>();
            // Accumulate runs of pattern-matched tokens into candidate phrases; a
            // non-matching token terminates (and records) the current phrase.
            foreach (KeyValuePair <string, DataInstance> sent in Data.sents)
            {
                StringBuilder str = new StringBuilder();
                foreach (CoreLabel l in sent.Value.GetTokens())
                {
                    if (l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null && !l.Get(typeof(PatternsAnnotations.MatchedPatterns)).IsEmpty())
                    {
                        str.Append(" " + l.Word());
                    }
                    else
                    {
                        allExtractions[label_1].IncrementCount(CandidatePhrase.CreateOrGet(str.ToString().Trim()));
                        str.Length = 0;
                    }
                }
            }
            allExtractions.PutAll(model.matchedSeedWords);
            return(model.constVars.GetSetWordsAsJson(allExtractions));
        }
        /// <summary>Create a new KBP annotator from the given properties.</summary>
        /// <param name="name">The annotator name used as a prefix when reading options.</param>
        /// <param name="props">The properties to use when creating this extractor.</param>
        public KBPAnnotator(string name, Properties props)
        {
            //@ArgumentParser.Option(name="kbp.language", gloss="language for kbp")
            //private String language = "english";

            /*
             * A TokensRegexNER annotator for the special KBP NER types (case-sensitive).
             */
            //private final TokensRegexNERAnnotator casedNER;

            /*
             * A TokensRegexNER annotator for the special KBP NER types (case insensitive).
             */
            //private final TokensRegexNERAnnotator caselessNER;
            // Parse standard properties
            ArgumentParser.FillOptions(this, name, props);
            //Locale kbpLanguage =
            //(language.toLowerCase().equals("zh") || language.toLowerCase().equals("chinese")) ?
            //Locale.CHINESE : Locale.ENGLISH ;
            kbpProperties = props;
            try
            {
                // Assemble the ensemble of relation extractors; each source is optional
                // and skipped when its option is left at the NotProvided sentinel.
                List <IKBPRelationExtractor> extractors = new List <IKBPRelationExtractor>();
                // add tokensregex rules
                if (!tokensregexdir.Equals(NotProvided))
                {
                    extractors.Add(new KBPTokensregexExtractor(tokensregexdir, Verbose));
                }
                // add semgrex rules
                if (!semgrexdir.Equals(NotProvided))
                {
                    extractors.Add(new KBPSemgrexExtractor(semgrexdir, Verbose));
                }
                // attempt to add statistical model
                if (!model.Equals(NotProvided))
                {
                    log.Info("Loading KBP classifier from: " + model);
                    object @object = IOUtils.ReadObjectFromURLOrClasspathOrFileSystem(model);
                    IKBPRelationExtractor statisticalExtractor;
                    // The serialized object may be either a raw classifier or an
                    // already-wrapped KBPStatisticalExtractor; anything else is fatal.
                    if (@object is LinearClassifier)
                    {
                        //noinspection unchecked
                        statisticalExtractor = new KBPStatisticalExtractor((IClassifier <string, string>)@object);
                    }
                    else
                    {
                        if (@object is KBPStatisticalExtractor)
                        {
                            statisticalExtractor = (KBPStatisticalExtractor)@object;
                        }
                        else
                        {
                            throw new InvalidCastException(@object.GetType() + " cannot be cast into a " + typeof(KBPStatisticalExtractor));
                        }
                    }
                    extractors.Add(statisticalExtractor);
                }
                // build extractor
                this.extractor = new KBPEnsembleExtractor(Sharpen.Collections.ToArray(extractors, new IKBPRelationExtractor[extractors.Count]));
                // set maximum length of sentence to operate on
                // NOTE(review): "-1" default presumably means "no limit" — confirm against usage.
                maxLength = System.Convert.ToInt32(props.GetProperty("kbp.maxlen", "-1"));
            }
            catch (Exception e)
            {
                // Any failure during extractor construction is surfaced as a runtime I/O error.
                throw new RuntimeIOException(e);
            }
            // set up map for converting between older and new KBP relation names
            relationNameConversionMap = new Dictionary <string, string>();
            relationNameConversionMap["org:dissolved"] = "org:date_dissolved";
            relationNameConversionMap["org:founded"]   = "org:date_founded";
            relationNameConversionMap["org:number_of_employees/members"]     = "org:number_of_employees_members";
            relationNameConversionMap["org:political/religious_affiliation"] = "org:political_religious_affiliation";
            relationNameConversionMap["org:top_members/employees"]           = "org:top_members_employees";
            relationNameConversionMap["per:member_of"]   = "per:employee_or_member_of";
            relationNameConversionMap["per:employee_of"] = "per:employee_or_member_of";
            relationNameConversionMap["per:stateorprovinces_of_residence"] = "per:statesorprovinces_of_residence";
            // set up KBP language
            kbpLanguage = LanguageInfo.GetLanguageFromString(props.GetProperty("kbp.language", "en"));
            // build the Spanish coref system if necessary
            if (LanguageInfo.HumanLanguage.Spanish.Equals(kbpLanguage))
            {
                spanishCorefSystem = new KBPBasicSpanishCorefSystem();
            }
        }
Пример #8
0
        /// <summary>Entry point for the joint segmentation / parsing model.</summary>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length < MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            // Flags: -v verbose, -t test treebank path, -l max gold sentence length,
            // -o input is serialized (gzip + object stream).
            Properties options        = StringUtils.ArgsToProperties(args, OptionArgDefs());
            bool       Verbose        = PropertiesUtils.GetBool(options, "v", false);
            File       testTreebank   = options.Contains("t") ? new File(options.GetProperty("t")) : null;
            int        maxGoldSentLen = PropertiesUtils.GetInt(options, "l", int.MaxValue);
            bool       SerInput       = PropertiesUtils.GetBool(options, "o", false);

            string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
            if (parsedArgs.Length != MinArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            File     trainTreebank = new File(parsedArgs[0]);
            // NOTE(review): new DateTime() is 0001-01-01, so the printed start time is not
            // "now" (Java original used new Date()); elapsed time below is still consistent.
            DateTime startTime     = new DateTime();

            log.Info("###################################");
            log.Info("### Joint Segmentation / Parser ###");
            log.Info("###################################");
            System.Console.Error.Printf("Start time: %s\n", startTime);
            JointParsingModel parsingModel = new JointParsingModel();

            parsingModel.SetVerbose(Verbose);
            parsingModel.SetMaxEvalSentLen(maxGoldSentLen);
            parsingModel.SetSerInput(SerInput);
            //WSGDEBUG -- Some stuff for eclipse debugging
            InputStream inputStream = null;

            try
            {
                if (Runtime.GetProperty("eclipse") == null)
                {
                    inputStream = (SerInput) ? new ObjectInputStream(new GZIPInputStream(Runtime.@in)) : Runtime.@in;
                }
                else
                {
                    FileInputStream fileStream = new FileInputStream(new File("debug.2.xml"));
                    inputStream = (SerInput) ? new ObjectInputStream(new GZIPInputStream(fileStream)) : fileStream;
                }
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
                System.Environment.Exit(-1);
            }
            // BUG FIX: a finally block here used to close inputStream immediately after it
            // was opened -- before parsingModel.Run() ever read from it. The stream is now
            // closed only after the run completes (or fails).
            try
            {
                if (!trainTreebank.Exists())
                {
                    log.Info("Training treebank does not exist!\n  " + trainTreebank.GetPath());
                }
                else
                {
                    if (testTreebank != null && !testTreebank.Exists())
                    {
                        log.Info("Test treebank does not exist!\n  " + testTreebank.GetPath());
                    }
                    else
                    {
                        if (parsingModel.Run(trainTreebank, testTreebank, inputStream))
                        {
                            log.Info("Successful shutdown!");
                        }
                        else
                        {
                            log.Error("Parsing model failure.");
                        }
                    }
                }
            }
            finally
            {
                if (inputStream != null)
                {
                    try
                    {
                        inputStream.Close();
                    }
                    catch (IOException)
                    {
                        // Best-effort close; nothing useful to do on failure.
                    }
                }
            }
            DateTime stopTime    = new DateTime();
            long     elapsedTime = stopTime.GetTime() - startTime.GetTime();

            log.Info();
            log.Info();
            System.Console.Error.Printf("Completed processing at %s\n", stopTime);
            System.Console.Error.Printf("Elapsed time: %d seconds\n", (int)(elapsedTime / 1000F));
        }
        /// <summary>for testing -- CURRENTLY BROKEN!!!</summary>
        /// <param name="args">input dir and output filename</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            // NOTE(review): the summary says "input dir and output filename" but the code
            // actually requires: treebankPath trainNums testNums (see message below).
            if (args.Length != 3)
            {
                throw new Exception("args: treebankPath trainNums testNums");
            }
            ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();

            ctpp.charTags = true;
            // TODO: these options are getting clobbered by reading in the
            // parser object (unless it's a text file parser?)
            Options op = new Options(ctpp);

            op.doDep = false;
            op.testOptions.maxLength = 90;
            LexicalizedParser lp;

            try
            {
                // Train on the file-number range given in args[1], then serialize the parser.
                IFileFilter trainFilt = new NumberRangesFileFilter(args[1], false);
                lp = LexicalizedParser.TrainFromTreebank(args[0], trainFilt, op);
                try
                {
                    string filename = "chineseCharTagPCFG.ser.gz";
                    log.Info("Writing parser in serialized format to file " + filename + " ");
                    System.Console.Error.Flush();
                    ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                    @out.WriteObject(lp);
                    @out.Close();
                    log.Info("done.");
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            catch (ArgumentException)
            {
                // If args[1] is not a valid number range, treat it as a serialized model path.
                lp = LexicalizedParser.LoadModel(args[1], op);
            }
            // Load the test section of the treebank (file-number range in args[2]).
            IFileFilter    testFilt     = new NumberRangesFileFilter(args[2], false);
            MemoryTreebank testTreebank = ctpp.MemoryTreebank();

            testTreebank.LoadPath(new File(args[0]), testFilt);
            // Output is written GB18030-encoded to out.chi, with autoflush enabled.
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream("out.chi"), "GB18030"), true);
            WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
            WordCatEqualityChecker    eqcheck = new WordCatEqualityChecker();
            EquivalenceClassEval      eval    = new EquivalenceClassEval(eqclass, eqcheck);

            //    System.out.println("Preterminals:" + preterminals);
            System.Console.Out.WriteLine("Testing...");
            foreach (Tree gold in testTreebank)
            {
                Tree tree;
                try
                {
                    tree = lp.ParseTree(gold.YieldHasWord());
                    if (tree == null)
                    {
                        System.Console.Out.WriteLine("Failed to parse " + gold.YieldHasWord());
                        continue;
                    }
                }
                catch (Exception e)
                {
                    // A parse failure on one tree should not abort the whole evaluation.
                    Sharpen.Runtime.PrintStackTrace(e);
                    continue;
                }
                // Print gold and predicted yields/tag sequences side by side.
                gold = gold.FirstChild();
                pw.Println(SentenceUtils.ListToString(gold.PreTerminalYield()));
                pw.Println(SentenceUtils.ListToString(gold.Yield()));
                gold.PennPrint(pw);
                pw.Println(tree.PreTerminalYield());
                pw.Println(tree.Yield());
                tree.PennPrint(pw);
                //      Collection allBrackets = WordCatConstituent.allBrackets(tree);
                //      Collection goldBrackets = WordCatConstituent.allBrackets(gold);
                //      eval.eval(allBrackets, goldBrackets);
                eval.DisplayLast();
            }
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine();
            eval.Display();
        }
Пример #10
0
        /// <summary>Creates an ArabicTokenizer.</summary>
        /// <remarks>
        /// Creates an ArabicTokenizer. The default tokenizer
        /// is ArabicTokenizer.atbFactory(), which produces the
        /// same orthographic normalization as Green and Manning (2010).
        /// </remarks>
        /// <returns>A TokenizerFactory that produces each Arabic token as a CoreLabel</returns>
        private ITokenizerFactory <CoreLabel> GetTokenizerFactory()
        {
            // Pre-tokenized input needs no tokenizer at all.
            if (isTokenized)
            {
                return null;
            }
            ITokenizerFactory <CoreLabel> factory;
            if (tokenizerOptions == null)
            {
                // No user options: use the ATB factory with its default normalization.
                factory = ArabicTokenizer.AtbFactory();
                string atbVocOptions = "removeProMarker,removeMorphMarker,removeLengthening";
                factory.SetOptions(atbVocOptions);
            }
            else
            {
                if (tokenizerOptions.Contains("removeSegMarker"))
                {
                    throw new Exception("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
                }
                factory = ArabicTokenizer.Factory();
                factory.SetOptions(tokenizerOptions);
            }
            log.Info("Loaded ArabicTokenizer with options: " + tokenizerOptions);
            return factory;
        }
Пример #11
0
        /// <summary>Train a multinomial classifier off of the provided dataset.</summary>
        /// <param name="dataset">The dataset to train the classifier off of.</param>
        /// <param name="featureThreshold">Features occurring fewer times than this are dropped.</param>
        /// <param name="sigma">Regularization parameter handed to the classifier factory.</param>
        /// <returns>A classifier.</returns>
        public static IClassifier <string, string> TrainMultinomialClassifier(GeneralDataset <string, string> dataset, int featureThreshold, double sigma)
        {
            // Prune rare features and shuffle (fixed seed) before training.
            log.Info("Applying feature threshold (" + featureThreshold + ")...");
            dataset.ApplyFeatureCountThreshold(featureThreshold);
            log.Info("Randomizing dataset...");
            dataset.Randomize(42l);
            log.Info("Creating factory...");
            LinearClassifierFactory <string, string> factory = InitFactory(sigma);
            log.Info("BEGIN training");
            LinearClassifier <string, string> trained = factory.TrainClassifier(dataset);
            log.Info("END training");
            // Sanity check: measure accuracy on the training set itself.
            KBPRelationExtractor.Accuracy trainAccuracy = new KBPRelationExtractor.Accuracy();
            foreach (IDatum <string, string> example in dataset)
            {
                string predicted = trained.ClassOf(example);
                trainAccuracy.Predict(Java.Util.Collections.Singleton(predicted), Java.Util.Collections.Singleton(example.Label()));
            }
            log.Info("Training accuracy:");
            log.Info(trainAccuracy.ToString());
            log.Info(string.Empty);
            return trained;
        }
 /// <summary>Logs the accumulated options description.</summary>
 public override void Display()
 {
     string rendered = optionsString.ToString();
     log.Info(rendered);
 }
        /// <summary>
        /// Splits a treebank into weighted random subsets and writes each subset to
        /// its own output file (Output + "." + split name).
        /// </summary>
        /// <param name="args">Command-line options filled into ArgumentParser / SplitTrainingSet.</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            // Read options and validate the split specification.
            Properties props = StringUtils.ArgsToProperties(args);

            ArgumentParser.FillOptions(new Type[] { typeof(ArgumentParser), typeof(SplitTrainingSet) }, props);
            if (SplitNames.Length != SplitWeights.Length)
            {
                throw new ArgumentException("Name and weight arrays must be of the same length");
            }
            // Sum the weights, rejecting negative entries.
            double weightSum = 0.0;
            foreach (double w in SplitWeights)
            {
                weightSum += w;
                if (w < 0.0)
                {
                    throw new ArgumentException("Split weights cannot be negative");
                }
            }
            if (weightSum <= 0.0)
            {
                throw new ArgumentException("Split weights must total to a positive weight");
            }
            // Normalize so the weights sum to one.
            IList <double> normalizedWeights = new List <double>();
            foreach (double w in SplitWeights)
            {
                normalizedWeights.Add(w / weightSum);
            }
            logger.Info("Splitting into " + normalizedWeights.Count + " lists with weights " + normalizedWeights);
            if (Seed == 0L)
            {
                Seed = Runtime.NanoTime();
                logger.Info("Random seed not set by options, using " + Seed);
            }
            Random random = new Random(Seed);
            // One output bucket per split.
            IList <IList <Tree> > buckets = new List <IList <Tree> >();
            for (int b = 0; b < normalizedWeights.Count; ++b)
            {
                buckets.Add(new List <Tree>());
            }
            Treebank treebank = new MemoryTreebank(null);

            treebank.LoadPath(Input);
            logger.Info("Splitting " + treebank.Count + " trees");
            // Assign each tree to a bucket sampled from the normalized weights.
            foreach (Tree tree in treebank)
            {
                int bucketIndex = WeightedIndex(normalizedWeights, random);
                buckets[bucketIndex].Add(tree);
            }
            // Write each bucket to its own file.
            for (int i = 0; i < buckets.Count; ++i)
            {
                string       outName = Output + "." + SplitNames[i];
                IList <Tree> bucket  = buckets[i];
                logger.Info("Writing " + bucket.Count + " trees to " + outName);
                FileWriter     fileWriter = new FileWriter(outName);
                BufferedWriter writer     = new BufferedWriter(fileWriter);
                foreach (Tree t in bucket)
                {
                    writer.Write(t.ToString());
                    writer.NewLine();
                }
                writer.Close();
                fileWriter.Close();
            }
        }
        /// <summary>
        /// Converts an MNIST-style image/label file pair into a tab-separated feature
        /// file (label followed by one pixel value per column) plus a properties file
        /// describing the columns for a classifier.
        /// </summary>
        /// <param name="args">dataFile labelFile outFile propsFile (exactly four).</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                logger.Info("Usage: MnistConverter dataFile labelFile outFile propsFile");
                return;
            }
            DataInputStream xStream = IOUtils.GetDataInputStream(args[0]);
            DataInputStream yStream = IOUtils.GetDataInputStream(args[1]);
            PrintWriter     oStream = new PrintWriter(new FileWriter(args[2]));
            PrintWriter     pStream = new PrintWriter(new FileWriter(args[3]));
            // Magic-number handshake; 2051/2049 are presumably the IDX image/label
            // format markers -- the read order below is the file format, do not reorder.
            int             xMagic  = xStream.ReadInt();

            if (xMagic != 2051)
            {
                throw new Exception("Bad format of xStream");
            }
            int yMagic = yStream.ReadInt();

            if (yMagic != 2049)
            {
                throw new Exception("Bad format of yStream");
            }
            // Both files must describe the same number of examples.
            int xNumImages = xStream.ReadInt();
            int yNumLabels = yStream.ReadInt();

            if (xNumImages != yNumLabels)
            {
                throw new Exception("x and y sizes don't match");
            }
            logger.Info("Images and label file both contain " + xNumImages + " entries.");
            int xRows    = xStream.ReadInt();
            int xColumns = xStream.ReadInt();

            // Emit one line per image: label, then each pixel byte, tab-separated.
            for (int i = 0; i < xNumImages; i++)
            {
                int   label  = yStream.ReadUnsignedByte();
                int[] matrix = new int[xRows * xColumns];
                for (int j = 0; j < xRows * xColumns; j++)
                {
                    matrix[j] = xStream.ReadUnsignedByte();
                }
                oStream.Print(label);
                foreach (int k in matrix)
                {
                    oStream.Print('\t');
                    oStream.Print(k);
                }
                oStream.Println();
            }
            logger.Info("Converted.");
            xStream.Close();
            yStream.Close();
            oStream.Close();
            // number from 1; column 0 is the class
            pStream.Println("goldAnswerColumn = 0");
            pStream.Println("useClassFeature = true");
            pStream.Println("sigma = 10");
            // not optimized, but weak regularization seems appropriate when much data, few features
            for (int j_1 = 0; j_1 < xRows * xColumns; j_1++)
            {
                pStream.Println((j_1 + 1) + ".realValued = true");
            }
            pStream.Close();
        }
 /// <summary>
 /// <inheritDoc/>
 ///
 /// </summary>
 protected internal override void DoOneFailedSentence(Annotation annotation, ICoreMap sentence)
 {
     // Record which sentence text could not be annotated.
     var sentenceText = sentence.Get(typeof(CoreAnnotations.TextAnnotation));
     log.Info("Failed to annotate: " + sentenceText);
 }
Пример #16
0
        /// <summary>
        /// Recursively builds an answer lattice (Chinese words) from a Viterbi search graph
        /// of binary predictions.
        /// </summary>
        /// <remarks>
        /// Recursively builds an answer lattice (Chinese words) from a Viterbi search graph
        /// of binary predictions. This function does a limited amount of post-processing:
        /// preserve white spaces of the input, and not segment between two latin characters or
        /// between two digits. Consequently, the probabilities of all paths in answerLattice
        /// may not sum to 1 (they do sum to 1 if no post processing applies).
        /// </remarks>
        /// <param name="tSource">Current node in Viterbi search graph.</param>
        /// <param name="aSource">Current node in answer lattice.</param>
        /// <param name="answer">Partial word starting at aSource.</param>
        /// <param name="nodeId">Currently unused node identifier for answer graph.</param>
        /// <param name="pos">Current position in docArray.</param>
        /// <param name="cost">Current cost of answer.</param>
        /// <param name="stateLinks">
        /// Maps nodes of the search graph to nodes in answer lattice
        /// (when paths of the search graph are recombined, paths of the answer lattice should be
        /// recombined as well, if at word boundary).
        /// </param>
        private void TagLatticeToAnswerLattice(DFSAState <string, int> tSource, DFSAState <string, int> aSource, StringBuilder answer, MutableInteger nodeId, int pos, double cost, IDictionary <DFSAState <string, int>, DFSAState <string, int> > stateLinks,
                                               DFSA <string, int> answerLattice, CoreLabel[] docArray)
        {
            // NOTE(fix): the log calls in this method originally passed Java-style printf
            // specifiers ("%s", "%d", "%e", "%n") to string.Format. C#'s string.Format only
            // interprets "{n}" placeholders, so the messages were logged verbatim with all
            // their arguments dropped. They now use C# composite-format placeholders; the
            // Java "%n" line terminators were removed. No other statement was changed.
            // Add "1" prediction after the end of the sentence, if applicable:
            if (tSource.IsAccepting() && tSource.ContinuingInputs().IsEmpty())
            {
                tSource.AddTransition(new DFSATransition <string, int>(string.Empty, tSource, new DFSAState <string, int>(-1, null), "1", string.Empty, 0));
            }
            // Get current label, character, and prediction:
            CoreLabel curLabel  = (pos < docArray.Length) ? docArray[pos] : null;
            string    curChr    = null;
            string    origSpace = null;

            if (curLabel != null)
            {
                curChr = curLabel.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
                System.Diagnostics.Debug.Assert((curChr.Length == 1));
                origSpace = curLabel.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation));
            }
            // Get set of successors in search graph:
            ICollection <string> inputs = tSource.ContinuingInputs();
            // Only keep most probable transition out of initial state:
            string answerConstraint = null;

            if (pos == 0)
            {
                double minCost = double.PositiveInfinity;
                // DFSATransition<String, Integer> bestTransition = null;
                foreach (string predictSpace in inputs)
                {
                    DFSATransition <string, int> transition = tSource.Transition(predictSpace);
                    double transitionCost = transition.Score();
                    if (transitionCost < minCost)
                    {
                        if (predictSpace != null)
                        {
                            logger.Info(string.Format("mincost ({0}): {1:e} -> {2:e}", predictSpace, minCost, transitionCost));
                            minCost          = transitionCost;
                            answerConstraint = predictSpace;
                        }
                    }
                }
            }
            // Follow along each transition:
            foreach (string predictSpace_1 in inputs)
            {
                DFSATransition <string, int> transition = tSource.Transition(predictSpace_1);
                DFSAState <string, int>      tDest      = transition.Target();
                DFSAState <string, int>      newASource = aSource;
                //logger.info(String.format("tsource=%s tdest=%s asource=%s pos=%d predictSpace=%s%n", tSource, tDest, newASource, pos, predictSpace));
                StringBuilder newAnswer = new StringBuilder(answer.ToString());
                int           answerLen = newAnswer.Length;
                string        prevChr   = (answerLen > 0) ? newAnswer.Substring(answerLen - 1) : null;
                double        newCost   = cost;
                // Ignore paths starting with zero:
                if (answerConstraint != null && !answerConstraint.Equals(predictSpace_1))
                {
                    logger.Info(string.Format("Skipping transition {0} at pos 0.", predictSpace_1));
                    continue;
                }
                // Ignore paths not consistent with input segmentation:
                if (flags.keepAllWhitespaces && "0".Equals(predictSpace_1) && "1".Equals(origSpace))
                {
                    logger.Info(string.Format("Skipping non-boundary at pos {0}, since space in the input.", pos));
                    continue;
                }
                // Ignore paths adding segment boundaries between two latin characters, or between two digits:
                // (unless already present in original input)
                if ("1".Equals(predictSpace_1) && "0".Equals(origSpace) && prevChr != null && curChr != null)
                {
                    char p = prevChr[0];
                    char c = curChr[0];
                    if (ChineseStringUtils.IsLetterASCII(p) && ChineseStringUtils.IsLetterASCII(c))
                    {
                        logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two ASCII letters ({1} and {2}).", pos, prevChr, curChr));
                        continue;
                    }
                    if (ChineseUtils.IsNumber(p) && ChineseUtils.IsNumber(c))
                    {
                        logger.Info(string.Format("Not hypothesizing a boundary at pos {0}, since between two numeral characters ({1} and {2}).", pos, prevChr, curChr));
                        continue;
                    }
                }
                // If predictSpace==1, create a new transition in answer search graph:
                if ("1".Equals(predictSpace_1))
                {
                    if (newAnswer.ToString().Length > 0)
                    {
                        // If answer destination node visited before, create a new edge and leave:
                        if (stateLinks.Contains(tSource))
                        {
                            DFSAState <string, int> aDest = stateLinks[tSource];
                            newASource.AddTransition(new DFSATransition <string, int>(string.Empty, newASource, aDest, newAnswer.ToString(), string.Empty, newCost));
                            //logger.info(String.format("new transition: asource=%s adest=%s edge=%s%n", newASource, aDest, newAnswer));
                            continue;
                        }
                        // If answer destination node not visited before, create it + new edge:
                        nodeId.IncValue(1);
                        DFSAState <string, int> aDest_1 = new DFSAState <string, int>(nodeId, answerLattice, 0.0);
                        stateLinks[tSource] = aDest_1;
                        newASource.AddTransition(new DFSATransition <string, int>(string.Empty, newASource, aDest_1, newAnswer.ToString(), string.Empty, newCost));
                        //logger.info(String.format("new edge: adest=%s%n", newASource, aDest, newAnswer));
                        //logger.info(String.format("new transition: asource=%s adest=%s edge=%s%n%n%n", newASource, aDest, newAnswer));
                        // Reached an accepting state:
                        if (tSource.IsAccepting())
                        {
                            aDest_1.SetAccepting(true);
                            continue;
                        }
                        // Start new answer edge:
                        newASource = aDest_1;
                        newAnswer  = new StringBuilder();
                        newCost    = 0.0;
                    }
                }
                System.Diagnostics.Debug.Assert((curChr != null));
                newAnswer.Append(curChr);
                newCost += transition.Score();
                // Recurse only while the path is promising (below the prune threshold) or we
                // are inside a run of ASCII letters that must not be segmented.
                if (newCost < flags.searchGraphPrune || ChineseStringUtils.IsLetterASCII(curChr[0]))
                {
                    TagLatticeToAnswerLattice(tDest, newASource, newAnswer, nodeId, pos + 1, newCost, stateLinks, answerLattice, docArray);
                }
            }
        }
        // main method only
        /// <summary>
        /// Demo entry point: builds a treebank from -treeFile, from -sentFile (each line
        /// tokenized and parsed with the English PCFG), or from a built-in example tree,
        /// then prints the uncollapsed semantic graph for every tree (plus the collapsed
        /// graph in several output formats when -testGraph is "true").
        /// </summary>
        public static void Main(string[] args)
        {
            Treebank   tb           = new MemoryTreebank();
            Properties props        = StringUtils.ArgsToProperties(args);
            string     treeFileName = props.GetProperty("treeFile");
            string     sentFileName = props.GetProperty("sentFile");
            string     testGraph    = props.GetProperty("testGraph");

            if (testGraph == null)
            {
                testGraph = "false";
            }
            string load = props.GetProperty("load");
            string save = props.GetProperty("save");

            if (load != null)
            {
                log.Info("Load not implemented!");
                return;
            }
            if (sentFileName == null && treeFileName == null)
            {
                log.Info("Usage: java SemanticGraph [-sentFile file|-treeFile file] [-testGraph]");
                Tree t = Tree.ValueOf("(ROOT (S (NP (NP (DT An) (NN attempt)) (PP (IN on) (NP (NP (NNP Andres) (NNP Pastrana) (POS 's)) (NN life)))) (VP (VBD was) (VP (VBN carried) (PP (IN out) (S (VP (VBG using) (NP (DT a) (JJ powerful) (NN bomb))))))) (. .)))"
                                      );
                tb.Add(t);
            }
            else if (treeFileName != null)
            {
                tb.LoadPath(treeFileName);
            }
            else
            {
                string[]          options = new string[] { "-retainNPTmpSubcategories" };
                LexicalizedParser lp      = ((LexicalizedParser)LexicalizedParser.LoadModel("/u/nlp/data/lexparser/englishPCFG.ser.gz", options));
                BufferedReader    reader  = null;
                try
                {
                    reader = IOUtils.ReaderFromString(sentFileName);
                }
                catch (IOException e)
                {
                    throw new RuntimeIOException("Cannot find or open " + sentFileName, e);
                }
                try
                {
                    System.Console.Out.WriteLine("Processing sentence file " + sentFileName);
                    for (string line; (line = reader.ReadLine()) != null;)
                    {
                        System.Console.Out.WriteLine("Processing sentence: " + line);
                        PTBTokenizer <Word> ptb   = PTBTokenizer.NewPTBTokenizer(new StringReader(line));
                        IList <Word>        words = ptb.Tokenize();
                        Tree parseTree            = lp.ParseTree(words);
                        tb.Add(parseTree);
                    }
                }
                catch (Exception e)
                {
                    throw new Exception("Exception reading key file " + sentFileName, e);
                }
                finally
                {
                    // FIX: the reader was previously closed only on the success path inside
                    // the try block, leaking the handle whenever a sentence failed to parse.
                    reader.Close();
                }
            }
            foreach (Tree t_1 in tb)
            {
                SemanticGraph sg = SemanticGraphFactory.GenerateUncollapsedDependencies(t_1);
                System.Console.Out.WriteLine(sg.ToString());
                System.Console.Out.WriteLine(sg.ToCompactString());
                if (testGraph.Equals("true"))
                {
                    SemanticGraph g1 = SemanticGraphFactory.GenerateCollapsedDependencies(t_1);
                    System.Console.Out.WriteLine("TEST SEMANTIC GRAPH - graph ----------------------------");
                    System.Console.Out.WriteLine(g1.ToString());
                    System.Console.Out.WriteLine("readable ----------------------------");
                    System.Console.Out.WriteLine(g1.ToString(SemanticGraph.OutputFormat.Readable));
                    System.Console.Out.WriteLine("List of dependencies ----------------------------");
                    System.Console.Out.WriteLine(g1.ToList());
                    System.Console.Out.WriteLine("xml ----------------------------");
                    System.Console.Out.WriteLine(g1.ToString(SemanticGraph.OutputFormat.Xml));
                    System.Console.Out.WriteLine("dot ----------------------------");
                    System.Console.Out.WriteLine(g1.ToDotFormat());
                    System.Console.Out.WriteLine("dot (simple) ----------------------------");
                    System.Console.Out.WriteLine(g1.ToDotFormat("Simple", CoreLabel.OutputFormat.Value));
                }
            }
            // System.out.println(" graph ----------------------------");
            // System.out.println(t.allTypedDependenciesCCProcessed(false));
            if (save != null)
            {
                log.Info("Save not implemented!");
            }
        }
 /// <summary>Parse a sentence represented as a List of tokens.</summary>
 /// <remarks>
 /// Parse a sentence represented as a List of tokens.
 /// The text must already have been tokenized and
 /// normalized into tokens that are appropriate to the treebank
 /// which was used to train the parser.  The tokens can be of
 /// multiple types, and the list items need not be homogeneous as to type
 /// (in particular, only some words might be given tags):
 /// <ul>
 /// <li>If a token implements HasWord, then the word to be parsed is
 /// given by its word() value.</li>
 /// <li>If a token implements HasTag and the tag() value is not
 /// null or the empty String, then the parser is strongly advised to assign
 /// a part of speech tag that <i>begins</i> with this String.</li>
 /// </ul>
 /// </remarks>
 /// <param name="sentence">The sentence to parse</param>
 /// <returns>
 /// true Iff the sentence was accepted by the grammar.  If
 /// the main grammar fails, but the PCFG succeeds, then
 /// this still returns true, but parseFallback() will
 /// also return true.  getBestParse() will have a valid
 /// result iff this returns true.
 /// </returns>
 public virtual bool Parse <_T0>(IList <_T0> sentence)
     where _T0 : IHasWord
 {
     try
     {
         if (!ParseInternal(sentence))
         {
             // Main grammar failed; fall back to the PCFG parse when one is available.
             if (pparser != null && pparser.HasParse() && fallbackToPCFG)
             {
                 parseFallback = true;
                 return(true);
             }
             else
             {
                 parseUnparsable = true;
                 return(false);
             }
         }
         else
         {
             return(true);
         }
     }
     catch (OutOfMemoryException)
     {
         // (fix: dropped the unused exception variable that triggered a CS0168 warning)
         if (op.testOptions.maxLength != -unchecked ((int)(0xDEADBEEF)))
         {
             // this means they explicitly asked for a length they cannot handle.
             // Throw exception.  Avoid string concatenation before throw it.
             log.Info("NOT ENOUGH MEMORY TO PARSE SENTENCES OF LENGTH ");
             log.Info(op.testOptions.maxLength);
             throw;
         }
         // Length limit was the sentinel default: try to salvage a PCFG-only parse.
         if (pparser.HasParse() && fallbackToPCFG)
         {
             try
             {
                 whatFailed = "dependency";
                 if (dparser.HasParse())
                 {
                     whatFailed = "factored";
                 }
                 parseFallback = true;
                 return(true);
             }
             catch (OutOfMemoryException oome)
             {
                 Sharpen.Runtime.PrintStackTrace(oome);
                 parseNoMemory = true;
                 pparser.NudgeDownArraySize();
                 return(false);
             }
         }
         else
         {
             parseNoMemory = true;
             return(false);
         }
     }
     catch (NotSupportedException)
     {
         // Some parser configurations cannot handle this input; record and skip it.
         parseSkipped = true;
         return(false);
     }
 }
        public virtual bool Parse <_T0>(IList <_T0> sentence)
            where _T0 : IHasWord
        {
            if (op.testOptions.verbose)
            {
                Timing.Tick("Starting dependency parse.");
            }
            this.sentence = sentence;
            int length = sentence.Count;

            if (length > arraySize)
            {
                if (length > op.testOptions.maxLength + 1 || length >= myMaxLength)
                {
                    throw new OutOfMemoryException("Refusal to create such large arrays.");
                }
                else
                {
                    try
                    {
                        CreateArrays(length + 1);
                    }
                    catch (OutOfMemoryException e)
                    {
                        myMaxLength = length;
                        if (arraySize > 0)
                        {
                            try
                            {
                                CreateArrays(arraySize);
                            }
                            catch (OutOfMemoryException)
                            {
                                throw new Exception("CANNOT EVEN CREATE ARRAYS OF ORIGINAL SIZE!!! " + arraySize);
                            }
                        }
                        throw;
                    }
                    arraySize = length + 1;
                    if (op.testOptions.verbose)
                    {
                        log.Info("Created dparser arrays of size " + arraySize);
                    }
                }
            }
            if (op.testOptions.verbose)
            {
                log.Info("Initializing...");
            }
            // map to words
            words = new int[length];
            int numTags = dg.NumTagBins();

            //tagIndex.size();
            //System.out.println("\nNumTags: "+numTags);
            //System.out.println(tagIndex);
            bool[][] hasTag = new bool[length][];
            for (int i = 0; i < length; i++)
            {
                //if (wordIndex.contains(sentence.get(i).toString()))
                words[i] = wordIndex.AddToIndex(sentence[i].Word());
            }
            //else
            //words[i] = wordIndex.indexOf(Lexicon.UNKNOWN_WORD);
            for (int head = 0; head < length; head++)
            {
                for (int tag = 0; tag < numTags; tag++)
                {
                    Arrays.Fill(iScoreH[head][tag], float.NegativeInfinity);
                    Arrays.Fill(oScoreH[head][tag], float.NegativeInfinity);
                }
            }
            for (int head_1 = 0; head_1 < length; head_1++)
            {
                for (int loc = 0; loc <= length; loc++)
                {
                    rawDistance[head_1][loc] = (head_1 >= loc ? head_1 - loc : loc - head_1 - 1);
                    binDistance[head_1][loc] = dg.DistanceBin(rawDistance[head_1][loc]);
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            // do tags
            for (int start = 0; start + 1 <= length; start++)
            {
                //Force tags
                string trueTagStr = null;
                if (sentence[start] is IHasTag)
                {
                    trueTagStr = ((IHasTag)sentence[start]).Tag();
                    if (string.Empty.Equals(trueTagStr))
                    {
                        trueTagStr = null;
                    }
                }
                //Word context (e.g., morphosyntactic info)
                string wordContextStr = null;
                if (sentence[start] is IHasContext)
                {
                    wordContextStr = ((IHasContext)sentence[start]).OriginalText();
                    if (string.Empty.Equals(wordContextStr))
                    {
                        wordContextStr = null;
                    }
                }
                int word = words[start];
                for (IEnumerator <IntTaggedWord> taggingI = lex.RuleIteratorByWord(word, start, wordContextStr); taggingI.MoveNext();)
                {
                    IntTaggedWord tagging = taggingI.Current;
                    if (trueTagStr != null)
                    {
                        if (!tlp.BasicCategory(tagging.TagString(tagIndex)).Equals(trueTagStr))
                        {
                            continue;
                        }
                    }
                    float score = lex.Score(tagging, start, wordIndex.Get(tagging.word), wordContextStr);
                    //iScoreH[start][tag][start] = (op.dcTags ? (float)op.testOptions.depWeight*score : 0.0f);
                    if (score > float.NegativeInfinity)
                    {
                        int tag = tagging.tag;
                        iScoreH[start][dg.TagBin(tag)][start]     = 0.0f;
                        iScoreH[start][dg.TagBin(tag)][start + 1] = 0.0f;
                    }
                }
            }
            for (int hWord = 0; hWord < length; hWord++)
            {
                for (int hTag = 0; hTag < numTags; hTag++)
                {
                    hasTag[hWord][hTag] = (iScoreH[hWord][hTag][hWord] + iScoreH[hWord][hTag][hWord + 1] > float.NegativeInfinity);
                    Arrays.Fill(headStop[hWord][hTag], float.NegativeInfinity);
                    for (int aWord = 0; aWord < length; aWord++)
                    {
                        for (int dist = 0; dist < dg.NumDistBins(); dist++)
                        {
                            Arrays.Fill(headScore[dist][hWord][hTag][aWord], float.NegativeInfinity);
                        }
                    }
                }
            }
            // score and cache all pairs -- headScores and stops
            //int hit = 0;
            for (int hWord_1 = 0; hWord_1 < length; hWord_1++)
            {
                for (int hTag = 0; hTag < numTags; hTag++)
                {
                    //Arrays.fill(headStopL[hWord][hTag], Float.NEGATIVE_INFINITY);
                    //Arrays.fill(headStopR[hWord][hTag], Float.NEGATIVE_INFINITY);
                    //Arrays.fill(headStop[hWord][hTag], Float.NEGATIVE_INFINITY);
                    if (!hasTag[hWord_1][hTag])
                    {
                        continue;
                    }
                    for (int split = 0; split <= length; split++)
                    {
                        if (split <= hWord_1)
                        {
                            headStop[hWord_1][hTag][split] = (float)dg.ScoreTB(words[hWord_1], hTag, -2, -2, false, hWord_1 - split);
                        }
                        else
                        {
                            //System.out.println("headstopL " + hWord +" " + hTag + " " + split + " " + headStopL[hWord][hTag][split]); // debugging
                            headStop[hWord_1][hTag][split] = (float)dg.ScoreTB(words[hWord_1], hTag, -2, -2, true, split - hWord_1 - 1);
                        }
                    }
                    //System.out.println("headstopR " + hWord +" " + hTag + " " + split + " " + headStopR[hWord][hTag][split]); // debugging
                    //hit++;
                    //Timing.tick("hWord: "+hWord+" hTag: "+hTag+" piddle count: "+hit);
                    for (int aWord = 0; aWord < length; aWord++)
                    {
                        if (aWord == hWord_1)
                        {
                            continue;
                        }
                        // can't be argument of yourself
                        bool leftHeaded = hWord_1 < aWord;
                        int  start_1;
                        int  end;
                        if (leftHeaded)
                        {
                            start_1 = hWord_1 + 1;
                            end     = aWord + 1;
                        }
                        else
                        {
                            start_1 = aWord + 1;
                            end     = hWord_1 + 1;
                        }
                        for (int aTag = 0; aTag < numTags; aTag++)
                        {
                            if (!hasTag[aWord][aTag])
                            {
                                continue;
                            }
                            for (int split_1 = start_1; split_1 < end; split_1++)
                            {
                                // Moved this stuff out two loops- GMA
                                //              for (int split = 0; split <= length; split++) {
                                // if leftHeaded, go from hWord+1 to aWord
                                // else go from aWord+1 to hWord
                                //              if ((leftHeaded && (split <= hWord || split > aWord)) ||
                                //                      ((!leftHeaded) && (split <= aWord || split > hWord)))
                                //                continue;
                                int headDistance = rawDistance[hWord_1][split_1];
                                int binDist      = binDistance[hWord_1][split_1];
                                headScore[binDist][hWord_1][hTag][aWord][aTag] = (float)dg.ScoreTB(words[hWord_1], hTag, words[aWord], aTag, leftHeaded, headDistance);
                                //hit++;
                                // skip other splits with same binDist
                                while (split_1 + 1 < end && binDistance[hWord_1][split_1 + 1] == binDist)
                                {
                                    split_1++;
                                }
                            }
                        }
                    }
                }
            }
            // end split
            // end aTag
            // end aWord
            // end hTag
            // end hWord
            if (op.testOptions.verbose)
            {
                Timing.Tick("done.");
                // displayHeadScores();
                log.Info("Starting insides...");
            }
            // do larger spans
            for (int diff = 2; diff <= length; diff++)
            {
                if (Thread.Interrupted())
                {
                    throw new RuntimeInterruptedException();
                }
                for (int start_1 = 0; start_1 + diff <= length; start_1++)
                {
                    int end = start_1 + diff;
                    // left extension
                    int endHead = end - 1;
                    for (int endTag = 0; endTag < numTags; endTag++)
                    {
                        if (!hasTag[endHead][endTag])
                        {
                            continue;
                        }
                        // bestScore is max for iScoreH
                        float bestScore = float.NegativeInfinity;
                        for (int argHead = start_1; argHead < endHead; argHead++)
                        {
                            for (int argTag = 0; argTag < numTags; argTag++)
                            {
                                if (!hasTag[argHead][argTag])
                                {
                                    continue;
                                }
                                float argLeftScore = iScoreH[argHead][argTag][start_1];
                                if (argLeftScore == float.NegativeInfinity)
                                {
                                    continue;
                                }
                                float stopLeftScore = headStop[argHead][argTag][start_1];
                                if (stopLeftScore == float.NegativeInfinity)
                                {
                                    continue;
                                }
                                for (int split = argHead + 1; split < end; split++)
                                {
                                    // short circuit if dependency is impossible
                                    float depScore = headScore[binDistance[endHead][split]][endHead][endTag][argHead][argTag];
                                    if (depScore == float.NegativeInfinity)
                                    {
                                        continue;
                                    }
                                    float score = iScoreH[endHead][endTag][split] + argLeftScore + iScoreH[argHead][argTag][split] + depScore + stopLeftScore + headStop[argHead][argTag][split];
                                    if (score > bestScore)
                                    {
                                        bestScore = score;
                                    }
                                }
                            }
                        }
                        // end for split
                        // sum for iScoreHSum
                        // end for argTag : tags
                        // end for argHead
                        iScoreH[endHead][endTag][start_1] = bestScore;
                    }
                    // end for endTag : tags
                    // right extension
                    int startHead = start_1;
                    for (int startTag = 0; startTag < numTags; startTag++)
                    {
                        if (!hasTag[startHead][startTag])
                        {
                            continue;
                        }
                        // bestScore is max for iScoreH
                        float bestScore = float.NegativeInfinity;
                        for (int argHead = start_1 + 1; argHead < end; argHead++)
                        {
                            for (int argTag = 0; argTag < numTags; argTag++)
                            {
                                if (!hasTag[argHead][argTag])
                                {
                                    continue;
                                }
                                float argRightScore = iScoreH[argHead][argTag][end];
                                if (argRightScore == float.NegativeInfinity)
                                {
                                    continue;
                                }
                                float stopRightScore = headStop[argHead][argTag][end];
                                if (stopRightScore == float.NegativeInfinity)
                                {
                                    continue;
                                }
                                for (int split = start_1 + 1; split <= argHead; split++)
                                {
                                    // short circuit if dependency is impossible
                                    float depScore = headScore[binDistance[startHead][split]][startHead][startTag][argHead][argTag];
                                    if (depScore == float.NegativeInfinity)
                                    {
                                        continue;
                                    }
                                    float score = iScoreH[startHead][startTag][split] + iScoreH[argHead][argTag][split] + argRightScore + depScore + stopRightScore + headStop[argHead][argTag][split];
                                    if (score > bestScore)
                                    {
                                        bestScore = score;
                                    }
                                }
                            }
                        }
                        // sum for iScoreHSum
                        // end for argTag: tags
                        // end for argHead
                        iScoreH[startHead][startTag][end] = bestScore;
                    }
                }
            }
            // end for startTag: tags
            // end for start
            // end for diff (i.e., span)
            int goalTag = dg.TagBin(tagIndex.IndexOf(LexiconConstants.BoundaryTag));

            if (op.testOptions.verbose)
            {
                Timing.Tick("done.");
                log.Info("Dep  parsing " + length + " words (incl. stop): insideScore " + (iScoreH[length - 1][goalTag][0] + iScoreH[length - 1][goalTag][length]));
            }
            if (!op.doPCFG)
            {
                return(HasParse());
            }
            if (op.testOptions.verbose)
            {
                log.Info("Starting outsides...");
            }
            oScoreH[length - 1][goalTag][0]      = 0.0f;
            oScoreH[length - 1][goalTag][length] = 0.0f;
            for (int diff_1 = length; diff_1 > 1; diff_1--)
            {
                if (Thread.Interrupted())
                {
                    throw new RuntimeInterruptedException();
                }
                for (int start_1 = 0; start_1 + diff_1 <= length; start_1++)
                {
                    int end = start_1 + diff_1;
                    // left half
                    int endHead = end - 1;
                    for (int endTag = 0; endTag < numTags; endTag++)
                    {
                        if (!hasTag[endHead][endTag])
                        {
                            continue;
                        }
                        for (int argHead = start_1; argHead < endHead; argHead++)
                        {
                            for (int argTag = 0; argTag < numTags; argTag++)
                            {
                                if (!hasTag[argHead][argTag])
                                {
                                    continue;
                                }
                                for (int split = argHead; split <= endHead; split++)
                                {
                                    float subScore   = (oScoreH[endHead][endTag][start_1] + headScore[binDistance[endHead][split]][endHead][endTag][argHead][argTag] + headStop[argHead][argTag][start_1] + headStop[argHead][argTag][split]);
                                    float scoreRight = (subScore + iScoreH[argHead][argTag][start_1] + iScoreH[argHead][argTag][split]);
                                    float scoreMid   = (subScore + iScoreH[argHead][argTag][start_1] + iScoreH[endHead][endTag][split]);
                                    float scoreLeft  = (subScore + iScoreH[argHead][argTag][split] + iScoreH[endHead][endTag][split]);
                                    if (scoreRight > oScoreH[endHead][endTag][split])
                                    {
                                        oScoreH[endHead][endTag][split] = scoreRight;
                                    }
                                    if (scoreMid > oScoreH[argHead][argTag][split])
                                    {
                                        oScoreH[argHead][argTag][split] = scoreMid;
                                    }
                                    if (scoreLeft > oScoreH[argHead][argTag][start_1])
                                    {
                                        oScoreH[argHead][argTag][start_1] = scoreLeft;
                                    }
                                }
                            }
                        }
                    }
                    // right half
                    int startHead = start_1;
                    for (int startTag = 0; startTag < numTags; startTag++)
                    {
                        if (!hasTag[startHead][startTag])
                        {
                            continue;
                        }
                        for (int argHead = startHead + 1; argHead < end; argHead++)
                        {
                            for (int argTag = 0; argTag < numTags; argTag++)
                            {
                                if (!hasTag[argHead][argTag])
                                {
                                    continue;
                                }
                                for (int split = startHead + 1; split <= argHead; split++)
                                {
                                    float subScore   = (oScoreH[startHead][startTag][end] + headScore[binDistance[startHead][split]][startHead][startTag][argHead][argTag] + headStop[argHead][argTag][split] + headStop[argHead][argTag][end]);
                                    float scoreLeft  = (subScore + iScoreH[argHead][argTag][split] + iScoreH[argHead][argTag][end]);
                                    float scoreMid   = (subScore + iScoreH[startHead][startTag][split] + iScoreH[argHead][argTag][end]);
                                    float scoreRight = (subScore + iScoreH[startHead][startTag][split] + iScoreH[argHead][argTag][split]);
                                    if (scoreLeft > oScoreH[startHead][startTag][split])
                                    {
                                        oScoreH[startHead][startTag][split] = scoreLeft;
                                    }
                                    if (scoreMid > oScoreH[argHead][argTag][split])
                                    {
                                        oScoreH[argHead][argTag][split] = scoreMid;
                                    }
                                    if (scoreRight > oScoreH[argHead][argTag][end])
                                    {
                                        oScoreH[argHead][argTag][end] = scoreRight;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            if (op.testOptions.verbose)
            {
                Timing.Tick("done.");
                log.Info("Starting half-filters...");
            }
            for (int loc_1 = 0; loc_1 <= length; loc_1++)
            {
                for (int head_2 = 0; head_2 < length; head_2++)
                {
                    Arrays.Fill(iPossibleByL[loc_1][head_2], false);
                    Arrays.Fill(iPossibleByR[loc_1][head_2], false);
                    Arrays.Fill(oPossibleByL[loc_1][head_2], false);
                    Arrays.Fill(oPossibleByR[loc_1][head_2], false);
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            for (int head_3 = 0; head_3 < length; head_3++)
            {
                for (int tag = 0; tag < numTags; tag++)
                {
                    if (!hasTag[head_3][tag])
                    {
                        continue;
                    }
                    for (int start_1 = 0; start_1 <= head_3; start_1++)
                    {
                        for (int end = head_3 + 1; end <= length; end++)
                        {
                            if (iScoreH[head_3][tag][start_1] + iScoreH[head_3][tag][end] > float.NegativeInfinity && oScoreH[head_3][tag][start_1] + oScoreH[head_3][tag][end] > float.NegativeInfinity)
                            {
                                iPossibleByR[end][head_3][tag]     = true;
                                iPossibleByL[start_1][head_3][tag] = true;
                                oPossibleByR[end][head_3][tag]     = true;
                                oPossibleByL[start_1][head_3][tag] = true;
                            }
                        }
                    }
                }
            }
            if (op.testOptions.verbose)
            {
                Timing.Tick("done.");
            }
            return(HasParse());
        }
Пример #20
0
        /// <summary>
        /// Builds the clique features for the (previous char, current char) pair at
        /// position <paramref name="loc"/> in the padded character sequence: character
        /// n-grams, radical n-grams, dictionary lookups, tag-affix features, rule-based
        /// number/English/punctuation features, and a character-type feature.
        /// </summary>
        /// <param name="cInfo">padded list of per-character CoreLabels; out-of-range positions carry null CharAnnotations</param>
        /// <param name="loc">index of the current character in <paramref name="cInfo"/></param>
        /// <returns>the list of feature strings fired at this position</returns>
        public virtual ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
        {
            ICollection <string> features = new List <string>();
            // window of characters around loc: c = current, c1..c3 = next, p..p3 = previous
            CoreLabel            c        = cInfo[loc];
            CoreLabel            c1       = cInfo[loc + 1];
            CoreLabel            c2       = cInfo[loc + 2];
            CoreLabel            c3       = cInfo[loc + 3];
            CoreLabel            p        = cInfo[loc - 1];
            CoreLabel            p2       = cInfo[loc - 2];
            CoreLabel            p3       = cInfo[loc - 3];
            // normalize missing characters (padding) to the empty string
            string charc = c.Get(typeof(CoreAnnotations.CharAnnotation));

            if (charc == null)
            {
                charc = string.Empty;
            }
            string charc1 = c1.Get(typeof(CoreAnnotations.CharAnnotation));

            if (charc1 == null)
            {
                charc1 = string.Empty;
            }
            string charc2 = c2.Get(typeof(CoreAnnotations.CharAnnotation));

            if (charc2 == null)
            {
                charc2 = string.Empty;
            }
            string charc3 = c3.Get(typeof(CoreAnnotations.CharAnnotation));

            if (charc3 == null)
            {
                charc3 = string.Empty;
            }
            string charp = p.Get(typeof(CoreAnnotations.CharAnnotation));

            if (charp == null)
            {
                charp = string.Empty;
            }
            string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

            if (charp2 == null)
            {
                charp2 = string.Empty;
            }
            string charp3 = p3.Get(typeof(CoreAnnotations.CharAnnotation));

            if (charp3 == null)
            {
                charp3 = string.Empty;
            }

            /*
             * N-gram features. N is upto 2.
             */
            if (flags.useWord2)
            {
                // features.add(charc +"c");
                // features.add(charc1+"c1");
                // features.add(charp +"p");
                // features.add(charp +charc  +"pc");
                // if( flags.useMsr ){
                //   features.add(charc +charc1 +"cc1");
                //   features.add(charp + charc1 +"pc1");
                // }
                features.Add(charc + "::c");
                features.Add(charc1 + "::c1");
                features.Add(charp + "::p");
                features.Add(charp2 + "::p2");
                // trying to restore the features that Huishin described in SIGHAN 2005 paper
                features.Add(charc + charc1 + "::cn");
                features.Add(charp + charc + "::pc");
                features.Add(charp + charc1 + "::pn");
                features.Add(charp2 + charp + "::p2p");
                features.Add(charp2 + charc + "::p2c");
                features.Add(charc2 + charc + "::n2c");
                features.Add("|word2");
            }

            /*
             * Radical N-gram features. N is upto 4.
             * Smoothing method of N-gram, because there are too many characters in Chinese.
             * (It works better than N-gram when they are used individually. less sparse)
             */
            // 'n' marks a missing (padded) character.  NOTE(review): rcharc2, rcharc3,
            // rcharp2 and rcharp3 are computed but only rcharc/rcharc1/rcharp are read
            // below (plus rcharp in the useRule2 section); the unused radicals are kept
            // in case other feature flags elsewhere rely on this computation pattern.
            // (The previously declared `rcharp1` was never assigned or read and has been removed.)
            char rcharc;
            char rcharc1;
            char rcharc2;
            char rcharc3;
            char rcharp;
            char rcharp2;
            char rcharp3;

            if (charc.Length == 0)
            {
                rcharc = 'n';
            }
            else
            {
                rcharc = RadicalMap.GetRadical(charc[0]);
            }
            if (charc1.Length == 0)
            {
                rcharc1 = 'n';
            }
            else
            {
                rcharc1 = RadicalMap.GetRadical(charc1[0]);
            }
            if (charc2.Length == 0)
            {
                rcharc2 = 'n';
            }
            else
            {
                rcharc2 = RadicalMap.GetRadical(charc2[0]);
            }
            if (charc3.Length == 0)
            {
                rcharc3 = 'n';
            }
            else
            {
                rcharc3 = RadicalMap.GetRadical(charc3[0]);
            }
            if (charp.Length == 0)
            {
                rcharp = 'n';
            }
            else
            {
                rcharp = RadicalMap.GetRadical(charp[0]);
            }
            if (charp2.Length == 0)
            {
                rcharp2 = 'n';
            }
            else
            {
                rcharp2 = RadicalMap.GetRadical(charp2[0]);
            }
            if (charp3.Length == 0)
            {
                rcharp3 = 'n';
            }
            else
            {
                rcharp3 = RadicalMap.GetRadical(charp3[0]);
            }
            if (flags.useRad2)
            {
                features.Add(rcharc + "rc");
                features.Add(rcharc1 + "rc1");
                features.Add(rcharp + "rp");
                features.Add(rcharp + rcharc + "rpc");
                features.Add(rcharc + rcharc1 + "rcc1");
                features.Add(rcharp + rcharc + rcharc1 + "rpcc1");
                features.Add("|rad2");
            }
            /* non-word dictionary:SEEM bi-gram marked as non-word */
            if (flags.useDict2)
            {
                // NOTE(review): a new NonDict2 is constructed on every call — confirm
                // whether this is cheap (a lookup wrapper) or should be cached like taDetector.
                NonDict2 nd = new NonDict2(flags);
                features.Add(nd.CheckDic(charp + charc, flags) + "nondict");
                features.Add("|useDict2");
            }
            if (flags.useOutDict2)
            {
                // lazily load the external "seen lexicon" once
                if (outDict == null)
                {
                    logger.Info("reading " + flags.outDict2 + " as a seen lexicon");
                    outDict = new CorpusDictionary(flags.outDict2, true);
                }
                features.Add(outDict.GetW(charp + charc) + "outdict");
                // -1 0
                features.Add(outDict.GetW(charc + charc1) + "outdict");
                // 0 1
                features.Add(outDict.GetW(charp2 + charp) + "outdict");
                // -2 -1
                features.Add(outDict.GetW(charp2 + charp + charc) + "outdict");
                // -2 -1 0
                features.Add(outDict.GetW(charp3 + charp2 + charp) + "outdict");
                // -3 -2 -1
                features.Add(outDict.GetW(charp + charc + charc1) + "outdict");
                // -1 0 1
                features.Add(outDict.GetW(charc + charc1 + charc2) + "outdict");
                // 0 1 2
                features.Add(outDict.GetW(charp + charc + charc1 + charc2) + "outdict");
            }
            // -1 0 1 2

            /*
             * (CTB/ASBC/HK/PK/MSR) POS information of each characters.
             * If a character falls into some function categories,
             * it is very likely there is a boundary.
             * A lot of Chinese function words belong to single characters.
             * This feature is also good for numbers and punctuations.
             * DE* are grouped into DE.
             */
            if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2 || flags.usePKChar2 || flags.useMSRChar2)
            {
                string[] tagsets;
                // the "useChPos" now only works for CTB and PK
                if (flags.useChPos)
                {
                    if (flags.useCTBChar2)
                    {
                        tagsets = new string[] { "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
                    }
                    else
                    {
                        if (flags.usePKChar2)
                        {
                            //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
                            tagsets = new string[] { "2", "3", "4" };
                        }
                        else
                        {
                            throw new Exception("only support settings for CTB and PK now.");
                        }
                    }
                }
                else
                {
                    //logger.info("Using Derived features");
                    tagsets = new string[] { "2", "3", "4" };
                }
                // lazily build the tag-affix detector once
                if (taDetector == null)
                {
                    taDetector = new TagAffixDetector(flags);
                }
                foreach (string tagset in tagsets)
                {
                    features.Add(taDetector.CheckDic(tagset + "p", charp) + taDetector.CheckDic(tagset + "i", charp) + taDetector.CheckDic(tagset + "s", charc) + taDetector.CheckInDic(charp) + taDetector.CheckInDic(charc) + tagset + "prep-sufc");
                }
            }
            // features.add("|ctbchar2");  // Added a constant feature several times!!

            /*
             * In error analysis, we found English words and numbers are often separated.
             * Rule 1: isNumber feature: check if the current and previous char is a number.
             * Rule 2: Disambiguation of time point and time duration.
             * Rule 3: isEnglish feature: check if the current and previous character is an english letter.
             * Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
             * Most of PUs are a good indicator for word boundary, but - and .  is a strong indicator that there is no boundry within a previous , a follow char and it.
             */
            if (flags.useRule2)
            {
                /* Reduplication features */
                // previous character == current character
                if (charp.Equals(charc))
                {
                    features.Add("11");
                }
                // previous character == next character
                if (charp.Equals(charc1))
                {
                    features.Add("22");
                }
                // current character == next next character
                // fire only when usePk and useHk are both false.
                // Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
                if (!flags.usePk && !flags.useHk)
                {
                    if (charc.Equals(charc2))
                    {
                        features.Add("33");
                    }
                }
                char cur1 = ' ';
                char cur2 = ' ';
                char cur  = ' ';
                char pre  = ' ';
                // actually their length must be either 0 or 1
                if (charc1.Length > 0)
                {
                    cur1 = charc1[0];
                }
                if (charc2.Length > 0)
                {
                    cur2 = charc2[0];
                }
                if (charc.Length > 0)
                {
                    cur = charc[0];
                }
                if (charp.Length > 0)
                {
                    pre = charp[0];
                }
                string prer = rcharp.ToString();
                // the radical of previous character
                Pattern E  = Pattern.Compile("[a-zA-Z]");
                Pattern N  = Pattern.Compile("[0-9]");
                Matcher m  = E.Matcher(charp);
                Matcher ce = E.Matcher(charc);
                Matcher pe = E.Matcher(charp2);
                Matcher cn = N.Matcher(charc);
                Matcher pn = N.Matcher(charp2);
                // if current and previous characters are numbers...
                if (cur >= '0' && cur <= '9' && pre >= '0' && pre <= '9')
                {
                    if (cur == '9' && pre == '1' && cur1 == '9' && cur2 >= '0' && cur2 <= '9')
                    {
                        //199x
                        features.Add("YR");
                    }
                    else
                    {
                        features.Add("2N");
                    }
                }
                else
                {
                    // if current and previous characters are not both numbers
                    // but previous char is a number
                    // i.e. patterns like "1N" , "2A", etc
                    if (pre >= '0' && pre <= '9')
                    {
                        features.Add("1N");
                    }
                    else
                    {
                        // if previous character is an English character
                        if (m.Matches())
                        {
                            features.Add("E");
                        }
                        else
                        {
                            // if the previous character contains no radical (and it exist)
                            if (prer.Equals(".") && charp.Length == 1)
                            {
                                // fire only when usePk and useHk are both false. Not sure why. -pichuan
                                if (!flags.useHk && !flags.usePk)
                                {
                                    if (ce.Matches())
                                    {
                                        features.Add("PU+E");
                                    }
                                    if (pe.Matches())
                                    {
                                        features.Add("E+PU");
                                    }
                                    if (cn.Matches())
                                    {
                                        features.Add("PU+N");
                                    }
                                    if (pn.Matches())
                                    {
                                        features.Add("N+PU");
                                    }
                                }
                                features.Add("PU");
                            }
                        }
                    }
                }
                string engType = IsEnglish(charp, charc);
                string engPU   = IsEngPU(charp);
                if (!engType.Equals(string.Empty))
                {
                    features.Add(engType);
                }
                if (!engPU.Equals(string.Empty) && !engType.Equals(string.Empty))
                {
                    features.Add(engPU + engType);
                }
            }
            //end of use rule
            // features using "Character.getType" information!
            string origS = c.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
            char   origC = ' ';

            if (origS.Length > 0)
            {
                origC = origS[0];
            }
            int type = char.GetType(origC);

            switch (type)
            {
            case char.UppercaseLetter:
            case char.LowercaseLetter:
            {
                // A-Z and full-width A-Z
                // a-z and full-width a-z
                features.Add("CHARTYPE-LETTER");
                break;
            }

            case char.DecimalDigitNumber:
            {
                features.Add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
                break;
            }

            case char.OtherLetter:
            {
                // mostly chinese chars
                features.Add("CHARTYPE-OTHER_LETTER");
                break;
            }

            default:
            {
                // other types
                features.Add("CHARTYPE-MISC");
                break;
            }
            }
            return(features);
        }
Пример #21
0
        /// <summary>
        /// Loads the raw word vectors from op.lexOptions.wordVectorFile into wordVectors
        /// (applying op.wordFunction to each key when one is configured) and builds
        /// averaged vectors for several unknown-word classes — numbers, capitalized
        /// words, Chinese years / numbers / percents — for whichever classes are enabled
        /// in op.trainOptions.  A class that matched no words falls back to a copy of
        /// the generic unknown-word vector.
        /// </summary>
        /// <exception cref="System.Exception">if the vector file has no entry for op.trainOptions.unkWord</exception>
        public virtual void ReadWordVectors()
        {
            // running sums for each unknown-word class; divided by the match count at the end
            SimpleMatrix unknownNumberVector         = null;
            SimpleMatrix unknownCapsVector           = null;
            SimpleMatrix unknownChineseYearVector    = null;
            SimpleMatrix unknownChineseNumberVector  = null;
            SimpleMatrix unknownChinesePercentVector = null;

            wordVectors = Generics.NewTreeMap();
            int numberCount         = 0;
            int capsCount           = 0;
            int chineseYearCount    = 0;
            int chineseNumberCount  = 0;
            int chinesePercentCount = 0;
            //Map<String, SimpleMatrix> rawWordVectors = NeuralUtils.readRawWordVectors(op.lexOptions.wordVectorFile, op.lexOptions.numHid);
            Embedding rawWordVectors = new Embedding(op.lexOptions.wordVectorFile, op.lexOptions.numHid);

            foreach (string rawWord in rawWordVectors.KeySet())
            {
                SimpleMatrix vector = rawWordVectors.Get(rawWord);
                // BUGFIX: the translated Java code assigned to the foreach iteration
                // variable (`word = op.wordFunction.Apply(word)`), which C# forbids
                // (CS1656); transform into a separate local instead.
                string word = (op.wordFunction != null) ? op.wordFunction.Apply(rawWord) : rawWord;
                wordVectors[word] = vector;
                if (op.lexOptions.numHid <= 0)
                {
                    // infer the hidden dimension from the first vector seen
                    op.lexOptions.numHid = vector.GetNumElements();
                }
                // TODO: factor out all of these identical blobs
                if (op.trainOptions.unknownNumberVector && (NumberPattern.Matcher(word).Matches() || DgPattern.Matcher(word).Matches()))
                {
                    ++numberCount;
                    if (unknownNumberVector == null)
                    {
                        unknownNumberVector = new SimpleMatrix(vector);
                    }
                    else
                    {
                        unknownNumberVector = unknownNumberVector.Plus(vector);
                    }
                }
                if (op.trainOptions.unknownCapsVector && CapsPattern.Matcher(word).Matches())
                {
                    ++capsCount;
                    if (unknownCapsVector == null)
                    {
                        unknownCapsVector = new SimpleMatrix(vector);
                    }
                    else
                    {
                        unknownCapsVector = unknownCapsVector.Plus(vector);
                    }
                }
                if (op.trainOptions.unknownChineseYearVector && ChineseYearPattern.Matcher(word).Matches())
                {
                    ++chineseYearCount;
                    if (unknownChineseYearVector == null)
                    {
                        unknownChineseYearVector = new SimpleMatrix(vector);
                    }
                    else
                    {
                        unknownChineseYearVector = unknownChineseYearVector.Plus(vector);
                    }
                }
                if (op.trainOptions.unknownChineseNumberVector && (ChineseNumberPattern.Matcher(word).Matches() || DgPattern.Matcher(word).Matches()))
                {
                    ++chineseNumberCount;
                    if (unknownChineseNumberVector == null)
                    {
                        unknownChineseNumberVector = new SimpleMatrix(vector);
                    }
                    else
                    {
                        unknownChineseNumberVector = unknownChineseNumberVector.Plus(vector);
                    }
                }
                if (op.trainOptions.unknownChinesePercentVector && ChinesePercentPattern.Matcher(word).Matches())
                {
                    ++chinesePercentCount;
                    if (unknownChinesePercentVector == null)
                    {
                        unknownChinesePercentVector = new SimpleMatrix(vector);
                    }
                    else
                    {
                        unknownChinesePercentVector = unknownChinesePercentVector.Plus(vector);
                    }
                }
            }
            string unkWord = op.trainOptions.unkWord;

            if (op.wordFunction != null)
            {
                unkWord = op.wordFunction.Apply(unkWord);
            }
            // NOTE(review): this assumes the map indexer returns null for a missing key
            // (Java Map.get semantics); confirm the type returned by Generics.NewTreeMap
            // does not throw on a missing key instead.
            SimpleMatrix unknownWordVector = wordVectors[unkWord];

            wordVectors[UnknownWord] = unknownWordVector;
            if (unknownWordVector == null)
            {
                throw new Exception("Unknown word vector not specified in the word vector file");
            }
            if (op.trainOptions.unknownNumberVector)
            {
                if (numberCount > 0)
                {
                    unknownNumberVector = unknownNumberVector.Divide(numberCount);
                }
                else
                {
                    unknownNumberVector = new SimpleMatrix(unknownWordVector);
                }
                wordVectors[UnknownNumber] = unknownNumberVector;
            }
            if (op.trainOptions.unknownCapsVector)
            {
                if (capsCount > 0)
                {
                    unknownCapsVector = unknownCapsVector.Divide(capsCount);
                }
                else
                {
                    unknownCapsVector = new SimpleMatrix(unknownWordVector);
                }
                wordVectors[UnknownCaps] = unknownCapsVector;
            }
            if (op.trainOptions.unknownChineseYearVector)
            {
                log.Info("Matched " + chineseYearCount + " chinese year vectors");
                if (chineseYearCount > 0)
                {
                    unknownChineseYearVector = unknownChineseYearVector.Divide(chineseYearCount);
                }
                else
                {
                    unknownChineseYearVector = new SimpleMatrix(unknownWordVector);
                }
                wordVectors[UnknownChineseYear] = unknownChineseYearVector;
            }
            if (op.trainOptions.unknownChineseNumberVector)
            {
                log.Info("Matched " + chineseNumberCount + " chinese number vectors");
                if (chineseNumberCount > 0)
                {
                    unknownChineseNumberVector = unknownChineseNumberVector.Divide(chineseNumberCount);
                }
                else
                {
                    unknownChineseNumberVector = new SimpleMatrix(unknownWordVector);
                }
                wordVectors[UnknownChineseNumber] = unknownChineseNumberVector;
            }
            if (op.trainOptions.unknownChinesePercentVector)
            {
                log.Info("Matched " + chinesePercentCount + " chinese percent vectors");
                if (chinesePercentCount > 0)
                {
                    unknownChinesePercentVector = unknownChinesePercentVector.Divide(chinesePercentCount);
                }
                else
                {
                    unknownChinesePercentVector = new SimpleMatrix(unknownWordVector);
                }
                wordVectors[UnknownChinesePercent] = unknownChinesePercentVector;
            }
            if (op.trainOptions.useContextWords)
            {
                // random sentence-boundary vectors in [-0.5, 0.5]
                SimpleMatrix start = SimpleMatrix.Random(op.lexOptions.numHid, 1, -0.5, 0.5, rand);
                SimpleMatrix end   = SimpleMatrix.Random(op.lexOptions.numHid, 1, -0.5, 0.5, rand);
                wordVectors[StartWord] = start;
                wordVectors[EndWord]   = end;
            }
        }
 /// <summary>Logs the current TueBaDZParserParams option settings.</summary>
 public override void Display()
 {
     string settings = "TueBaDZParserParams nodeCleanup=" + nodeCleanup + " mKonjParent=" + markKonjParent + " mContainsV=" + markContainsV + " mZu=" + markZu + " mColons=" + markColons;
     log.Info(settings);
 }
        // 0=MinErr  1=Bradley
        /// <summary>
        /// Searches for a good fixed gain by repeatedly minimizing the objective with
        /// ever-smaller candidate gains (each round divides by 1.2) and keeping the gain
        /// that produced the lowest value.  The search stops the first time a round comes
        /// out worse than the previous one.
        /// </summary>
        /// <param name="function">objective to tune against; must be an AbstractStochasticCachingDiffFunction</param>
        /// <param name="initial">starting point; copied afresh before every trial run</param>
        /// <param name="msPerTest">time budget in milliseconds for each trial minimization</param>
        /// <param name="fixedStart">first candidate value for the fixed gain</param>
        /// <returns>the best fixed gain found (0.0 if no round ever improved)</returns>
        public virtual double TuneFixedGain(IFunction function, double[] initial, long msPerTest, double fixedStart)
        {
            double[] trialX    = new double[initial.Length];
            double   bestGain  = 0.0;
            double   shrink    = 1.2;
            double   bestValue = double.PositiveInfinity;

            this.maxTime = msPerTest;
            double lastValue = double.PositiveInfinity;

            // only stochastic objectives expose the per-batch machinery this tuner needs
            if (!(function is AbstractStochasticCachingDiffFunction))
            {
                throw new NotSupportedException();
            }
            AbstractStochasticCachingDiffFunction stochastic = (AbstractStochasticCachingDiffFunction)function;
            int    round     = 1;
            bool   keepGoing = true;
            double candidate = fixedStart;

            do
            {
                // every trial restarts from the same initial point
                System.Array.Copy(initial, 0, trialX, 0, initial.Length);
                log.Info(string.Empty);
                this.fixedGain = candidate;
                log.Info("Testing with batchsize: " + bSize + "    gain:  " + gain + "  fixedGain:  " + nf.Format(fixedGain));
                this.numPasses = 10000;
                this.Minimize(function, 1e-100, trialX);
                double value = stochastic.ValueAt(trialX);
                if (round == 1)
                {
                    // the first round always shrinks the candidate once here, and may
                    // shrink it again below if the result improved
                    candidate = candidate / shrink;
                }
                if (value < bestValue)
                {
                    // new best: record it and keep shrinking
                    bestValue = value;
                    bestGain  = this.fixedGain;
                    candidate = candidate / shrink;
                    lastValue = value;
                }
                else if (value < lastValue)
                {
                    // not a new best, but still improving round-over-round
                    candidate = candidate / shrink;
                    lastValue = value;
                }
                else if (value > lastValue)
                {
                    // got worse: stop the search
                    keepGoing = false;
                }
                round += 1;
                log.Info(string.Empty);
                log.Info("Final value is: " + nf.Format(value));
                log.Info("Optimal so far is:  fixedgain: " + bestGain);
            }while (keepGoing);
            return(bestGain);
        }
Пример #24
0
 // Logging to file facilities.
 // The prefix is used to append stuff in front of the logging messages
 /// <summary>Routes Redwood logging to stderr (all channels shown) and to the given file.</summary>
 /// <exception cref="System.IO.IOException"/>
 public virtual void InitLog(File logFilePath)
 {
     var consoleHandler = RedwoodConfiguration.Handlers.Chain(RedwoodConfiguration.Handlers.ShowAllChannels(), RedwoodConfiguration.Handlers.stderr);
     var fileHandler = RedwoodConfiguration.Handlers.File(logFilePath.ToString());
     RedwoodConfiguration.Empty().Handlers(consoleHandler, fileHandler).Apply();
     // fh.setFormatter(new NewlineLogFormatter());
     string dateStamp = DateFormat.GetDateInstance(DateFormat.Full).Format(new DateTime());
     System.Console.Out.WriteLine("Starting Ssurgeon log, at " + logFilePath.GetAbsolutePath() + " date=" + dateStamp);
     log.Info("Starting Ssurgeon log, date=" + dateStamp);
 }
 /// <summary>Logs a completion message for <paramref name="msg"/> with the elapsed time appended.</summary>
 public virtual void Done(Redwood.RedwoodChannels logger, string msg)
 {
     string elapsed = ToSecondsString();
     logger.Info(msg + " ... done [" + elapsed + " sec].");
 }
Пример #26
0
        /// <summary>
        /// Reads one parse tree per line from stdin and rewrites each leaf to carry a
        /// lemma and a morphological tag taken from two parallel files given on the
        /// command line ([0] = morpho analyses, [1] = lemmas). Annotated trees are
        /// printed to stdout; a tree count goes to stderr.
        /// </summary>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            Properties options         = StringUtils.ArgsToProperties(args, ArgSpec());
            string     encoding        = options.GetProperty("e", "UTF-8");
            bool       isMorphTreeFile = PropertiesUtils.GetBool(options, "g", false);

            // Remaining positional arguments must be exactly: morpho file, lemma file.
            string[] parsedArgs = options.GetProperty(string.Empty).Split("\\s+");
            if (parsedArgs.Length != 2)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            AddMorphoAnnotations.YieldIterator morphIter = new AddMorphoAnnotations.YieldIterator(parsedArgs[0], isMorphTreeFile);
            AddMorphoAnnotations.YieldIterator lemmaIter = new AddMorphoAnnotations.YieldIterator(parsedArgs[1], false);
            // Parentheses in a morph tag would corrupt the bracketed tree output, so strip them.
            Pattern pParenStripper = Pattern.Compile("[\\(\\)]");

            try
            {
                BufferedReader     brIn = new BufferedReader(new InputStreamReader(Runtime.@in, encoding));
                ITreeReaderFactory trf  = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true);
                int nTrees = 0;
                for (string line; (line = brIn.ReadLine()) != null; ++nTrees)
                {
                    Tree         tree   = trf.NewTreeReader(new StringReader(line)).ReadTree();
                    IList <Tree> leaves = tree.GetLeaves();
                    // Each input tree must have a corresponding morpho analysis line...
                    if (!morphIter.MoveNext())
                    {
                        throw new Exception("Mismatch between number of morpho analyses and number of input lines.");
                    }
                    IList <string> morphTags = morphIter.Current;
                    // ...and a corresponding lemma line.
                    if (!lemmaIter.MoveNext())
                    {
                        throw new Exception("Mismatch between number of lemmas and number of input lines.");
                    }
                    IList <string> lemmas = lemmaIter.Current;
                    // Sanity checks
                    System.Diagnostics.Debug.Assert(morphTags.Count == lemmas.Count);
                    System.Diagnostics.Debug.Assert(lemmas.Count == leaves.Count);
                    for (int i = 0; i < leaves.Count; ++i)
                    {
                        string morphTag = morphTags[i];
                        if (pParenStripper.Matcher(morphTag).Find())
                        {
                            morphTag = pParenStripper.Matcher(morphTag).ReplaceAll(string.Empty);
                        }
                        // NOTE(review): "%s" placeholders are Java-style; this presumably relies on a
                        // Sharpen-shimmed Format — plain .NET string.Format would not substitute them. TODO confirm.
                        string newLeaf = string.Format("%s%s%s%s%s", leaves[i].Value(), MorphoFeatureSpecification.MorphoMark, lemmas[i], MorphoFeatureSpecification.LemmaMark, morphTag);
                        leaves[i].SetValue(newLeaf);
                    }
                    System.Console.Out.WriteLine(tree.ToString());
                }
                // Sanity checks: both auxiliary files should be fully consumed.
                System.Diagnostics.Debug.Assert(!morphIter.MoveNext());
                System.Diagnostics.Debug.Assert(!lemmaIter.MoveNext());
                System.Console.Error.Printf("Processed %d trees%n", nTrees);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
 /// <summary>Add specified rules to this extractor.</summary>
 /// <remarks>
 /// Assignment rules are evaluated against the environment immediately; extract
 /// rules are routed into their named stage (created lazily from environment
 /// defaults) as filter, composite, or basic rules. Inactive rules are skipped.
 /// </remarks>
 /// <param name="rules"/>
 public virtual void AppendRules(IList <SequenceMatchRules.IRule> rules)
 {
     if (verbose)
     {
         log.Info("Read " + rules.Count + " rules");
     }
     // Put rules into stages
     if (collapseExtractionRules)
     {
         rules = Collapse(rules);
         if (verbose)
         {
             log.Info("Collapsing into " + rules.Count + " rules");
         }
     }
     foreach (SequenceMatchRules.IRule r in rules)
     {
         if (r is SequenceMatchRules.AssignmentRule)
         {
             // Nothing to do
             // Assignments are added to environment as they are parsed
             ((SequenceMatchRules.AssignmentRule)r).Evaluate(env);
         }
         else if (r is SequenceMatchRules.AnnotationExtractRule)
         {
             SequenceMatchRules.AnnotationExtractRule aer   = (SequenceMatchRules.AnnotationExtractRule)r;
             CoreMapExpressionExtractor.Stage <T>     stage = stages[aer.stage];
             if (stage == null)
             {
                 // Lazily create the stage, applying any environment-level defaults.
                 stages[aer.stage] = stage = new CoreMapExpressionExtractor.Stage <T>();
                 stage.stageId     = aer.stage;
                 // BUGFIX: the defaults may be absent (null). Unboxing straight to a
                 // non-nullable bool/int throws on null, and comparing a non-nullable
                 // value type against null is vacuously true, so the original guards
                 // never worked. Unbox to nullable types instead.
                 bool? clearMatched = (bool?)env.GetDefaults()["stage.clearMatched"];
                 if (clearMatched != null)
                 {
                     stage.clearMatched = clearMatched.Value;
                 }
                 int? limitIters = (int?)env.GetDefaults()["stage.limitIters"];
                 if (limitIters != null)
                 {
                     stage.limitIters = limitIters.Value;
                 }
             }
             if (aer.active)
             {
                 if (SequenceMatchRules.FilterRuleType.Equals(aer.ruleType))
                 {
                     stage.AddFilterRule(aer);
                 }
                 else if (aer.isComposite)
                 {
                     //            if (SequenceMatchRules.COMPOSITE_RULE_TYPE.equals(aer.ruleType)) {
                     stage.AddCompositeRule(aer);
                 }
                 else
                 {
                     stage.AddBasicRule(aer);
                 }
             }
             else
             {
                 log.Debug("Ignoring inactive rule: " + aer.name);
             }
         }
     }
 }
        /// <summary>
        /// arg[0] := tokenizer options
        /// args[1] := file to tokenize
        /// </summary>
        /// <remarks>
        /// Tokenizes each line of the input file twice — once with the Arabic
        /// tokenizer and once with the default lexical mapper — prints the
        /// tokenizer output to stdout, and reports any length or per-token
        /// disagreements between the two on stderr.
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
                System.Environment.Exit(-1);
            }
            string tokOptions = args[0];
            File   path       = new File(args[1]);

            log.Info("Reading from: " + path.GetPath());
            try
            {
                BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
                ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.Factory();
                tf.SetOptions(tokOptions);
                IMapper lexMapper = new DefaultLexicalMapper();
                lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
                int lineId = 0;
                for (string line; (line = br.ReadLine()) != null; lineId++)
                {
                    line = line.Trim();
                    // Tokenize with the tokenizer
                    IList <CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
                    System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
                    // Tokenize with the mapper
                    StringBuilder sb   = new StringBuilder();
                    string[]      toks = line.Split("\\s+");
                    foreach (string tok in toks)
                    {
                        string mappedTok = lexMapper.Map(null, tok);
                        sb.Append(mappedTok).Append(" ");
                    }
                    IList <string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
                    // Evaluate the output: first compare token counts, then tokens pairwise.
                    if (mappedToks.Count != tokenizedLine.Count)
                    {
                        System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                    }
                    else
                    {
                        // Same length: report the full line only if any single token differs.
                        bool printLines = false;
                        for (int i = 0; i < mappedToks.Count; ++i)
                        {
                            string mappedTok    = mappedToks[i];
                            string tokenizedTok = tokenizedLine[i].Word();
                            if (!mappedTok.Equals(tokenizedTok))
                            {
                                System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                                printLines = true;
                            }
                        }
                        if (printLines)
                        {
                            System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                        }
                    }
                }
                System.Console.Error.Printf("Read %d lines.%n", lineId);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Пример #29
0
        // Note: The first two SBARQ patterns only work when the SQ
        // structure has already been removed in CoordinationTransformer.
        // Matches phrases such as "what is wrong"
        // matches WHNP $+ VB<copula $+ NP
        // for example, "Who am I to judge?"
        // !$++ ADJP matches against "Why is the dog pink?"
        // Actually somewhat limited in scope, this detects "Tuesday it is",
        // "Such a great idea this was", etc
        /// <summary>
        /// Determine which daughter of the current parse tree is the
        /// head.
        /// </summary>
        /// <remarks>
        /// Determine which daughter of the current parse tree is the
        /// head.  It assumes that the daughters already have had their
        /// heads determined.  Uses special rule for VP heads
        /// </remarks>
        /// <param name="t">
        /// The parse tree to examine the daughters of.
        /// This is assumed to never be a leaf
        /// </param>
        /// <returns>The parse tree that is the head</returns>
        protected internal override Tree DetermineNonTrivialHead(Tree t, Tree parent)
        {
            string motherCat = tlp.BasicCategory(t.Label().Value());

            if (Debug)
            {
                log.Info("At " + motherCat + ", my parent is " + parent);
            }
            // Some conj expressions seem to make more sense with the "not" or
            // other key words as the head.  For example, "and not" means
            // something completely different than "and".  Furthermore,
            // downstream code was written assuming "not" would be the head...
            if (motherCat.Equals("CONJP"))
            {
                foreach (TregexPattern pattern in headOfConjpTregex)
                {
                    TregexMatcher matcher = pattern.Matcher(t);
                    if (matcher.MatchesAt(t))
                    {
                        // First matching CONJP pattern wins; return its "head" capture.
                        return(matcher.GetNode("head"));
                    }
                }
            }
            // if none of the above patterns match, use the standard method
            if (motherCat.Equals("SBARQ") || motherCat.Equals("SINV"))
            {
                // Copula-specific patterns only apply when the copula is NOT the head.
                if (!makeCopulaHead)
                {
                    foreach (TregexPattern pattern in headOfCopulaTregex)
                    {
                        TregexMatcher matcher = pattern.Matcher(t);
                        if (matcher.MatchesAt(t))
                        {
                            return(matcher.GetNode("head"));
                        }
                    }
                }
            }
            // if none of the above patterns match, use the standard method
            // do VPs with auxiliary as special case
            if ((motherCat.Equals("VP") || motherCat.Equals("SQ") || motherCat.Equals("SINV")))
            {
                Tree[] kids = t.Children();
                // try to find if there is an auxiliary verb
                if (Debug)
                {
                    log.Info("Semantic head finder: at VP");
                    log.Info("Class is " + t.GetType().FullName);
                    t.PennPrint(System.Console.Error);
                }
                //log.info("hasVerbalAuxiliary = " + hasVerbalAuxiliary(kids, verbalAuxiliaries));
                // looks for auxiliaries
                // Filtered children are computed lazily and shared between the
                // auxiliary case (case 1) and the copular case (case 2) below.
                Tree[] tmpFilteredChildren = null;
                if (HasVerbalAuxiliary(kids, verbalAuxiliaries, true) || HasPassiveProgressiveAuxiliary(kids))
                {
                    // String[] how = new String[] {"left", "VP", "ADJP", "NP"};
                    // Including NP etc seems okay for copular sentences but is
                    // problematic for other auxiliaries, like 'he has an answer'
                    string[] how;
                    if (HasVerbalAuxiliary(kids, copulars, true))
                    {
                        // Only allow ADJP in copular constructions
                        // In constructions like "It gets cold", "get" should be the head
                        how = new string[] { "left", "VP", "ADJP" };
                    }
                    else
                    {
                        how = new string[] { "left", "VP" };
                    }
                    if (tmpFilteredChildren == null)
                    {
                        tmpFilteredChildren = ArrayUtils.Filter(kids, RemoveTmpAndAdv);
                    }
                    Tree pti = TraverseLocate(tmpFilteredChildren, how, false);
                    if (Debug)
                    {
                        log.Info("Determined head (case 1) for " + t.Value() + " is: " + pti);
                    }
                    if (pti != null)
                    {
                        return(pti);
                    }
                }
                // } else {
                // log.info("------");
                // log.info("SemanticHeadFinder failed to reassign head for");
                // t.pennPrint(System.err);
                // log.info("------");
                // looks for copular verbs
                if (HasVerbalAuxiliary(kids, copulars, false) && !IsExistential(t, parent) && !IsWHQ(t, parent))
                {
                    string[][] how;
                    //TODO: also allow ADVP to be heads
                    // In SQ the head is searched from the right; elsewhere from the left.
                    if (motherCat.Equals("SQ"))
                    {
                        how = new string[][] { new string[] { "right", "VP", "ADJP", "NP", "UCP", "PP", "WHADJP", "WHNP" } };
                    }
                    else
                    {
                        how = new string[][] { new string[] { "left", "VP", "ADJP", "NP", "UCP", "PP", "WHADJP", "WHNP" } };
                    }
                    // Avoid undesirable heads by filtering them from the list of potential children
                    if (tmpFilteredChildren == null)
                    {
                        tmpFilteredChildren = ArrayUtils.Filter(kids, RemoveTmpAndAdv);
                    }
                    Tree pti = null;
                    for (int i = 0; i < how.Length && pti == null; i++)
                    {
                        pti = TraverseLocate(tmpFilteredChildren, how[i], false);
                    }
                    // In SQ, only allow an NP to become head if there is another one to the left (then it's probably predicative)
                    if (motherCat.Equals("SQ") && pti != null && pti.Label() != null && pti.Label().Value().StartsWith("NP"))
                    {
                        bool foundAnotherNp = false;
                        foreach (Tree kid in kids)
                        {
                            // Only NPs strictly to the left of the candidate head count.
                            if (kid == pti)
                            {
                                break;
                            }
                            else
                            {
                                if (kid.Label() != null && kid.Label().Value().StartsWith("NP"))
                                {
                                    foundAnotherNp = true;
                                    break;
                                }
                            }
                        }
                        if (!foundAnotherNp)
                        {
                            pti = null;
                        }
                    }
                    if (Debug)
                    {
                        log.Info("Determined head (case 2) for " + t.Value() + " is: " + pti);
                    }
                    if (pti != null)
                    {
                        return(pti);
                    }
                    else
                    {
                        if (Debug)
                        {
                            log.Info("------");
                            log.Info("SemanticHeadFinder failed to reassign head for");
                            t.PennPrint(System.Console.Error);
                            log.Info("------");
                        }
                    }
                }
            }
            // Fall back to the generic (superclass) head-finding rules.
            Tree hd = base.DetermineNonTrivialHead(t, parent);

            /* ----
             * // This should now be handled at the AbstractCollinsHeadFinder level, so see if we can comment this out
             * // Heuristically repair punctuation heads
             * Tree[] hdChildren = hd.children();
             * if (hdChildren != null && hdChildren.length > 0 &&
             * hdChildren[0].isLeaf()) {
             * if (tlp.isPunctuationWord(hdChildren[0].label().value())) {
             * Tree[] tChildren = t.children();
             * if (DEBUG) {
             * System.err.printf("head is punct: %s\n", hdChildren[0].label());
             * }
             * for (int i = tChildren.length - 1; i >= 0; i--) {
             * if (!tlp.isPunctuationWord(tChildren[i].children()[0].label().value())) {
             * hd = tChildren[i];
             * if (DEBUG) {
             * System.err.printf("New head of %s is %s%n", hd.label(), hd.children()[0].label());
             * }
             * break;
             * }
             * }
             * }
             * }
             */
            if (Debug)
            {
                log.Info("Determined head (case 3) for " + t.Value() + " is: " + hd);
            }
            return(hd);
        }
Пример #30
0
        /// <summary>
        /// Combines several serialized DVParser models into a single parser with a
        /// CombinedDVModelReranker, saves it to the path given by -model, and
        /// optionally evaluates it on a test treebank.
        /// Recognized arguments: -model, -testTreebank, -baseModels; all others are
        /// forwarded to the parser's Options.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string         modelPath          = null;
            IList <string> baseModelPaths     = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            // Manual argument scan; unrecognized flags accumulate in unusedArgs.
            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-baseModels"))
                        {
                            // -baseModels consumes every following token until the next flag.
                            argIndex++;
                            baseModelPaths = new List <string>();
                            while (argIndex < args.Length && args[argIndex][0] != '-')
                            {
                                baseModelPaths.Add(args[argIndex++]);
                            }
                            if (baseModelPaths.Count == 0)
                            {
                                throw new ArgumentException("Found an argument -baseModels with no actual models named");
                            }
                        }
                        else
                        {
                            unusedArgs.Add(args[argIndex++]);
                        }
                    }
                }
            }
            string[]          newArgs          = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser underlyingParser = null;
            Options           options          = null;
            LexicalizedParser combinedParser   = null;

            if (baseModelPaths != null)
            {
                // Load each base parser and collect its embedded DVModel.
                IList <DVModel> dvparsers = new List <DVModel>();
                foreach (string baseModelPath in baseModelPaths)
                {
                    log.Info("Loading serialized DVParser from " + baseModelPath);
                    LexicalizedParser dvparser = ((LexicalizedParser)LexicalizedParser.LoadModel(baseModelPath));
                    IReranker         reranker = dvparser.reranker;
                    if (!(reranker is DVModelReranker))
                    {
                        throw new ArgumentException("Expected parsers with DVModel embedded");
                    }
                    dvparsers.Add(((DVModelReranker)reranker).GetModel());
                    // The first base parser supplies the Options for the combined parser.
                    if (underlyingParser == null)
                    {
                        underlyingParser = dvparser;
                        options          = underlyingParser.GetOp();
                        // TODO: other parser's options?
                        options.SetOptions(newArgs);
                    }
                    log.Info("... done");
                }
                combinedParser = LexicalizedParser.CopyLexicalizedParser(underlyingParser);
                CombinedDVModelReranker reranker_1 = new CombinedDVModelReranker(options, dvparsers);
                combinedParser.reranker = reranker_1;
                combinedParser.SaveParserToSerialized(modelPath);
            }
            else
            {
                // NOTE(review): loading an already-prepared combined parser via -model
                // alone is not implemented; without -baseModels this always throws.
                throw new ArgumentException("Need to specify -model to load an already prepared CombinedParser");
            }
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = combinedParser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
                EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.GetOp(), null, combinedParser);
                evaluator.TestOnTreebank(testTreebank);
            }
        }