Пример #1
0
 /// <summary>
 /// Creates a clique potential function over the supplied weight matrices.
 /// </summary>
 /// <param name="linearWeights">Weight matrix stored in <c>linearWeights</c> (reference kept, not copied).</param>
 /// <param name="inputLayerWeights">Weight matrix stored in <c>inputLayerWeights</c> (reference kept, not copied).</param>
 /// <param name="outputLayerWeights">Weight matrix stored in <c>outputLayerWeights</c> (reference kept, not copied).</param>
 /// <param name="flags">Classifier flags stored for later use.</param>
 public NonLinearCliquePotentialFunction(double[][] linearWeights, double[][] inputLayerWeights, double[][] outputLayerWeights, SeqClassifierFlags flags)
 {
     // Configuration first, then the weight matrices; the assignments are independent.
     this.flags = flags;
     this.outputLayerWeights = outputLayerWeights;
     this.inputLayerWeights = inputLayerWeights;
     this.linearWeights = linearWeights;
 }
Пример #2
0
 // = null;
 /// <summary>
 /// Reads the token factory and the tokens-annotation class name from
 /// <see cref="SeqClassifierFlags"/>, falls back to defaults for whichever is unset,
 /// and then delegates to the three-argument <c>Init</c> overload.
 /// </summary>
 /// <param name="flags">
 /// Flags providing <c>tokensAnnotationClassName</c> and <c>tokenFactory</c>; either may be null.
 /// </param>
 /// <exception cref="System.Exception">
 /// Thrown when the type named by <c>flags.tokenFactory</c> cannot be resolved or instantiated;
 /// the original failure is attached as the inner exception.
 /// </exception>
 public virtual void Init(SeqClassifierFlags flags)
 {
     // Default annotation class name is used when the flags do not name one.
     this.tokensAnnotationClassName = flags.tokensAnnotationClassName ?? "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation";
     if (flags.tokenFactory != null)
     {
         try
         {
             this.tokenFactory = (ICoreTokenFactory <IN>)System.Activator.CreateInstance(Sharpen.Runtime.GetType(flags.tokenFactory));
         }
         catch (Exception e)
         {
             // Exception has no constructor taking only another exception, so wrap with a
             // message and keep the cause as InnerException.
             throw new Exception("Unable to instantiate token factory '" + flags.tokenFactory + "'", e);
         }
     }
     else
     {
         this.tokenFactory = (ICoreTokenFactory <IN>) new CoreLabelTokenFactory();
     }
     Init(flags, this.tokenFactory, this.tokensAnnotationClassName);
 }
 /// <summary>
 /// Wraps an existing object bank. The base class is constructed with two null arguments.
 /// </summary>
 /// <param name="flags">Classifier flags stored on the wrapper.</param>
 /// <param name="wrapped">The object bank being wrapped (reference kept).</param>
 /// <param name="knownLCWords">Collection stored in <c>knownLCWords</c> (reference kept).</param>
 public ObjectBankWrapper(SeqClassifierFlags flags, ObjectBank <IList <In> > wrapped, ICollection <string> knownLCWords)
     : base(null, null)
 {
     // Plain field captures; nothing is copied or validated here.
     this.knownLCWords = knownLCWords;
     this.wrapped = wrapped;
     this.flags = flags;
 }
        // end enum Output style
        // = null;
        /// <summary>
        /// Builds the tokenizer factory described by <paramref name="flags"/> and delegates to
        /// the two-argument <c>Init</c> overload.
        /// </summary>
        /// <remarks>
        /// The base option string is <c>tokenizeNLs=false,invertible=true</c>;
        /// <c>flags.tokenizerOptions</c>, when set, is appended after a comma. When
        /// <c>flags.tokenizerFactory</c> names a type, its static
        /// <c>newCoreLabelTokenizerFactory(string)</c> method is invoked via reflection;
        /// otherwise the PTB tokenizer factory is used.
        /// </remarks>
        /// <param name="flags">Flags providing <c>tokenizerOptions</c> and <c>tokenizerFactory</c>.</param>
        /// <exception cref="System.Exception">
        /// Thrown when the custom factory cannot be resolved or invoked; the original failure
        /// is attached as the inner exception.
        /// </exception>
        public virtual void Init(SeqClassifierFlags flags)
        {
            string options = "tokenizeNLs=false,invertible=true";

            if (flags.tokenizerOptions != null)
            {
                options = options + ',' + flags.tokenizerOptions;
            }
            ITokenizerFactory <In> factory;

            if (flags.tokenizerFactory != null)
            {
                try
                {
                    Type clazz = ErasureUtils.UncheckedCast(Sharpen.Runtime.GetType(flags.tokenizerFactory));
                    // Type.GetMethod expects the parameter types as an array, and MethodInfo.Invoke
                    // expects the arguments as an object array; a null target means a static call.
                    // NOTE(review): the looked-up name keeps Java casing; confirm the target type
                    // really exposes a method with exactly this name.
                    MethodInfo factoryMethod = clazz.GetMethod("newCoreLabelTokenizerFactory", new Type[] { typeof(string) });
                    factory = ErasureUtils.UncheckedCast(factoryMethod.Invoke(null, new object[] { options }));
                }
                catch (Exception e)
                {
                    // Exception has no constructor taking only another exception, so wrap with a
                    // message and keep the cause as InnerException.
                    throw new Exception("Unable to create tokenizer factory '" + flags.tokenizerFactory + "'", e);
                }
            }
            else
            {
                factory = ErasureUtils.UncheckedCast(PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory(options));
            }
            Init(flags, factory);
        }
Пример #5
0
 /// <summary>
 /// Looks up <paramref name="c2"/> in the dictionary <c>cd</c> and reports the result as a
 /// flag string.
 /// </summary>
 /// <param name="c2">Key passed to <c>cd.GetW</c>.</param>
 /// <param name="flags">Not read by this method.</param>
 /// <returns>"1" when <c>cd.GetW(c2)</c> equals "1"; otherwise "0".</returns>
 public virtual string CheckDic(string c2, SeqClassifierFlags flags)
 {
     bool found = cd.GetW(c2).Equals("1");
     return found ? "1" : "0";
 }
 /// <summary>
 /// Creates the prior from a pair of matrices and caches the tag indices of "ORG" and "LOC".
 /// </summary>
 /// <param name="backgroundSymbol">Forwarded to the base constructor.</param>
 /// <param name="classIndex">Forwarded to the base constructor.</param>
 /// <param name="tagIndex">Forwarded to the base constructor; also queried for "ORG" and "LOC".</param>
 /// <param name="doc">Forwarded to the base constructor.</param>
 /// <param name="matrices">First element becomes <c>entityMatrix</c>, second becomes <c>subEntityMatrix</c>.</param>
 /// <param name="flags">Classifier flags stored on the instance.</param>
 public EmpiricalNERPriorBIO(string backgroundSymbol, IIndex <string> classIndex, IIndex <string> tagIndex, IList <IN> doc, Pair <double[][], double[][]> matrices, SeqClassifierFlags flags)
     : base(backgroundSymbol, classIndex, tagIndex, doc)
 {
     this.flags = flags;

     // Unpack the matrix pair.
     this.entityMatrix = matrices.First();
     this.subEntityMatrix = matrices.Second();

     // Cache the positions of the two tags this prior treats specially.
     this.ORGIndex = tagIndex.IndexOf("ORG");
     this.LOCIndex = tagIndex.IndexOf("LOC");
 }
        /// <summary>
        /// Loads the character list and affix dictionary selected by <paramref name="flags"/>.
        /// </summary>
        /// <remarks>
        /// The dictionary root is <c>flags.sighanCorporaDict</c>, or <c>DefaultCorporaDict</c> when
        /// that is null; a trailing '/' is appended when the root is non-empty and lacks one.
        /// When any of <c>useChPos</c>, <c>useCTBChar2</c> or <c>usePKChar2</c> is set, only the
        /// CTB and PK lists are available; otherwise the closed-track CTB list is used.
        /// </remarks>
        /// <param name="flags">Flags selecting the corpus and dictionary location.</param>
        /// <exception cref="System.Exception">
        /// Thrown when POS-based lists are requested together with an ASBC/HK/MSR flag, or when
        /// <c>useChPos</c> is set without <c>useCTBChar2</c> or <c>usePKChar2</c>.
        /// </exception>
        public TagAffixDetector(SeqClassifierFlags flags)
        {
            string root = flags.sighanCorporaDict ?? DefaultCorporaDict;

            if (!root.IsEmpty() && !root.EndsWith("/"))
            {
                root += "/";
            }

            bool wantsPosLists = flags.useChPos || flags.useCTBChar2 || flags.usePKChar2;
            string charListPath;
            string affixPath;

            if (!wantsPosLists)
            {
                charListPath = root + "dict/pos_close/char.ctb.list";
                affixPath    = root + "dict/in.ctb";
            }
            else if (flags.useASBCChar2 || flags.useHKChar2 || flags.useMSRChar2)
            {
                // POS information overrides the character list, and only CTB and PK lists exist.
                throw new Exception("only support settings for CTB and PK now.");
            }
            else if (flags.useCTBChar2)
            {
                charListPath = root + "dict/character_list";
                affixPath    = root + "dict/in.ctb";
            }
            else if (flags.usePKChar2)
            {
                charListPath = root + "dict/pos_open/character_list.pku.utf8";
                affixPath    = root + "dict/in.pk";
            }
            else
            {
                throw new Exception("none of flags.useXXXChar2 are on");
            }

            cc = new CorpusChar(charListPath);
            aD = new AffixDictionary(affixPath);
        }
        /// <summary>
        /// Builds the flags used by the tests. The values are somewhat arbitrary: they are
        /// picked only so that the problematic code path gets executed.
        /// </summary>
        /// <returns>
        /// A new <see cref="SeqClassifierFlags"/> with Sighan post-processing and PK enabled
        /// and both whitespace-keeping options disabled.
        /// </returns>
        private static SeqClassifierFlags CreateTestFlags()
        {
            return new SeqClassifierFlags
            {
                sighanPostProcessing   = true,
                usePk                  = true,
                keepEnglishWhitespaces = false,
                keepAllWhitespaces     = false
            };
        }
Пример #9
0
        /// <summary>
        /// Trains a CRF classifier configured by a properties file and serializes it.
        /// </summary>
        /// <param name="propPath">Path of the properties file passed to <c>StringUtils.propFileToProperties</c>.</param>
        /// <param name="modelPath">Destination passed to <c>serializeClassifier</c>.</param>
        static void TrainAndWrite(string propPath, string modelPath)
        {
            // Properties file -> flags -> classifier.
            SeqClassifierFlags classifierFlags = new SeqClassifierFlags(StringUtils.propFileToProperties(propPath));
            var classifier = new CRFClassifier(classifierFlags);

            classifier.train();
            classifier.serializeClassifier(modelPath);
        }
 /// <summary>
 /// Creates a second-order clique potential function over separate edge and node weight matrices.
 /// </summary>
 /// <remarks>
 /// Per the original field comments: for input-layer matrices the first index is the number
 /// of hidden units in layer one and the second is the input feature index; for output-layer
 /// matrices the first index is the output class and the second is the number of hidden units.
 /// </remarks>
 /// <param name="inputLayerWeights4Edge">Input-layer weights for edges (reference kept).</param>
 /// <param name="outputLayerWeights4Edge">Output-layer weights for edges (reference kept).</param>
 /// <param name="inputLayerWeights">Input-layer weights for nodes (reference kept).</param>
 /// <param name="outputLayerWeights">Output-layer weights for nodes (reference kept).</param>
 /// <param name="flags">Classifier flags stored for later use.</param>
 public NonLinearSecondOrderCliquePotentialFunction(double[][] inputLayerWeights4Edge, double[][] outputLayerWeights4Edge, double[][] inputLayerWeights, double[][] outputLayerWeights, SeqClassifierFlags flags)
 {
     this.flags = flags;

     // Node weights.
     this.inputLayerWeights = inputLayerWeights;
     this.outputLayerWeights = outputLayerWeights;

     // Edge weights.
     this.inputLayerWeights4Edge = inputLayerWeights4Edge;
     this.outputLayerWeights4Edge = outputLayerWeights4Edge;
 }
Пример #11
0
 /// <summary>
 /// Makes an Arabic segmenter configured from <paramref name="props"/>.
 /// </summary>
 /// <remarks>
 /// Segmenter-specific options are read into fields and then removed from
 /// <paramref name="props"/>; the remaining properties are used to build the
 /// <see cref="SeqClassifierFlags"/> and the underlying CRF classifier.
 /// Note that <paramref name="props"/> is modified in place.
 /// </remarks>
 /// <param name="props">
 /// Options for how to tokenize. See the main method of
 /// <see cref="ArabicTokenizer{T}"/>
 /// for details
 /// </param>
 /// <exception cref="System.Exception">
 /// Thrown when the local-features-only option is combined with a custom feature factory.
 /// </exception>
 public ArabicSegmenter(Properties props)
 {
     // Segmenter options that can be set in the Properties object (descriptions carried over
     // from the original field comments): input already tokenized (do not run the Arabic
     // tokenizer); tokenizer options; prefix marker; suffix marker; number of decoding threads;
     // TedEval output prefix; custom feature factory; domain labels present in training and
     // evaluation files; single shared domain (default: atb); ignore rewrites (training only,
     // produces a model usable for no-rewrite segmentation); use the original feature set
     // without start-and-end "wrapper" features.
     //
     // Boolean options are presence flags: only whether the key exists is checked.
     isTokenized      = props.Contains(optTokenized);
     tokenizerOptions = props.GetProperty(optTokenizer, null);
     tedEvalPrefix    = props.GetProperty(optTedEval, null);
     hasDomainLabels  = props.Contains(optWithDomains);
     domain           = props.GetProperty(optDomain, "atb");
     noRewrites       = props.Contains(optNoRewrites);
     // NOTE(review): GetTokenizerFactory() is called after the fields above are set and before
     // the markers below; presumably it depends on some of those fields - confirm before reordering.
     tf           = GetTokenizerFactory();
     prefixMarker = props.GetProperty(optPrefix, string.Empty);
     suffixMarker = props.GetProperty(optSuffix, string.Empty);
     if (props.Contains(optLocalFeaturesOnly))
     {
         // The local-only factory is imposed by the flag, so an explicit factory is a conflict.
         if (props.Contains(optFeatureFactory))
         {
             throw new Exception("Cannot use custom feature factory with localFeaturesOnly flag--" + "have your custom feature factory extend ArabicSegmenterFeatureFactory instead of " + "StartAndEndArabicSegmenterFeatureFactory and remove the localFeaturesOnly flag."
                                 );
         }
         props.SetProperty(optFeatureFactory, localOnlyFeatureFactory);
     }
     // Fall back to the default feature factory when none was chosen above or by the caller.
     if (!props.Contains(optFeatureFactory))
     {
         props.SetProperty(optFeatureFactory, defaultFeatureFactory);
     }
     // Remove all command-line properties that are specific to ArabicSegmenter
     // (optThreads is removed here although this constructor does not read it).
     props.Remove(optTokenizer);
     props.Remove(optTokenized);
     props.Remove(optPrefix);
     props.Remove(optSuffix);
     props.Remove(optThreads);
     props.Remove(optTedEval);
     props.Remove(optWithDomains);
     props.Remove(optDomain);
     props.Remove(optNoRewrites);
     props.Remove(optLocalFeaturesOnly);
     // Whatever is left in props configures the sequence classifier.
     flags      = new SeqClassifierFlags(props);
     classifier = new CRFClassifier <CoreLabel>(flags);
 }
Пример #12
0
 /// <summary>
 /// Copy constructor: copies the option fields from <paramref name="other"/>, shares its
 /// classifier, and creates a fresh tokenizer factory.
 /// </summary>
 /// <param name="other">The segmenter whose configuration is copied.</param>
 public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
 {
     // Tokenization settings.
     this.isTokenized = other.isTokenized;
     this.tokenizerOptions = other.tokenizerOptions;

     // Segment markers and TedEval output.
     this.prefixMarker = other.prefixMarker;
     this.suffixMarker = other.suffixMarker;
     this.tedEvalPrefix = other.tedEvalPrefix;

     // Domain handling and rewrite behaviour.
     this.hasDomainLabels = other.hasDomainLabels;
     this.domain = other.domain;
     this.noRewrites = other.noRewrites;

     this.flags = other.flags;

     // ArabicTokenizerFactory is *not* threadsafe, so each copy builds its own.
     this.tf = GetTokenizerFactory();

     // CRFClassifier is threadsafe, so the same instance is shared.
     this.classifier = other.classifier;
 }
Пример #13
0
 /// <summary>
 /// Loads the shared corpus dictionary <c>cd</c> on first use; does nothing when it is
 /// already loaded.
 /// </summary>
 /// <remarks>
 /// The dictionary root is <c>flags.sighanCorporaDict</c> when set, otherwise the current
 /// value of <c>corporaDict</c>. <c>flags.usePk</c> selects the PKU list; anything else uses
 /// the CTB list (the same flag is used for Sighan 2005, but the list is extracted from CTB).
 /// </remarks>
 /// <param name="flags">Flags selecting the corpus and dictionary location.</param>
 /// <exception cref="System.Exception">
 /// Thrown when any of <c>useAs</c>, <c>useHk</c> or <c>useMsr</c> is set; only CTB and PKU
 /// are supported.
 /// </exception>
 public NonDict2(SeqClassifierFlags flags)
 {
     if (cd != null)
     {
         return;
     }
     if (flags.sighanCorporaDict != null)
     {
         corporaDict = flags.sighanCorporaDict;
     }
     if (flags.useAs || flags.useHk || flags.useMsr)
     {
         throw new Exception("only support settings for CTB and PKU now.");
     }
     string path = corporaDict + (flags.usePk ? "/dict/pku.non" : "/dict/ctb.non");
     cd = new CorpusDictionary(path);
     // The unsupported-corpus case has already thrown above, so only the PK/CTB
     // message remains to be logged.
     logger.Info("INFO: flags.usePk=" + (flags.usePk ? "true" : "false") + " | building NonDict2 from " + path);
 }
Пример #14
0
        /// <summary>
        /// Trains a CRF classifier from a properties file and serializes the resulting model.
        /// </summary>
        /// <param name="inputPath">Training file; when non-null it overrides the <c>trainFile</c> property.</param>
        /// <param name="outputPath">Model destination; also written to the <c>serializeTo</c> property.</param>
        /// <param name="properties">Path of the properties file to load.</param>
        public void createModelFromTrainingData(string inputPath, string outputPath, string properties)
        {
            Properties config = edu.stanford.nlp.util.StringUtils.propFileToProperties(properties);
            config.setProperty("serializeTo", outputPath);
            if (inputPath != null)
            {
                config.setProperty("trainFile", inputPath);
            }

            CRFClassifier classifier = new CRFClassifier(new SeqClassifierFlags(config));
            classifier.train();
            classifier.serializeClassifier(outputPath);
        }
        /// <summary>
        /// Reads a small plain-text document through an <c>ObjectBankWrapper</c> and checks every
        /// token and its word shape (the "chris2" shape) against the expected sequences.
        /// </summary>
        /// <remarks>
        /// Producing too many tokens is detected with an explicit bounds check. A blanket
        /// <c>catch (Exception)</c> would also catch the exceptions NUnit throws for failed
        /// assertions and report token or shape mismatches as "too many things in iterator".
        /// </remarks>
        public virtual void TestUsingIterator()
        {
            string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";

            string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
            string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
            NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");
            Properties         props = PropertiesUtils.AsProperties("wordShape", "chris2");
            SeqClassifierFlags flags = new SeqClassifierFlags(props);
            PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();

            readerAndWriter.Init(flags);
            ReaderIteratorFactory           rif          = new ReaderIteratorFactory(new StringReader(s));
            ObjectBank <IList <CoreLabel> > di           = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
            ICollection <string>            knownLCWords = new HashSet <string>();
            ObjectBankWrapper <CoreLabel>   obw          = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

            int outIdx = 0;

            // foreach also disposes the enumerators, which the manual MoveNext loops did not.
            foreach (IList <CoreLabel> sent in obw)
            {
                foreach (CoreLabel cl in sent)
                {
                    if (outIdx >= output.Length)
                    {
                        NUnit.Framework.Assert.Fail("Too many things in iterator, first unexpected token: " + cl.Word());
                    }
                    string tok   = cl.Word();
                    string shape = cl.Get(typeof(CoreAnnotations.ShapeAnnotation));
                    NUnit.Framework.Assert.AreEqual(output[outIdx], tok);
                    NUnit.Framework.Assert.AreEqual(outWSs[outIdx], shape);
                    outIdx++;
                }
            }
            if (outIdx < output.Length)
            {
                NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
            }
        }
        /// <summary>
        /// Submits <c>Threads</c> tasks to a fixed-size thread pool and asserts that every
        /// returned future yields true.
        /// </summary>
        // NOTE(review): executor.Submit(null) submits no work at all - the task body looks like
        // it was lost when this test was converted, which would also explain why `flags` and
        // `labels` are created but never used. Restore the task body from the original test
        // before relying on this test.
        // NOTE(review): the check uses Debug.Assert rather than a test-framework assertion, and
        // the executor is never shut down here - confirm both are intended.
        public virtual void TestMultithreadedCombineSegmentedSentence()
        {
            SeqClassifierFlags      flags    = CreateTestFlags();
            IList <CoreLabel>       labels   = CreateTestTokens();
            IList <IFuture <bool> > tasks    = new List <IFuture <bool> >(Threads);
            IExecutorService        executor = Executors.NewFixedThreadPool(Threads);

            // One task per worker thread.
            for (int v = 0; v < Threads; v++)
            {
                IFuture <bool> f = executor.Submit(null);
                tasks.Add(f);
            }
            foreach (IFuture <bool> task in tasks)
            {
                // This assert will fail by throwing a propagated exception, if exceptions due to
                // multithreading issues (generally NPEs) were thrown during the test.
                System.Diagnostics.Debug.Assert((task.Get()));
            }
        }
 /// <summary>
 /// Creates the objective function, copying the training data references and the relevant
 /// settings from <paramref name="flags"/>, and deriving the layer sizes.
 /// </summary>
 /// <remarks>
 /// The output layer sizes are <c>numClasses</c> (nodes) and <c>numClasses * numClasses</c>
 /// (edges). With the output layer enabled the input layer sizes are those values multiplied
 /// by <c>flags.numHiddenUnits</c>; with it disabled they equal the output layer sizes.
 /// </remarks>
 /// <param name="data">Training data; its length sizes <c>docWindowLabels</c>.</param>
 /// <param name="labels">Stored in <c>labels</c>.</param>
 /// <param name="window">Stored in <c>window</c>.</param>
 /// <param name="classIndex">Class index; its size becomes <c>numClasses</c>.</param>
 /// <param name="labelIndices">Stored in <c>labelIndices</c>.</param>
 /// <param name="map">Stored in <c>map</c>.</param>
 /// <param name="prior">Stored in <c>prior</c>.</param>
 /// <param name="flags">Source of the background symbol, sigma, hidden-unit count and layer switches.</param>
 /// <param name="numNodeFeatures">Stored in <c>numNodeFeatures</c>.</param>
 /// <param name="numEdgeFeatures">Stored in <c>numEdgeFeatures</c>.</param>
 /// <exception cref="System.Exception">
 /// Thrown when the output layer is enabled with <c>softmaxOutputLayer</c> set but neither
 /// <c>sparseOutputLayer</c> nor <c>tieOutputLayer</c>.
 /// </exception>
 internal CRFNonLinearSecondOrderLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, IIndex <string> classIndex, IList <IIndex <CRFLabel> > labelIndices, int[] map, int prior, SeqClassifierFlags flags, int numNodeFeatures, int numEdgeFeatures)
 {
     // Indexing structures and raw training data.
     this.window = window;
     this.classIndex = classIndex;
     this.numClasses = classIndex.Size();
     this.labelIndices = labelIndices;
     this.data = data;
     this.labels = labels;
     this.map = map;
     this.prior = prior;
     this.numNodeFeatures = numNodeFeatures;
     this.numEdgeFeatures = numEdgeFeatures;

     // Settings taken from the flags.
     this.flags = flags;
     this.backgroundSymbol = flags.backgroundSymbol;
     this.sigma = flags.sigma;
     this.numHiddenUnits = flags.numHiddenUnits;
     this.useOutputLayer = flags.useOutputLayer;
     this.useHiddenLayer = flags.useHiddenLayer;
     this.useSigmoid = flags.useSigmoid;

     // Layer sizes, assuming the output layer is active.
     this.outputLayerSize = numClasses;
     this.outputLayerSize4Edge = numClasses * numClasses;
     this.inputLayerSize = numHiddenUnits * numClasses;
     this.inputLayerSize4Edge = numHiddenUnits * numClasses * numClasses;

     this.docWindowLabels = new int[data.Length][];

     if (useOutputLayer)
     {
         bool sparseOrTied = flags.sparseOutputLayer || flags.tieOutputLayer;
         if (flags.softmaxOutputLayer && !sparseOrTied)
         {
             throw new Exception("flags.softmaxOutputLayer == true, but neither flags.sparseOutputLayer or flags.tieOutputLayer is true");
         }
         return;
     }

     // Without an output layer the input layer feeds the classes directly.
     log.Info("Output layer not activated, inputLayerSize must be equal to numClasses, setting it to " + numClasses);
     this.inputLayerSize = numClasses;
     this.inputLayerSize4Edge = numClasses * numClasses;
 }
 /// <summary>
 /// Stores the flags, creates the line-iterator factory and the document-to-sentence
 /// processor, and loads whichever dictionaries the flags name.
 /// </summary>
 /// <remarks>
 /// <c>cdict</c> is built from the comma-separated <c>flags.dictionary</c> and then, when
 /// <c>flags.serializedDictionary</c> is also set, replaced by the serialized dictionary.
 /// <c>cdict2</c> is built from the comma-separated <c>flags.dictionary2</c>.
 /// </remarks>
 /// <param name="flags">Flags naming the normalization table and dictionaries.</param>
 public virtual void Init(SeqClassifierFlags flags)
 {
     this.flags = flags;
     factory = LineIterator.GetFactory(new Sighan2005DocumentReaderAndWriter.CTBDocumentParser(this));

     // pichuan : flags.normalizationTable is null --> i believe this is replaced by some java class??
     // (Thu Apr 24 11:10:42 2008)
     cdtos = new ChineseDocumentToSentenceProcessor(flags.normalizationTable);

     string plainList = flags.dictionary;
     if (plainList != null)
     {
         cdict = new ChineseDictionary(plainList.Split(","), cdtos, flags.expandMidDot);
     }

     string serialized = flags.serializedDictionary;
     if (serialized != null)
     {
         // Takes precedence over the plain dictionary list when both are given.
         cdict = new ChineseDictionary(serialized, cdtos, flags.expandMidDot);
     }

     string secondList = flags.dictionary2;
     if (secondList != null)
     {
         cdict2 = new ChineseDictionary(secondList.Split(","), cdtos, flags.expandMidDot);
     }
 }
Пример #19
0
 /// <summary>
 /// Post-processes the answer to be output using the corpus-specific post-processor.
 /// These post-processing steps do not depend on the original input.
 /// </summary>
 /// <remarks>
 /// The corpus flags are checked in the order HK, AS, PK, MSR; when none is set the CTB
 /// post-processor is used.
 /// </remarks>
 /// <param name="ans">The answer string to post-process.</param>
 /// <param name="flags">Flags selecting the corpus and the whitespace / mid-dot options.</param>
 /// <returns>The post-processed answer.</returns>
 private static string PostProcessingAnswer(string ans, SeqClassifierFlags flags)
 {
     if (flags.useHk)
     {
         return hkPostProcessor.PostProcessingAnswer(ans);
     }
     if (flags.useAs)
     {
         return asPostProcessor.PostProcessingAnswer(ans);
     }
     if (flags.usePk)
     {
         return pkPostProcessor.PostProcessingAnswer(ans, flags.keepAllWhitespaces);
     }
     if (flags.useMsr)
     {
         return basicPostsProcessor.PostProcessingAnswer(ans);
     }
     // No corpus flag set: fall back to CTB post-processing.
     return ctpPostProcessor.PostProcessingAnswer(ans, flags.suppressMidDotPostprocessing);
 }
        /// <summary>
        /// Creates an <c>EmpiricalNERPriorBIO</c> over the given document.
        /// </summary>
        /// <param name="backgroundSymbol">Not read; the prior is built with <c>flags.backgroundSymbol</c> instead.</param>
        /// <param name="classIndex">Forwarded to the prior.</param>
        /// <param name="tagIndex">Forwarded to the prior.</param>
        /// <param name="document">Forwarded to the prior.</param>
        /// <param name="entityMatrices">Forwarded to the prior.</param>
        /// <param name="flags">Forwarded to the prior; also supplies the background symbol.</param>
        /// <returns>The newly created prior.</returns>
        public virtual IListeningSequenceModel GetInstance(string backgroundSymbol, IIndex <string> classIndex, IIndex <string> tagIndex, IList <In> document, Pair <double[][], double[][]> entityMatrices, SeqClassifierFlags flags)
        {
            string symbol = flags.backgroundSymbol;
            EntityCachingAbstractSequencePriorBIO <In> model = new EmpiricalNERPriorBIO <In>(symbol, classIndex, tagIndex, document, entityMatrices, flags);
            return model;
        }
Пример #21
0
 /// <summary>Creates the classifier; all initialization is done by the base constructor.</summary>
 /// <param name="flags">Flags forwarded to the base constructor.</param>
 public CRFClassifierNonlinear(SeqClassifierFlags flags) : base(flags)
 {
     // Nothing beyond the base-class setup.
 }
Пример #22
0
 /// <summary>Initializes this instance by delegating to the base implementation.</summary>
 /// <param name="flags">Flags forwarded to <c>base.Init</c>.</param>
 public override void Init(SeqClassifierFlags flags)
 {
     // No behaviour of its own: pure pass-through.
     base.Init(flags);
 }
        /// <summary>
        /// Runs one particular CRF of a <c>ClassifierCombiner</c> on a test file or on a
        /// comma-separated list of test files.
        /// </summary>
        /// <remarks>
        /// The user can say <c>-crfToExamine 0</c> to get the first element or
        /// <c>-crfToExamine /edu/stanford/models/muc7.crf.ser.gz</c> to select by load path.
        /// The classifier is selected as follows:
        /// 1. if <paramref name="crfNameOrIndex"/> parses as an integer that is a valid index
        ///    into <c>cc.baseClassifiers</c>, that index is used;
        /// 2. otherwise it is looked up in <c>cc.initLoadPaths</c>.
        /// When neither yields a valid index the method does nothing. This does not currently
        /// support drill down on CMMs. <paramref name="testFile"/> takes precedence over
        /// <paramref name="testFiles"/> when both are set.
        /// </remarks>
        /// <param name="cc">The combiner whose base classifiers are examined.</param>
        /// <param name="crfNameOrIndex">Index into the base classifiers, or a load path.</param>
        /// <param name="flags">Flags selecting which report to produce.</param>
        /// <param name="testFile">Single test file, or null.</param>
        /// <param name="testFiles">Comma-separated test files, or null.</param>
        /// <param name="readerAndWriter">Reader/writer used for the single-file reports.</param>
        /// <exception cref="System.Exception"/>
        public static void ExamineCRF(Edu.Stanford.Nlp.IE.ClassifierCombiner cc, string crfNameOrIndex, SeqClassifierFlags flags, string testFile, string testFiles, IDocumentReaderAndWriter <CoreLabel> readerAndWriter)
        {
            // Potential index into baseClassifiers. TryParse is used because a non-numeric
            // name makes Convert.ToInt32 throw FormatException, which would bypass the
            // load-path fallback below.
            int ci;
            bool parsed = int.TryParse(crfNameOrIndex, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out ci);

            if (!parsed || ci < 0 || ci >= cc.baseClassifiers.Count)
            {
                // Not a usable index: see whether it is the name of a crf load path.
                ci = cc.initLoadPaths.IndexOf(crfNameOrIndex);
            }
            if (ci < 0 || ci >= cc.baseClassifiers.Count)
            {
                // No classifier could be identified; nothing to report.
                return;
            }
            // TODO: this will break if baseClassifiers contains something that is not a CRF
            CRFClassifier <CoreLabel> crf = (CRFClassifier <CoreLabel>)cc.baseClassifiers[ci];
            if (crf == null)
            {
                return;
            }

            if (testFile != null)
            {
                // Single test file: the first matching flag decides the report.
                if (flags.searchGraphPrefix != null)
                {
                    crf.ClassifyAndWriteViterbiSearchGraph(testFile, flags.searchGraphPrefix, crf.MakeReaderAndWriter());
                }
                else if (flags.printFirstOrderProbs)
                {
                    crf.PrintFirstOrderProbs(testFile, readerAndWriter);
                }
                else if (flags.printFactorTable)
                {
                    crf.PrintFactorTable(testFile, readerAndWriter);
                }
                else if (flags.printProbs)
                {
                    crf.PrintProbs(testFile, readerAndWriter);
                }
                else if (flags.useKBest)
                {
                    // TO DO: handle if user doesn't provide kBest
                    int k = flags.kBest;
                    crf.ClassifyAndWriteAnswersKBest(testFile, k, readerAndWriter);
                }
                else if (flags.printLabelValue)
                {
                    crf.PrintLabelInformation(testFile, readerAndWriter);
                }
                else
                {
                    // no crf test flag provided
                    log.Info("Warning: no crf test flag was provided, running classify and write answers");
                    crf.ClassifyAndWriteAnswers(testFile, readerAndWriter, true);
                }
            }
            else if (testFiles != null)
            {
                // One File per comma-separated entry. The stream pipeline that stood here passed
                // null as its mapping function, so it could not produce the files.
                IList <File> files = new List <File>();
                foreach (string path in testFiles.Split(","))
                {
                    files.Add(new File(path));
                }
                if (flags.printProbs)
                {
                    // there is a crf and printProbs
                    crf.PrintProbs(files, crf.DefaultReaderAndWriter());
                }
                else
                {
                    log.Info("Warning: no crf test flag was provided, running classify files and write answers");
                    crf.ClassifyFilesAndWriteAnswers(files, crf.DefaultReaderAndWriter(), true);
                }
            }
        }
        /// <summary>
        /// Reads a CoNLL-style column file and returns its sentences keyed by a generated id.
        /// </summary>
        /// <remarks>
        /// Tokens whose word is the CoNLL boundary marker or <c>-DOCSTART-</c> end the current
        /// sentence. Each remaining token gets its 1-based index within the sentence, and its
        /// word is copied into the value, text and original-text annotations. Sentence ids are
        /// <c>sentIDprefix + "-" + n</c> with <c>n</c> counting from 0 across all documents.
        /// </remarks>
        /// <param name="reader">Source of the column data.</param>
        /// <param name="categoriesAllowed">Not read by this method.</param>
        /// <param name="setClassForTheseLabels">
        /// Optional map from answer label to annotation class; for tokens whose label is a key,
        /// the label is additionally stored under the mapped class. May be null.
        /// </param>
        /// <param name="setGoldClass">When true, the answer label is also stored as the gold answer.</param>
        /// <param name="sentIDprefix">Prefix of the generated sentence ids.</param>
        /// <returns>Map from sentence id to the surface instance built from that sentence.</returns>
        public static IDictionary <string, DataInstance> ParseColumnFile(BufferedReader reader, ICollection <string> categoriesAllowed, IDictionary <string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
        {
            CoNLLDocumentReaderAndWriter conllreader = new CoNLLDocumentReaderAndWriter();
            Properties         props = new Properties();
            SeqClassifierFlags flags = new SeqClassifierFlags(props);

            flags.entitySubclassification       = "noprefix";
            flags.retainEntitySubclassification = false;
            conllreader.Init(flags);
            IEnumerator <IList <CoreLabel> > dociter = conllreader.GetIterator(reader);
            int num = -1;
            IDictionary <string, DataInstance> sents = new Dictionary <string, DataInstance>();

            while (dociter.MoveNext())
            {
                IList <CoreLabel> doc      = dociter.Current;
                IList <CoreLabel> sentcore = new List <CoreLabel>();
                int tokenindex             = 0;
                foreach (CoreLabel l in doc)
                {
                    string word = l.Word();
                    if (word.Equals(CoNLLDocumentReaderAndWriter.Boundary) || word.Equals("-DOCSTART-"))
                    {
                        // Sentence boundary: emit what has been collected, if anything.
                        if (sentcore.Count > 0)
                        {
                            num++;
                            string docid = sentIDprefix + "-" + num.ToString();
                            sents[docid] = DataInstance.GetNewSurfaceInstance(sentcore);
                            sentcore     = new List <CoreLabel>();
                            tokenindex   = 0;
                        }
                        continue;
                    }
                    tokenindex++;
                    l.Set(typeof(CoreAnnotations.IndexAnnotation), tokenindex);
                    l.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                    string label = l.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    System.Diagnostics.Debug.Assert(label != null, "label cannot be null");
                    l.Set(typeof(CoreAnnotations.TextAnnotation), word);
                    l.Set(typeof(CoreAnnotations.OriginalTextAnnotation), word);
                    if (setGoldClass)
                    {
                        l.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                    }
                    // Single keyed lookup; the null guard keeps a missing label (only asserted in
                    // debug builds) from throwing inside the dictionary.
                    Type annotationClass;
                    if (setClassForTheseLabels != null && label != null && setClassForTheseLabels.TryGetValue(label, out annotationClass))
                    {
                        l.Set(annotationClass, label);
                    }
                    sentcore.Add(l);
                }
                // Emit the trailing sentence when the document does not end with a boundary.
                if (sentcore.Count > 0)
                {
                    num++;
                    string docid = sentIDprefix + "-" + num.ToString();
                    sents[docid] = DataInstance.GetNewSurfaceInstance(sentcore);
                }
            }
            return(sents);
        }
 /// <summary>
 /// Stores the flags and creates the iterator factory that splits input on <c>DOC</c>
 /// elements and parses each with a <c>MUCDocumentParser</c>.
 /// </summary>
 /// <param name="flags">Flags stored on this reader/writer.</param>
 public virtual void Init(SeqClassifierFlags flags)
 {
     MUCDocumentReaderAndWriter.MUCDocumentParser parser = new MUCDocumentReaderAndWriter.MUCDocumentParser();

     this.flags = flags;
     this.factory = XMLBeginEndIterator.GetFactory("DOC", parser, true, true);
 }
Пример #26
0
 /// <summary>Creates the classifier; all initialization is done by the base constructor.</summary>
 /// <param name="flags">Flags forwarded to the base constructor.</param>
 public CRFClassifierNoisyLabel(SeqClassifierFlags flags) : base(flags)
 {
     // Nothing beyond the base-class setup.
 }
 /// <summary>
 /// Convenience overload that uses <c>QuadraticPrior</c> as the prior and otherwise forwards
 /// every argument unchanged to the full constructor.
 /// </summary>
 internal CRFNonLinearSecondOrderLogConditionalObjectiveFunction(int[][][][] data, int[][] labels, int window, IIndex <string> classIndex, IList <IIndex <CRFLabel> > labelIndices, int[] map, SeqClassifierFlags flags, int numNodeFeatures, int numEdgeFeatures)
     : this(data, labels, window, classIndex, labelIndices, map, QuadraticPrior, flags, numNodeFeatures, numEdgeFeatures)
 {
     // All work happens in the chained constructor.
 }
        /// <summary>
        /// Command-line entry point: segments standard input line by line with a
        /// <c>MaxMatchSegmenter</c> and prints the result through a
        /// <c>Sighan2005DocumentReaderAndWriter</c>.
        /// </summary>
        /// <remarks>
        /// Recognized properties: <c>lexicon</c> (required; the process exits with status 1
        /// when it is missing), <c>greedy</c> (greedy segmentation) and <c>maxwords</c>
        /// (lattice segmentation with the max-words heuristic). When neither <c>greedy</c> nor
        /// <c>maxwords</c> is present, max-match lattice segmentation is used.
        /// </remarks>
        /// <param name="args">Command-line arguments, converted to properties.</param>
        public static void Main(string[] args)
        {
            Properties props = StringUtils.ArgsToProperties(args);
            // logger.debug(props.toString());
            SeqClassifierFlags flags       = new SeqClassifierFlags(props);
            MaxMatchSegmenter  seg         = new MaxMatchSegmenter();
            string             lexiconFile = props.GetProperty("lexicon");

            if (lexiconFile != null)
            {
                seg.AddLexicon(lexiconFile);
            }
            else
            {
                logger.Error("Error: no lexicon file!");
                System.Environment.Exit(1);
            }
            Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();

            sighanRW.Init(flags);
            BufferedReader br      = new BufferedReader(new InputStreamReader(Runtime.@in));
            PrintWriter    stdoutW = new PrintWriter(System.Console.Out);
            int            lineNb  = 0;

            // The segmentation mode cannot change while reading, so look it up once.
            bool greedy   = props.GetProperty("greedy") != null;
            bool maxWords = props.GetProperty("maxwords") != null;

            try
            {
                while (true)
                {
                    ++lineNb;
                    logger.Info("line: " + lineNb);
                    try
                    {
                        string line = br.ReadLine();
                        if (line == null)
                        {
                            break;
                        }
                        string outputLine;
                        if (greedy)
                        {
                            List <Word> sentence = seg.GreedilySegmentWords(line);
                            outputLine = SentenceUtils.ListToString(sentence);
                        }
                        else if (maxWords)
                        {
                            seg.BuildSegmentationLattice(line);
                            outputLine = SentenceUtils.ListToString(seg.SegmentWords(MaxMatchSegmenter.MatchHeuristic.Maxwords));
                        }
                        else
                        {
                            seg.BuildSegmentationLattice(line);
                            outputLine = SentenceUtils.ListToString(seg.MaxMatchSegmentation());
                        }
                        StringReader strR = new StringReader(outputLine);
                        IEnumerator <IList <CoreLabel> > itr = sighanRW.GetIterator(strR);
                        while (itr.MoveNext())
                        {
                            sighanRW.PrintAnswers(itr.Current, stdoutW);
                        }
                    }
                    catch (IOException e)
                    {
                        // Still stop at the first I/O failure, but report it rather than
                        // ending silently.
                        logger.Error("Error: I/O failure at line " + lineNb + ": " + e);
                        break;
                    }
                }
            }
            finally
            {
                // Emit whatever was produced, even if an unexpected exception ends the loop.
                stdoutW.Flush();
            }
        }
Пример #29
0
 /// <summary>
 /// Required, but unused: this implementation performs no initialization.
 /// </summary>
 /// <param name="flags">Ignored by this implementation.</param>
 public virtual void Init(SeqClassifierFlags flags)
 {
 }
Пример #30
0
        /// <summary>
        /// Rebuilds the segmented text of a sentence from its per-character labels.
        /// </summary>
        /// <remarks>
        /// For every label the method decides whether a separator goes in front of the
        /// character, then appends the label's <c>OriginalCharAnnotation</c>.
        /// <list type="bullet">
        /// <item><description>A character whose <c>AnswerAnnotation</c> is "1" and whose
        /// <c>PositionAnnotation</c> is not "0" starts a new word and gets a separator by
        /// default. With <c>flags.keepEnglishWhitespaces</c> the separator is suppressed between
        /// two ASCII letters that had no space between them in the input.</description></item>
        /// <item><description>Any other character gets no separator by default. One is added when
        /// <c>flags.separateASCIIandRange</c> is set and exactly one of the two neighbouring
        /// characters is below code point 128 (unless both are numbers), or when
        /// <c>flags.keepEnglishWhitespaces</c> is set and an original space sat between
        /// letter/letter, letter/number or number/letter neighbours.</description></item>
        /// <item><description>With <c>flags.keepAllWhitespaces</c>, an original space before a
        /// character whose position is not "0" is always restored.</description></item>
        /// </list>
        /// Separators that correspond to an original space are first written as the
        /// placeholder U+1924 and turned into ordinary spaces before returning. When
        /// <c>flags.sighanPostProcessing</c> is set the text goes through
        /// <c>PostProcessingAnswer</c>; the placeholders are converted beforehand unless
        /// <c>flags.keepAllWhitespaces</c> is set (presumably so that post-processing cannot
        /// remove those spaces - confirm against <c>PostProcessingAnswer</c>).
        /// </remarks>
        /// <param name="doc">The sentence as a sequence of per-character labels.</param>
        /// <param name="flags">Options controlling whitespace handling and post-processing.</param>
        /// <returns>The characters of <paramref name="doc"/> with word separators inserted.</returns>
        public static string CombineSegmentedSentence(IList <CoreLabel> doc, SeqClassifierFlags flags)
        {
            // A Limbu character standing in for "a space that was in the original input";
            // it is changed to a real space before the method returns.
            const char OriginalSpaceMarker = '\u1924';

            // The actual output we will return.
            StringBuilder ans  = new StringBuilder();
            CoreLabel     prev = null;
            int           idx  = 0;

            foreach (CoreLabel wi in doc)
            {
                bool originalWhiteSpace = "1".Equals(wi.Get(typeof(CoreAnnotations.SpaceBeforeAnnotation)));
                bool seg;

                // If the CRF says "START" (segmented), and it's not the first word...
                if (wi.Get(typeof(CoreAnnotations.AnswerAnnotation)).Equals("1") && !("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())))
                {
                    // Since it's in the "1" condition, the default is to segment.
                    seg = true;

                    // Check whether we need to preserve the "no space" between English characters.
                    if (flags.keepEnglishWhitespaces && idx > 0)
                    {
                        char prevChar = prev.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                        char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                        if (IsLetterASCII(prevChar) && IsLetterASCII(currChar) && !originalWhiteSpace)
                        {
                            // Keep the "non space" before wi.
                            seg = false;
                        }
                    }
                    // If there was a space and keepAllWhitespaces is true, restore it no matter what.
                    if (flags.keepAllWhitespaces && originalWhiteSpace)
                    {
                        seg = true;
                    }
                }
                else
                {
                    // Since it's in the "0" condition, the default is not to segment.
                    seg = false;

                    if (idx > 0)
                    {
                        char prevChar = prev.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];
                        char currChar = wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation))[0];

                        // Words consisting of English/ASCII characters are separated from the
                        // surrounding Chinese characters. An ASCII number next to a non-ASCII
                        // number is the one mixed pair that is left alone.
                        bool mixedAscii = (prevChar < (char)128) != (currChar < (char)128);
                        if (mixedAscii && !(ChineseUtils.IsNumber(prevChar) && ChineseUtils.IsNumber(currChar)) && flags.separateASCIIandRange)
                        {
                            seg = true;
                        }

                        if (flags.keepEnglishWhitespaces)
                        {
                            bool englishBoundary = IsLetterASCII(prevChar) && IsLetterASCII(currChar) || IsLetterASCII(prevChar) && ChineseUtils.IsNumber(currChar) || ChineseUtils.IsNumber(prevChar) && IsLetterASCII(currChar);
                            if (englishBoundary && originalWhiteSpace)
                            {
                                // Keep the "space" before wi.
                                seg = true;
                            }
                        }
                    }
                    // If there was a space and keepAllWhitespaces is true, restore it no matter what.
                    if (flags.keepAllWhitespaces)
                    {
                        if (!("0".Equals(wi.Get(typeof(CoreAnnotations.PositionAnnotation)).ToString())) && originalWhiteSpace)
                        {
                            seg = true;
                        }
                    }
                }

                if (seg)
                {
                    // Remember whether this separator was a real space in the input.
                    ans.Append(originalWhiteSpace ? OriginalSpaceMarker : ' ');
                }
                ans.Append(wi.Get(typeof(CoreAnnotations.OriginalCharAnnotation)));

                prev = wi;
                idx++;
            }

            string ansStr = ans.ToString();

            if (flags.sighanPostProcessing)
            {
                if (!flags.keepAllWhitespaces)
                {
                    // Remove the Limbu char now, so it can be deleted in postprocessing.
                    ansStr = ansStr.Replace(OriginalSpaceMarker, ' ');
                }
                ansStr = PostProcessingAnswer(ansStr, flags);
            }
            // Definitely remove the Limbu char if it survived till now.
            ansStr = ansStr.Replace(OriginalSpaceMarker, ' ');
            return ansStr;
        }