Esempio n. 1
0
 /// <summary>Obtain a segmenter by loading a trained model or by training a new one.</summary>
 /// <remarks>
 /// A segmenter is constructed from <paramref name="options"/>. If its flags carry
 /// no input encoding, the "file.encoding" runtime property is used instead.
 /// A <c>loadClassifier</c> flag takes precedence and the model is loaded from it.
 /// Otherwise a <c>trainFile</c> flag triggers training, after which the model is
 /// serialized when <c>serializeTo</c> is set. If neither flag is present, the
 /// usage text is logged and the process exits with status -1.
 /// </remarks>
 /// <param name="options">Properties to specify segmenter behavior</param>
 /// <returns>the trained or loaded model</returns>
 public static Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter GetSegmenter(Properties options)
 {
     Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter result =
         new Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter(options);

     // Fall back to the runtime's default encoding when none was configured.
     if (result.flags.inputEncoding == null)
     {
         result.flags.inputEncoding = Runtime.GetProperty("file.encoding");
     }

     // Case 1: a trained model was specified, so load it and finish.
     if (result.flags.loadClassifier != null)
     {
         result.LoadSegmenter(result.flags.loadClassifier, options);
         return result;
     }

     // Case 2: nothing to load and nothing to train from, so report usage and quit.
     if (result.flags.trainFile == null)
     {
         log.Info("No training file or trained model specified!");
         log.Info(Usage());
         System.Environment.Exit(-1);
         return result;
     }

     // Case 3: train a fresh model and optionally persist it.
     result.Train();
     if (result.flags.serializeTo != null)
     {
         result.SerializeSegmenter(result.flags.serializeTo);
         log.Info("Serialized segmenter to: " + result.flags.serializeTo);
     }
     return result;
 }
Esempio n. 2
0
 /// <summary>Copy constructor.</summary>
 /// <remarks>
 /// The configuration fields, the flags object and the classifier are taken over
 /// from <paramref name="other"/> by plain assignment; only the tokenizer factory
 /// is obtained anew through <c>GetTokenizerFactory()</c>.
 /// </remarks>
 /// <param name="other">The segmenter whose settings are copied.</param>
 public ArabicSegmenter(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter other)
 {
     // Carry over the configuration of the source segmenter.
     this.isTokenized = other.isTokenized;
     this.tokenizerOptions = other.tokenizerOptions;
     this.prefixMarker = other.prefixMarker;
     this.suffixMarker = other.suffixMarker;
     this.tedEvalPrefix = other.tedEvalPrefix;
     this.hasDomainLabels = other.hasDomainLabels;
     this.domain = other.domain;
     this.noRewrites = other.noRewrites;
     this.flags = other.flags;

     // The tokenizer factory is not thread-safe, so this instance gets its own.
     tf = GetTokenizerFactory();

     // The CRF classifier is thread-safe, so the same reference can be shared.
     this.classifier = other.classifier;
 }
Esempio n. 3
0
        /// <summary>Segment everything read from the input and print the result to the output writer.</summary>
        /// <remarks>
        /// With more than one thread, lines are fed to a <c>MulticoreWrapper</c> and
        /// results are printed as soon as they are available; an <c>IOException</c>
        /// raised in that loop is logged as a warning and not rethrown. With a single
        /// thread the work is delegated to <c>segmenter.Segment(br, pwOut)</c>.
        /// </remarks>
        /// <param name="segmenter">Segmenter that processes the text.</param>
        /// <param name="br">Reader supplying the input.</param>
        /// <param name="pwOut">Writer receiving the segmented output.</param>
        /// <param name="nThreads">Number of threads to use; asserted to be positive.</param>
        /// <returns>input characters processed per second</returns>
        private static double Decode(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter segmenter, BufferedReader br, PrintWriter pwOut, int nThreads)
        {
            System.Diagnostics.Debug.Assert(nThreads > 0);
            long charCount = 0;
            long begin = Runtime.NanoTime();

            if (nThreads <= 1)
            {
                // Single-threaded: the segmenter reads and writes by itself.
                charCount = segmenter.Segment(br, pwOut);
            }
            else
            {
                MulticoreWrapper<string, string> pool = new MulticoreWrapper<string, string>(nThreads, segmenter);
                try
                {
                    string text = br.ReadLine();
                    while (text != null)
                    {
                        charCount += text.Length;
                        pool.Put(text);
                        // Emit whatever results are already available.
                        while (pool.Peek())
                        {
                            pwOut.Println(pool.Poll());
                        }
                        text = br.ReadLine();
                    }

                    // Input exhausted: wait for the workers, then drain the rest.
                    pool.Join();
                    while (pool.Peek())
                    {
                        pwOut.Println(pool.Poll());
                    }
                }
                catch (IOException ioe)
                {
                    log.Warn(ioe);
                }
            }

            long elapsedNanos = Runtime.NanoTime() - begin;
            double elapsedSeconds = elapsedNanos / 1000000000.0;
            return (double)charCount / elapsedSeconds;
        }
Esempio n. 4
0
        /// <summary>Command-line entry point for the segmenter.</summary>
        /// <remarks>
        /// Logs the usage text and exits with status -1 when the "help" option is
        /// present or no arguments are given. Otherwise a segmenter is obtained via
        /// <c>GetSegmenter</c>. If a test file is configured, an evaluation is run;
        /// if not, text from the configured text file (or standard input when none
        /// is set) is decoded and the throughput is reported on standard error.
        /// </remarks>
        /// <param name="args">Command-line arguments, converted to properties.</param>
        public static void Main(string[] args)
        {
            // Strips off hyphens
            Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());

            bool showUsage = options.Contains("help") || args.Length == 0;
            if (showUsage)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }

            int nThreads = PropertiesUtils.GetInt(options, "nthreads", 1);
            Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter segmenter = GetSegmenter(options);

            // Decode either an evaluation file or raw text
            try
            {
                // Output encoding wins; the input encoding is the fallback; with
                // neither set, the console writer is wrapped directly.
                PrintWriter pwOut;
                if (segmenter.flags.outputEncoding != null)
                {
                    pwOut = new PrintWriter(new OutputStreamWriter(System.Console.Out, segmenter.flags.outputEncoding), true);
                }
                else if (segmenter.flags.inputEncoding != null)
                {
                    pwOut = new PrintWriter(new OutputStreamWriter(System.Console.Out, segmenter.flags.inputEncoding), true);
                }
                else
                {
                    pwOut = new PrintWriter(System.Console.Out, true);
                }

                if (segmenter.flags.testFile == null)
                {
                    // No test file: segment raw text from the text file or stdin.
                    BufferedReader br;
                    if (segmenter.flags.textFile == null)
                    {
                        br = IOUtils.ReaderFromStdin();
                    }
                    else
                    {
                        br = IOUtils.ReaderFromString(segmenter.flags.textFile, segmenter.flags.inputEncoding);
                    }
                    double rate = Decode(segmenter, br, pwOut, nThreads);
                    IOUtils.CloseIgnoringExceptions(br);
                    System.Console.Error.Printf("Done! Processed input text at %.2f input characters/second%n", rate);
                }
                else if (segmenter.flags.answerFile == null)
                {
                    segmenter.Evaluate(pwOut);
                }
                else
                {
                    Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter.EvaluateRawText(pwOut);
                }
            }
            catch (UnsupportedEncodingException encodingError)
            {
                Sharpen.Runtime.PrintStackTrace(encodingError);
            }
            catch (IOException)
            {
                System.Console.Error.Printf("%s: Could not open %s%n", typeof(Edu.Stanford.Nlp.International.Arabic.Process.ArabicSegmenter).FullName, segmenter.flags.textFile);
            }
        }