Пример #1
0
        /// <summary>Creates an ArabicTokenizer.</summary>
        /// <remarks>
        /// Creates an ArabicTokenizer. The default tokenizer
        /// is ArabicTokenizer.atbFactory(), which produces the
        /// same orthographic normalization as Green and Manning (2010).
        /// </remarks>
        /// <returns>A TokenizerFactory that produces each Arabic token as a CoreLabel</returns>
        private ITokenizerFactory <CoreLabel> GetTokenizerFactory()
        {
            ITokenizerFactory <CoreLabel> tokFactory = null;

            if (!isTokenized)
            {
                if (tokenizerOptions == null)
                {
                    tokFactory = ArabicTokenizer.AtbFactory();
                    string atbVocOptions = "removeProMarker,removeMorphMarker,removeLengthening";
                    tokFactory.SetOptions(atbVocOptions);
                }
                else
                {
                    if (tokenizerOptions.Contains("removeSegMarker"))
                    {
                        throw new Exception("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
                    }
                    tokFactory = ArabicTokenizer.Factory();
                    tokFactory.SetOptions(tokenizerOptions);
                }
                log.Info("Loaded ArabicTokenizer with options: " + tokenizerOptions);
            }
            return(tokFactory);
        }
        /// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
        /// <remarks>
        /// A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).
        /// Performs punctuation splitting and light tokenization by default.
        /// Orthographic normalization options are available, and can be enabled with
        /// command line options.
        /// <p>
        /// Currently, this tokenizer does not do line splitting. It normalizes non-printing
        /// line separators across platforms and prints the system default line splitter
        /// to the output.
        /// <p>
        /// The following normalization options are provided:
        /// <ul>
        /// <li>
        /// <c>useUTF8Ellipsis</c>
        /// : Replaces sequences of three or more full stops with \u2026</li>
        /// <li>
        /// <c>normArDigits</c>
        /// : Convert Arabic digits to ASCII equivalents</li>
        /// <li>
        /// <c>normArPunc</c>
        /// : Convert Arabic punctuation to ASCII equivalents</li>
        /// <li>
        /// <c>normAlif</c>
        /// : Change all alif forms to bare alif</li>
        /// <li>
        /// <c>normYa</c>
        /// : Map ya to alif maqsura</li>
        /// <li>
        /// <c>removeDiacritics</c>
        /// : Strip all diacritics</li>
        /// <li>
        /// <c>removeTatweel</c>
        /// : Strip tatweel elongation character</li>
        /// <li>
        /// <c>removeQuranChars</c>
        /// : Remove diacritics that appear in the Quran</li>
        /// <li>
        /// <c>removeProMarker</c>
        /// : Remove the ATB null pronoun marker</li>
        /// <li>
        /// <c>removeSegMarker</c>
        /// : Remove the ATB clitic segmentation marker</li>
        /// <li>
        /// <c>removeMorphMarker</c>
        /// : Remove the ATB morpheme boundary markers</li>
        /// <li>
        /// <c>removeLengthening</c>
        /// : Replace all sequences of three or more identical (non-period) characters with one copy</li>
        /// <li>
        /// <c>atbEscaping</c>
        /// : Replace left/right parentheses with ATB escape characters</li>
        /// </ul>
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length > 0 && args[0].Contains("help"))
            {
                System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
                System.Console.Error.Printf("%nOptions:%n");
                log.Info("   -help : Print this message. See javadocs for all normalization options.");
                log.Info("   -atb  : Tokenization for the parsing experiments in Green and Manning (2010)");
                System.Environment.Exit(-1);
            }
            // Process normalization options
            Properties tokenizerOptions      = StringUtils.ArgsToProperties(args);
            ITokenizerFactory <CoreLabel> tf = tokenizerOptions.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory();

            foreach (string option in tokenizerOptions.StringPropertyNames())
            {
                tf.SetOptions(option);
            }
            // Replace line separators with a token so that we can
            // count lines
            tf.SetOptions("tokenizeNLs");
            // Read the file
            int nLines  = 0;
            int nTokens = 0;

            try
            {
                string encoding = "UTF-8";
                ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
                bool printSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++nTokens;
                    string word = tokenizer.Current.Word();
                    if (word.Equals(ArabicLexer.NewlineToken))
                    {
                        ++nLines;
                        printSpace = false;
                        System.Console.Out.WriteLine();
                    }
                    else
                    {
                        if (printSpace)
                        {
                            System.Console.Out.Write(" ");
                        }
                        System.Console.Out.Write(word);
                        printSpace = true;
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
        }
Пример #3
0
        /// <summary>
        /// arg[0] := tokenizer options
        /// args[1] := file to tokenize
        /// </summary>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
                System.Environment.Exit(-1);
            }
            string tokOptions = args[0];
            File   path       = new File(args[1]);

            log.Info("Reading from: " + path.GetPath());
            try
            {
                BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
                ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.Factory();
                tf.SetOptions(tokOptions);
                IMapper lexMapper = new DefaultLexicalMapper();
                lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
                int lineId = 0;
                for (string line; (line = br.ReadLine()) != null; lineId++)
                {
                    line = line.Trim();
                    // Tokenize with the tokenizer
                    IList <CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
                    System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
                    // Tokenize with the mapper
                    StringBuilder sb   = new StringBuilder();
                    string[]      toks = line.Split("\\s+");
                    foreach (string tok in toks)
                    {
                        string mappedTok = lexMapper.Map(null, tok);
                        sb.Append(mappedTok).Append(" ");
                    }
                    IList <string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
                    // Evaluate the output
                    if (mappedToks.Count != tokenizedLine.Count)
                    {
                        System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                    }
                    else
                    {
                        bool printLines = false;
                        for (int i = 0; i < mappedToks.Count; ++i)
                        {
                            string mappedTok    = mappedToks[i];
                            string tokenizedTok = tokenizedLine[i].Word();
                            if (!mappedTok.Equals(tokenizedTok))
                            {
                                System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                                printLines = true;
                            }
                        }
                        if (printLines)
                        {
                            System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                        }
                    }
                }
                System.Console.Error.Printf("Read %d lines.%n", lineId);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }