// Example 1
        /// <summary>Builds the tokenizer factory used by the segmenter.</summary>
        /// <remarks>
        /// Builds the tokenizer factory used by the segmenter. When no explicit
        /// tokenizer options are configured, the default is ArabicTokenizer.AtbFactory(),
        /// which reproduces the orthographic normalization of Green and Manning (2010).
        /// </remarks>
        /// <returns>A TokenizerFactory that produces each Arabic token as a CoreLabel,
        /// or null when the input is already tokenized</returns>
        private ITokenizerFactory<CoreLabel> GetTokenizerFactory()
        {
            // Pre-tokenized input needs no tokenizer at all.
            if (isTokenized)
            {
                return null;
            }
            ITokenizerFactory<CoreLabel> factory;
            if (tokenizerOptions == null)
            {
                // Default: ATB tokenization with the Green & Manning (2010) normalizations.
                factory = ArabicTokenizer.AtbFactory();
                factory.SetOptions("removeProMarker,removeMorphMarker,removeLengthening");
            }
            else
            {
                // The segmenter itself emits segmentation markers, so stripping them
                // at tokenization time is not allowed.
                if (tokenizerOptions.Contains("removeSegMarker"))
                {
                    throw new Exception("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
                }
                factory = ArabicTokenizer.Factory();
                factory.SetOptions(tokenizerOptions);
            }
            log.Info("Loaded ArabicTokenizer with options: " + tokenizerOptions);
            return factory;
        }
        /// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
        /// <remarks>
        /// Command-line entry point for the rule-based Modern Standard Arabic tokenizer
        /// (UTF-8 encoding). By default it performs punctuation splitting and light
        /// tokenization; orthographic normalization can be enabled via command-line options.
        /// <p>
        /// Line splitting is not performed: non-printing line separators are normalized
        /// across platforms and the system default line separator is written to the output.
        /// <p>
        /// Available normalization options:
        /// <ul>
        /// <li>
        /// <c>useUTF8Ellipsis</c>
        /// : Replaces sequences of three or more full stops with \u2026</li>
        /// <li>
        /// <c>normArDigits</c>
        /// : Convert Arabic digits to ASCII equivalents</li>
        /// <li>
        /// <c>normArPunc</c>
        /// : Convert Arabic punctuation to ASCII equivalents</li>
        /// <li>
        /// <c>normAlif</c>
        /// : Change all alif forms to bare alif</li>
        /// <li>
        /// <c>normYa</c>
        /// : Map ya to alif maqsura</li>
        /// <li>
        /// <c>removeDiacritics</c>
        /// : Strip all diacritics</li>
        /// <li>
        /// <c>removeTatweel</c>
        /// : Strip tatweel elongation character</li>
        /// <li>
        /// <c>removeQuranChars</c>
        /// : Remove diacritics that appear in the Quran</li>
        /// <li>
        /// <c>removeProMarker</c>
        /// : Remove the ATB null pronoun marker</li>
        /// <li>
        /// <c>removeSegMarker</c>
        /// : Remove the ATB clitic segmentation marker</li>
        /// <li>
        /// <c>removeMorphMarker</c>
        /// : Remove the ATB morpheme boundary markers</li>
        /// <li>
        /// <c>removeLengthening</c>
        /// : Replace all sequences of three or more identical (non-period) characters with one copy</li>
        /// <li>
        /// <c>atbEscaping</c>
        /// : Replace left/right parentheses with ATB escape characters</li>
        /// </ul>
        /// </remarks>
        /// <param name="args">Command-line flags; each parsed property name is applied as a tokenizer option</param>
        public static void Main(string[] args)
        {
            // Print usage and exit when any form of -help is requested.
            if (args.Length > 0 && args[0].Contains("help"))
            {
                System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
                System.Console.Error.Printf("%nOptions:%n");
                log.Info("   -help : Print this message. See javadocs for all normalization options.");
                log.Info("   -atb  : Tokenization for the parsing experiments in Green and Manning (2010)");
                System.Environment.Exit(-1);
            }
            // Turn the command line into normalization options and pick the factory.
            Properties options = StringUtils.ArgsToProperties(args);
            ITokenizerFactory<CoreLabel> factory = options.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory();
            foreach (string name in options.StringPropertyNames())
            {
                factory.SetOptions(name);
            }
            // Newline tokens let us count the input lines while streaming.
            factory.SetOptions("tokenizeNLs");
            // Stream stdin through the tokenizer, printing space-separated tokens.
            int lineCount = 0;
            int tokenCount = 0;
            try
            {
                ITokenizer<CoreLabel> tokenizer = factory.GetTokenizer(new InputStreamReader(Runtime.@in, "UTF-8"));
                bool needSpace = false;
                while (tokenizer.MoveNext())
                {
                    ++tokenCount;
                    string token = tokenizer.Current.Word();
                    if (!token.Equals(ArabicLexer.NewlineToken))
                    {
                        // Tokens on the same line are separated by single spaces.
                        if (needSpace)
                        {
                            System.Console.Out.Write(" ");
                        }
                        System.Console.Out.Write(token);
                        needSpace = true;
                    }
                    else
                    {
                        ++lineCount;
                        needSpace = false;
                        System.Console.Out.WriteLine();
                    }
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", lineCount, tokenCount);
        }
        /// <summary>Checks ATB tokenization of every untokenized input against its reference string.</summary>
        public virtual void TestArabicTokenizer()
        {
            System.Diagnostics.Debug.Assert((untokInputs.Length == tokReferences.Length));
            // ATB factory with the marker-stripping options used by the references.
            ITokenizerFactory<CoreLabel> factory = ArabicTokenizer.AtbFactory();
            factory.SetOptions("removeProMarker");
            factory.SetOptions("removeSegMarker");
            factory.SetOptions("removeMorphMarker");
            int i = 0;
            foreach (string input in untokInputs)
            {
                IList<CoreLabel> tokens = factory.GetTokenizer(new StringReader(input)).Tokenize();
                string tokenized = SentenceUtils.ListToString(tokens);
                NUnit.Framework.Assert.AreEqual("Tokenization deviates from reference", tokReferences[i], tokenized);
                ++i;
            }
        }
        /// <summary>Verifies character begin/end offsets produced by the ATB tokenizer.</summary>
        /// <remarks>
        /// Tokenizes a single vocalized Arabic string and checks that each token's
        /// character offsets into the original input match the hand-computed references.
        /// </remarks>
        public virtual void TestCharOffsets()
        {
            string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";
            // Reference offsets into untokInput, one pair per expected token.
            int[] beginOffsets = new int[] { 0, 7, 11, 22 };
            int[] endOffsets   = new int[] { 6, 10, 21, 23 };
            ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.AtbFactory();

            tf.SetOptions("removeProMarker");
            tf.SetOptions("removeSegMarker");
            tf.SetOptions("removeMorphMarker");
            ITokenizer <CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
            IList <CoreLabel>      tokens    = tokenizer.Tokenize();

            // Fix: expected value (beginOffsets.Length) now comes before the actual value
            // (tokens.Count), matching the argument order of the offset assertions below;
            // previously a failure would report the two values swapped.
            NUnit.Framework.Assert.AreEqual("Number of tokens doesn't match reference", beginOffsets.Length, tokens.Count);
            for (int i = 0; i < beginOffsets.Length; i++)
            {
                NUnit.Framework.Assert.AreEqual("Char begin offset deviates from reference", beginOffsets[i], tokens[i].BeginPosition());
                NUnit.Framework.Assert.AreEqual("Char end offset deviates from reference", endOffsets[i], tokens[i].EndPosition());
            }
        }
// Example 5
        /// <summary>For debugging.</summary>
        /// <remarks>
        /// Reads a file of whitespace-separated word/tag pairs (joined by tagDelimiter),
        /// re-tokenizes each word with the ATB tokenizer, and writes the resulting words,
        /// one space-separated output line per input line, to stdout. When a raw token
        /// produces multiple segments, only the first segment is kept (with a special
        /// case for a trailing segmentation marker in the vocalized section).
        /// </remarks>
        /// <param name="args">args[0] is the input file name</param>
        /// <exception cref="System.IO.IOException"></exception>
        public static void Main(string[] args)
        {
            if (args.Length != 1)
            {
                System.Console.Error.Printf("Usage: java %s file > output%n", typeof(ArabicDocumentReaderAndWriter).FullName);
                System.Environment.Exit(-1);
            }
            string fileName = args[0];
            ITokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.AtbFactory();
            string atbVocOptions = "removeProMarker,removeMorphMarker";
            tokFactory.SetOptions(atbVocOptions);
            BufferedReader reader = IOUtils.ReaderFromString(fileName);
            // Fix: tagDelimiter never changes, so quote it once up front instead of
            // recomputing the quoted pattern for every input line.
            string delim = Pattern.Quote(tagDelimiter);
            for (string line; (line = reader.ReadLine()) != null;)
            {
                string[] toks = line.Split("\\s+");
                bool isStart = true;
                foreach (string wordTag in toks)
                {
                    string[] wordTagPair = wordTag.Split(delim);
                    System.Diagnostics.Debug.Assert(wordTagPair.Length == 2);
                    string word = wordTagPair[0];
                    if (tokFactory != null)
                    {
                        IList<CoreLabel> lexList = tokFactory.GetTokenizer(new StringReader(word)).Tokenize();
                        if (lexList.Count == 0)
                        {
                            // The tokenizer consumed the whole token (e.g. a bare marker): skip it.
                            continue;
                        }
                        if (lexList.Count == 1)
                        {
                            word = lexList[0].Value();
                        }
                        else
                        {
                            // lexList.Count > 1 is guaranteed here (0 and 1 handled above).
                            string secondWord = lexList[1].Value();
                            if (secondWord.Equals(DefaultSegMarker.ToString()))
                            {
                                // Special case for the null marker in the vocalized section
                                word = lexList[0].Value() + DefaultSegMarker.ToString();
                            }
                            else
                            {
                                System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", typeof(ArabicDocumentReaderAndWriter).FullName, word);
                                word = lexList[0].Value();
                            }
                        }
                    }
                    if (!isStart)
                    {
                        System.Console.Out.Write(" ");
                    }
                    System.Console.Out.Write(word);
                    isStart = false;
                }
                System.Console.Out.WriteLine();
            }
        }
// Example 6
        /// <summary>
        /// arg[0] := tokenizer options
        /// args[1] := file to tokenize
        /// </summary>
        /// <remarks>
        /// Tokenizes each line of the input file twice -- once with the ArabicTokenizer
        /// and once with the DefaultLexicalMapper -- and reports any line-length or
        /// per-token disagreement between the two on stderr.
        /// </remarks>
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
                System.Environment.Exit(-1);
            }
            string tokOptions = args[0];
            File   path       = new File(args[1]);

            log.Info("Reading from: " + path.GetPath());
            // Fix: br is declared outside the try so it can be closed in the finally
            // block; previously the reader was never closed, leaking the file handle.
            BufferedReader br = null;
            try
            {
                br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
                ITokenizerFactory <CoreLabel> tf = ArabicTokenizer.Factory();
                tf.SetOptions(tokOptions);
                // The mapper strips segmentation/morpheme markers so that its output is
                // comparable to the tokenizer's.
                IMapper lexMapper = new DefaultLexicalMapper();
                lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
                int lineId = 0;
                for (string line; (line = br.ReadLine()) != null; lineId++)
                {
                    line = line.Trim();
                    // Tokenize with the tokenizer
                    IList <CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
                    System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
                    // Tokenize with the mapper
                    StringBuilder sb   = new StringBuilder();
                    string[]      toks = line.Split("\\s+");
                    foreach (string tok in toks)
                    {
                        string mappedTok = lexMapper.Map(null, tok);
                        sb.Append(mappedTok).Append(" ");
                    }
                    IList <string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
                    // Evaluate the output: first by token count, then token-by-token.
                    if (mappedToks.Count != tokenizedLine.Count)
                    {
                        System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                    }
                    else
                    {
                        bool printLines = false;
                        for (int i = 0; i < mappedToks.Count; ++i)
                        {
                            string mappedTok    = mappedToks[i];
                            string tokenizedTok = tokenizedLine[i].Word();
                            if (!mappedTok.Equals(tokenizedTok))
                            {
                                System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                                printLines = true;
                            }
                        }
                        if (printLines)
                        {
                            System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                        }
                    }
                }
                System.Console.Error.Printf("Read %d lines.%n", lineId);
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                if (br != null)
                {
                    try
                    {
                        br.Close();
                    }
                    catch (IOException)
                    {
                        // Best-effort close; a failure here is not actionable for this
                        // debugging utility.
                    }
                }
            }
        }