/// <summary>Creates an ArabicTokenizer.</summary>
/// <remarks>
/// Creates an ArabicTokenizer. The default tokenizer
/// is ArabicTokenizer.atbFactory(), which produces the
/// same orthographic normalization as Green and Manning (2010).
/// </remarks>
/// <returns>
/// A TokenizerFactory that produces each Arabic token as a CoreLabel,
/// or null when the input is already tokenized (isTokenized is set).
/// </returns>
/// <exception cref="System.InvalidOperationException">
/// If tokenizerOptions contains 'removeSegMarker', which conflicts with the segmenter.
/// </exception>
private ITokenizerFactory<CoreLabel> GetTokenizerFactory()
{
    ITokenizerFactory<CoreLabel> tokFactory = null;
    if (!isTokenized)
    {
        // Track the options actually applied so the log line below is accurate
        // on both paths (previously it logged the raw field, which is null on
        // the default ATB path).
        string effectiveOptions;
        if (tokenizerOptions == null)
        {
            // No user options: fall back to ATB tokenization with the default
            // normalization used in Green and Manning (2010).
            tokFactory = ArabicTokenizer.AtbFactory();
            effectiveOptions = "removeProMarker,removeMorphMarker,removeLengthening";
            tokFactory.SetOptions(effectiveOptions);
        }
        else
        {
            if (tokenizerOptions.Contains("removeSegMarker"))
            {
                // The segmenter itself relies on segmentation markers, so
                // stripping them in the tokenizer would corrupt its input.
                // BUGFIX: throw a specific exception type instead of bare Exception
                // (still caught by any existing catch (Exception) handler).
                throw new InvalidOperationException("Option 'removeSegMarker' cannot be used with ArabicSegmenter");
            }
            tokFactory = ArabicTokenizer.Factory();
            effectiveOptions = tokenizerOptions;
            tokFactory.SetOptions(effectiveOptions);
        }
        // BUGFIX: log the options that were actually set, not the (possibly
        // null) tokenizerOptions field.
        log.Info("Loaded ArabicTokenizer with options: " + effectiveOptions);
    }
    return tokFactory;
}
/// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
/// <remarks>
/// A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).
/// Performs punctuation splitting and light tokenization by default.
/// Orthographic normalization options are available, and can be enabled with
/// command line options.
/// <p>
/// Currently, this tokenizer does not do line splitting. It normalizes non-printing
/// line separators across platforms and prints the system default line splitter
/// to the output.
/// <p>
/// The following normalization options are provided:
/// <ul>
/// <li><c>useUTF8Ellipsis</c>: Replaces sequences of three or more full stops with \u2026</li>
/// <li><c>normArDigits</c>: Convert Arabic digits to ASCII equivalents</li>
/// <li><c>normArPunc</c>: Convert Arabic punctuation to ASCII equivalents</li>
/// <li><c>normAlif</c>: Change all alif forms to bare alif</li>
/// <li><c>normYa</c>: Map ya to alif maqsura</li>
/// <li><c>removeDiacritics</c>: Strip all diacritics</li>
/// <li><c>removeTatweel</c>: Strip tatweel elongation character</li>
/// <li><c>removeQuranChars</c>: Remove diacritics that appear in the Quran</li>
/// <li><c>removeProMarker</c>: Remove the ATB null pronoun marker</li>
/// <li><c>removeSegMarker</c>: Remove the ATB clitic segmentation marker</li>
/// <li><c>removeMorphMarker</c>: Remove the ATB morpheme boundary markers</li>
/// <li><c>removeLengthening</c>: Replace all sequences of three or more identical (non-period) characters with one copy</li>
/// <li><c>atbEscaping</c>: Replace left/right parentheses with ATB escape characters</li>
/// </ul>
/// </remarks>
/// <param name="args">Command-line flags; any argument containing "help" prints usage and exits.</param>
public static void Main(string[] args)
{
    // Print usage and exit with a nonzero status if the user asked for help.
    if (args.Length > 0 && args[0].Contains("help"))
    {
        System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
        System.Console.Error.Printf("%nOptions:%n");
        log.Info(" -help : Print this message. See javadocs for all normalization options.");
        log.Info(" -atb : Tokenization for the parsing experiments in Green and Manning (2010)");
        System.Environment.Exit(-1);
    }
    // Process normalization options
    Properties tokenizerOptions = StringUtils.ArgsToProperties(args);
    // NOTE(review): whether Contains matches keys or values depends on the
    // Sharpen Properties shim — presumably a key lookup for "-atb"; confirm.
    ITokenizerFactory<CoreLabel> tf = tokenizerOptions.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory();
    // Each property NAME is forwarded as a tokenizer option; property values
    // (from -opt=val pairs) appear to be ignored here — TODO confirm intended.
    foreach (string option in tokenizerOptions.StringPropertyNames())
    {
        tf.SetOptions(option);
    }
    // Replace line separators with a token so that we can
    // count lines
    tf.SetOptions("tokenizeNLs");
    // Read the file
    int nLines = 0;
    int nTokens = 0;
    try
    {
        string encoding = "UTF-8";
        // Tokenize stdin; the factory produces a MoveNext/Current-style iterator.
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
        // printSpace is false at line starts so no leading space is emitted.
        bool printSpace = false;
        while (tokenizer.MoveNext())
        {
            ++nTokens;
            string word = tokenizer.Current.Word();
            if (word.Equals(ArabicLexer.NewlineToken))
            {
                // Newline pseudo-token: count the line and reset spacing state.
                ++nLines;
                printSpace = false;
                System.Console.Out.WriteLine();
            }
            else
            {
                if (printSpace)
                {
                    System.Console.Out.Write(" ");
                }
                System.Console.Out.Write(word);
                printSpace = true;
            }
        }
    }
    catch (UnsupportedEncodingException e)
    {
        // UTF-8 is universally supported, so this path is effectively unreachable.
        Sharpen.Runtime.PrintStackTrace(e);
    }
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
}
/// <summary>
/// Tokenizes each raw input with the ATB factory (pronoun, segmentation, and
/// morpheme markers removed) and checks the result against the reference line.
/// </summary>
public virtual void TestArabicTokenizer()
{
    System.Diagnostics.Debug.Assert((untokInputs.Length == tokReferences.Length));
    ITokenizerFactory<CoreLabel> factory = ArabicTokenizer.AtbFactory();
    // Options accumulate: strip the ATB null-pronoun, clitic-segmentation,
    // and morpheme-boundary markers before comparing.
    string[] markerOptions = { "removeProMarker", "removeSegMarker", "removeMorphMarker" };
    foreach (string opt in markerOptions)
    {
        factory.SetOptions(opt);
    }
    for (int idx = 0; idx < untokInputs.Length; ++idx)
    {
        string rawInput = untokInputs[idx];
        ITokenizer<CoreLabel> tok = factory.GetTokenizer(new StringReader(rawInput));
        IList<CoreLabel> labels = tok.Tokenize();
        string actual = SentenceUtils.ListToString(labels);
        string expected = tokReferences[idx];
        NUnit.Framework.Assert.AreEqual("Tokenization deviates from reference", expected, actual);
    }
}
/// <summary>
/// Verifies that character begin/end offsets reported on each CoreLabel refer
/// to positions in the ORIGINAL string, even when the tokenizer strips the
/// pronoun (-), segmentation (-), and morpheme (+) markers from the token text.
/// </summary>
public virtual void TestCharOffsets()
{
    string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";
    // Expected offsets into untokInput for each of the four surviving tokens.
    int[] beginOffsets = new int[] { 0, 7, 11, 22 };
    int[] endOffsets = new int[] { 6, 10, 21, 23 };
    ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.AtbFactory();
    tf.SetOptions("removeProMarker");
    tf.SetOptions("removeSegMarker");
    tf.SetOptions("removeMorphMarker");
    ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
    IList<CoreLabel> tokens = tokenizer.Tokenize();
    // BUGFIX: expected (reference length) and actual (token count) were swapped,
    // which produced a misleading failure message; the other asserts in this
    // file pass the reference value first.
    NUnit.Framework.Assert.AreEqual("Number of tokens doesn't match reference", beginOffsets.Length, tokens.Count);
    for (int i = 0; i < beginOffsets.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual("Char begin offset deviates from reference", beginOffsets[i], tokens[i].BeginPosition());
        NUnit.Framework.Assert.AreEqual("Char end offset deviates from reference", endOffsets[i], tokens[i].EndPosition());
    }
}
/// <summary>For debugging.</summary>
/// <remarks>
/// Reads a word/tag file, re-tokenizes each word with the ATB factory
/// (pronoun and morpheme markers removed), and writes the resulting words,
/// space-separated, one line of output per line of input.
/// </remarks>
/// <param name="args">Exactly one argument: the input file name.</param>
/// <exception cref="System.IO.IOException"></exception>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file > output%n", typeof(ArabicDocumentReaderAndWriter).FullName);
        System.Environment.Exit(-1);
    }
    string fileName = args[0];
    ITokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.AtbFactory();
    tokFactory.SetOptions("removeProMarker,removeMorphMarker");
    BufferedReader reader = IOUtils.ReaderFromString(fileName);
    for (string line; (line = reader.ReadLine()) != null;)
    {
        string[] wordTags = line.Split("\\s+");
        string delim = Pattern.Quote(tagDelimiter);
        bool first = true;
        foreach (string wordTag in wordTags)
        {
            string[] pair = wordTag.Split(delim);
            System.Diagnostics.Debug.Assert(pair.Length == 2);
            string word = pair[0];
            if (tokFactory != null)
            {
                IList<CoreLabel> segments = tokFactory.GetTokenizer(new StringReader(word)).Tokenize();
                if (segments.Count == 0)
                {
                    // Token vanished entirely under normalization: emit nothing.
                    continue;
                }
                if (segments.Count == 1)
                {
                    word = segments[0].Value();
                }
                else
                {
                    string secondSegment = segments[1].Value();
                    if (secondSegment.Equals(DefaultSegMarker.ToString()))
                    {
                        // Special case for the null marker in the vocalized section
                        word = segments[0].Value() + DefaultSegMarker.ToString();
                    }
                    else
                    {
                        // Report the raw (pre-segmentation) word, then keep only
                        // the first segment.
                        System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", typeof(ArabicDocumentReaderAndWriter).FullName, word);
                        word = segments[0].Value();
                    }
                }
            }
            if (first)
            {
                first = false;
            }
            else
            {
                System.Console.Out.Write(" ");
            }
            System.Console.Out.Write(word);
        }
        System.Console.Out.WriteLine();
    }
}