/// <summary>
/// Verifies that the ATB Arabic tokenizer reports correct character begin/end
/// offsets into the raw (untokenized) input for each emitted token.
/// </summary>
public virtual void TestCharOffsets()
{
    string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";
    // Reference offsets into untokInput for the four expected tokens.
    int[] beginOffsets = new int[] { 0, 7, 11, 22 };
    int[] endOffsets = new int[] { 6, 10, 21, 23 };
    ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.AtbFactory();
    tf.SetOptions("removeProMarker");
    tf.SetOptions("removeSegMarker");
    tf.SetOptions("removeMorphMarker");
    ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new StringReader(untokInput));
    IList<CoreLabel> tokens = tokenizer.Tokenize();
    // NUnit's Assert.AreEqual takes (expected, actual, message). The JUnit
    // conversion had left the message string in the first (expected) slot,
    // so the assertions compared the message against the values.
    NUnit.Framework.Assert.AreEqual(beginOffsets.Length, tokens.Count, "Number of tokens doesn't match reference");
    for (int i = 0; i < beginOffsets.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual(beginOffsets[i], tokens[i].BeginPosition(), "Char begin offset deviates from reference");
        NUnit.Framework.Assert.AreEqual(endOffsets[i], tokens[i].EndPosition(), "Char end offset deviates from reference");
    }
}
/// <summary>
/// Command-line entry point: tokenizes a file twice — once with the Arabic
/// tokenizer and once with a lexical mapper — and reports any line where the
/// two tokenizations disagree (length mismatch or token mismatch).
/// arg[0] := tokenizer options
/// args[1] := file to tokenize
/// </summary>
/// <param name="args"/>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
        System.Environment.Exit(-1);
    }
    string tokOptions = args[0];
    File path = new File(args[1]);
    log.Info("Reading from: " + path.GetPath());
    try
    {
        // NOTE(review): br is never closed; harmless here since the process
        // exits right after Main, but worth confirming against the Java original.
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        ITokenizerFactory<CoreLabel> tf = ArabicTokenizer.Factory();
        tf.SetOptions(tokOptions);
        // Second tokenization path: a rule-based mapper stripping seg/morph markers.
        IMapper lexMapper = new DefaultLexicalMapper();
        lexMapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
        int lineId = 0;
        for (string line; (line = br.ReadLine()) != null; lineId++)
        {
            line = line.Trim();
            // Tokenize with the tokenizer
            IList<CoreLabel> tokenizedLine = tf.GetTokenizer(new StringReader(line)).Tokenize();
            System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenizedLine));
            // Tokenize with the mapper
            StringBuilder sb = new StringBuilder();
            string[] toks = line.Split("\\s+");
            foreach (string tok in toks)
            {
                string mappedTok = lexMapper.Map(null, tok);
                sb.Append(mappedTok).Append(" ");
            }
            // Re-split the mapped text on whitespace; Trim avoids a trailing empty token.
            IList<string> mappedToks = Arrays.AsList(sb.ToString().Trim().Split("\\s+"));
            // Evaluate the output
            if (mappedToks.Count != tokenizedLine.Count)
            {
                // Different token counts: report the whole line with both tokenizations.
                System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
            }
            else
            {
                // Same length: compare token-by-token and dump the line if any differ.
                bool printLines = false;
                for (int i = 0; i < mappedToks.Count; ++i)
                {
                    string mappedTok = mappedToks[i];
                    string tokenizedTok = tokenizedLine[i].Word();
                    if (!mappedTok.Equals(tokenizedTok))
                    {
                        System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                        printLines = true;
                    }
                }
                if (printLines)
                {
                    System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenizedLine), SentenceUtils.ListToString(mappedToks));
                }
            }
        }
        System.Console.Error.Printf("Read %d lines.%n", lineId);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>For debugging.</summary>
/// <param name="args"/>
/// <exception cref="System.IO.IOException"></exception>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file > output%n", typeof(ArabicDocumentReaderAndWriter).FullName);
        System.Environment.Exit(-1);
    }
    string fileName = args[0];
    ITokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.AtbFactory();
    string atbVocOptions = "removeProMarker,removeMorphMarker";
    tokFactory.SetOptions(atbVocOptions);
    BufferedReader reader = IOUtils.ReaderFromString(fileName);
    for (string line; (line = reader.ReadLine()) != null;)
    {
        string[] wordTags = line.Split("\\s+");
        string delim = Pattern.Quote(tagDelimiter);
        // Tracks whether a token has already been written on this output line,
        // so we only emit a separating space between tokens.
        bool wroteToken = false;
        foreach (string wordTag in wordTags)
        {
            string[] pair = wordTag.Split(delim);
            System.Diagnostics.Debug.Assert(pair.Length == 2);
            string word = pair[0];
            if (tokFactory != null)
            {
                IList<CoreLabel> segments = tokFactory.GetTokenizer(new StringReader(word)).Tokenize();
                if (segments.Count == 0)
                {
                    // Tokenizer dropped the token entirely; skip it.
                    continue;
                }
                else if (segments.Count == 1)
                {
                    word = segments[0].Value();
                }
                else
                {
                    string second = segments[1].Value();
                    if (second.Equals(DefaultSegMarker.ToString()))
                    {
                        // Special case for the null marker in the vocalized section
                        word = segments[0].Value() + DefaultSegMarker.ToString();
                    }
                    else
                    {
                        System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", typeof(ArabicDocumentReaderAndWriter).FullName, word);
                        word = segments[0].Value();
                    }
                }
            }
            if (wroteToken)
            {
                System.Console.Out.Write(" ");
            }
            System.Console.Out.Write(word);
            wroteToken = true;
        }
        System.Console.Out.WriteLine();
    }
}
/// <summary>A fast, rule-based tokenizer for Spanish based on AnCora.</summary>
/// <remarks>
/// A fast, rule-based tokenizer for Spanish based on AnCora.
/// Performs punctuation splitting and light tokenization by default.
/// <p>
/// Currently, this tokenizer does not do line splitting. It assumes that the input
/// file is delimited by the system line separator. The output will be equivalently
/// delimited.
/// </p>
/// </remarks>
/// <param name="args"/>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    // Lexer options
    ITokenizerFactory<CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
    string orthoOptions = options.Contains("ancora") ? AncoraOptions : string.Empty;
    if (options.Contains("options"))
    {
        // BUG FIX: the original concatenated the Properties object itself
        // (its ToString()) instead of the value of the "options" property.
        orthoOptions = orthoOptions.IsEmpty() ? options.GetProperty("options") : orthoOptions + ',' + options.GetProperty("options");
    }
    bool tokens = PropertiesUtils.GetBool(options, "tokens", false);
    if (!tokens)
    {
        // Emit newline pseudo-tokens so output line breaks can mirror input lines.
        orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
    }
    tf.SetOptions(orthoOptions);
    // Other options
    string encoding = options.GetProperty("encoding", "UTF-8");
    bool toLower = PropertiesUtils.GetBool(options, "lowerCase", false);
    Locale es = new Locale("es");
    bool onePerLine = PropertiesUtils.GetBool(options, "onePerLine", false);
    // Read the file from stdin
    int nLines = 0;
    int nTokens = 0;
    long startTime = Runtime.NanoTime();
    try
    {
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new BufferedReader(new InputStreamReader(Runtime.@in, encoding)));
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.Console.Out, encoding));
        // Whether a token has been written on the current output line (so we
        // know to emit a separating space before the next one).
        bool printSpace = false;
        while (tokenizer.MoveNext())
        {
            ++nTokens;
            string word = tokenizer.Current.Word();
            if (word.Equals(SpanishLexer.NewlineToken))
            {
                ++nLines;
                if (!onePerLine)
                {
                    writer.NewLine();
                    printSpace = false;
                }
            }
            else
            {
                string outputToken = toLower ? word.ToLower(es) : word;
                if (onePerLine)
                {
                    writer.Write(outputToken);
                    writer.NewLine();
                }
                else
                {
                    if (printSpace)
                    {
                        writer.Write(" ");
                    }
                    writer.Write(outputToken);
                    printSpace = true;
                }
            }
        }
        // BUG FIX: flush the buffered writer, otherwise trailing output still
        // sitting in the buffer is lost when the process exits.
        writer.Flush();
    }
    catch (UnsupportedEncodingException e)
    {
        throw new RuntimeIOException("Bad character encoding", e);
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
    long elapsedTime = Runtime.NanoTime() - startTime;
    double linesPerSec = (double)nLines / (elapsedTime / 1e9);
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
}