/// <summary>
/// Demonstrates alternative ways of invoking the parser: with an already
/// tokenized word list, and with raw text run through an explicit tokenizer.
/// </summary>
/// <remarks>
/// Output is produced both by direct printing and via a TreePrint object;
/// the options passed when constructing the TreePrint determine which
/// representations are printed. Output can be captured by passing a
/// PrintWriter to TreePrint.printTree. This code is for English.
/// </remarks>
public static void DemoAPI(LexicalizedParser lp)
{
    // Case 1: parse a sentence supplied as a correctly tokenized word list.
    string[] tokenizedSentence = new string[] { "This", "is", "an", "easy", "sentence", "." };
    IList<CoreLabel> labels = SentenceUtils.ToCoreLabelList(tokenizedSentence);
    Tree tree = lp.Apply(labels);
    tree.PennPrint();
    System.Console.Out.WriteLine();
    // Case 2: parse raw text by loading and using an explicit PTB tokenizer.
    string rawSentence = "This is another sentence.";
    ITokenizerFactory<CoreLabel> factory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
    ITokenizer<CoreLabel> tokenizer = factory.GetTokenizer(new StringReader(rawSentence));
    IList<CoreLabel> tokenized = tokenizer.Tokenize();
    tree = lp.Apply(tokenized);
    // Derive typed dependencies from the parse (PennTreebankLanguagePack for English).
    ITreebankLanguagePack langPack = lp.TreebankLanguagePack();
    IGrammaticalStructureFactory structureFactory = langPack.GrammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.NewGrammaticalStructure(tree);
    IList<TypedDependency> dependencies = structure.TypedDependenciesCCprocessed();
    System.Console.Out.WriteLine(dependencies);
    System.Console.Out.WriteLine();
    // A TreePrint can emit both the Penn tree and the collapsed dependencies.
    TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");
    printer.PrintTree(tree);
}
/// <summary>A fast, rule-based tokenizer for Modern Standard French.</summary>
/// <remarks>
/// Performs punctuation splitting and light tokenization by default.
/// This entry point does not perform sentence splitting: input lines are
/// delimited by the system line separator, and the output is delimited
/// the same way.
/// </remarks>
/// <param name="args"/>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    // Select the FTB-style or the default French lexer.
    ITokenizerFactory<CoreLabel> factory = options.Contains("ftb") ? FrenchTokenizer.FtbFactory() : FrenchTokenizer.Factory();
    string lexerOptions = options.GetProperty("options", string.Empty);
    // When run from this main method we always split on newline so lines can
    // be counted; no finer-grained sentence splitting is offered here.
    lexerOptions = lexerOptions.IsEmpty() ? "tokenizeNLs" : lexerOptions + ",tokenizeNLs";
    factory.SetOptions(lexerOptions);
    // Remaining command-line options.
    string encoding = options.GetProperty("encoding", "UTF-8");
    bool lowerCase = PropertiesUtils.GetBool(options, "lowerCase", false);
    // Tokenize stdin, writing space-separated tokens to stdout.
    int lineCount = 0;
    int tokenCount = 0;
    long start = Runtime.NanoTime();
    try
    {
        ITokenizer<CoreLabel> tokenizer = factory.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
        bool needSpace = false;
        while (tokenizer.MoveNext())
        {
            ++tokenCount;
            string token = tokenizer.Current.Word();
            if (token.Equals(FrenchLexer.NewlineToken))
            {
                // A newline pseudo-token ends the current output line.
                ++lineCount;
                needSpace = false;
                System.Console.Out.WriteLine();
                continue;
            }
            if (needSpace)
            {
                System.Console.Out.Write(" ");
            }
            System.Console.Out.Write(lowerCase ? token.ToLower(Locale.French) : token);
            needSpace = true;
        }
    }
    catch (UnsupportedEncodingException e)
    {
        log.Error(e);
    }
    long elapsed = Runtime.NanoTime() - start;
    double linesPerSecond = (double)lineCount / (elapsed / 1e9);
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", lineCount, tokenCount, linesPerSecond);
}
/// <summary>Tokenize the text using the parser's tokenizer.</summary>
public virtual IList<IHasWord> Tokenize(string sentence)
{
    // The language pack supplies the tokenizer appropriate for this parser.
    ITokenizerFactory<IHasWord> factory = TreebankLanguagePack().GetTokenizerFactory();
    ITokenizer<IHasWord> tokenizer = factory.GetTokenizer(new StringReader(sentence));
    return tokenizer.Tokenize();
}
/// <summary>A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).</summary>
/// <remarks>
/// Performs punctuation splitting and light tokenization by default. This
/// entry point does not split lines; it normalizes non-printing line
/// separators across platforms and emits the system default line separator.
/// Orthographic normalization options (enabled via command-line flags):
/// <ul>
/// <li><c>useUTF8Ellipsis</c>: replace runs of three or more full stops with \u2026</li>
/// <li><c>normArDigits</c>: convert Arabic digits to ASCII equivalents</li>
/// <li><c>normArPunc</c>: convert Arabic punctuation to ASCII equivalents</li>
/// <li><c>normAlif</c>: change all alif forms to bare alif</li>
/// <li><c>normYa</c>: map ya to alif maqsura</li>
/// <li><c>removeDiacritics</c>: strip all diacritics</li>
/// <li><c>removeTatweel</c>: strip the tatweel elongation character</li>
/// <li><c>removeQuranChars</c>: remove diacritics that appear in the Quran</li>
/// <li><c>removeProMarker</c>: remove the ATB null pronoun marker</li>
/// <li><c>removeSegMarker</c>: remove the ATB clitic segmentation marker</li>
/// <li><c>removeMorphMarker</c>: remove the ATB morpheme boundary markers</li>
/// <li><c>removeLengthening</c>: collapse runs of three or more identical (non-period) characters to one</li>
/// <li><c>atbEscaping</c>: replace left/right parentheses with ATB escape characters</li>
/// </ul>
/// </remarks>
/// <param name="args"/>
public static void Main(string[] args)
{
    if (args.Length > 0 && args[0].Contains("help"))
    {
        System.Console.Error.Printf("Usage: java %s [OPTIONS] < file%n", typeof(ArabicTokenizer).FullName);
        System.Console.Error.Printf("%nOptions:%n");
        log.Info(" -help : Print this message. See javadocs for all normalization options.");
        log.Info(" -atb : Tokenization for the parsing experiments in Green and Manning (2010)");
        System.Environment.Exit(-1);
    }
    // Every remaining command-line flag is forwarded to the lexer as a
    // normalization option.
    Properties cmdLine = StringUtils.ArgsToProperties(args);
    ITokenizerFactory<CoreLabel> factory = cmdLine.Contains("atb") ? ArabicTokenizer.AtbFactory() : ArabicTokenizer.Factory();
    foreach (string flag in cmdLine.StringPropertyNames())
    {
        factory.SetOptions(flag);
    }
    // Newlines become pseudo-tokens so that input lines can be counted.
    factory.SetOptions("tokenizeNLs");
    // Tokenize stdin, writing space-separated tokens to stdout.
    int lineCount = 0;
    int tokenCount = 0;
    try
    {
        string encoding = "UTF-8";
        ITokenizer<CoreLabel> tokenizer = factory.GetTokenizer(new InputStreamReader(Runtime.@in, encoding));
        bool needSpace = false;
        while (tokenizer.MoveNext())
        {
            ++tokenCount;
            string token = tokenizer.Current.Word();
            if (token.Equals(ArabicLexer.NewlineToken))
            {
                ++lineCount;
                needSpace = false;
                System.Console.Out.WriteLine();
                continue;
            }
            if (needSpace)
            {
                System.Console.Out.Write(" ");
            }
            System.Console.Out.Write(token);
            needSpace = true;
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens)%n", lineCount, tokenCount);
}
/// <summary>
/// Tokenizes each test string with the given factory and asserts that the
/// output matches the corresponding reference token sequence.
/// </summary>
public virtual void RunTest<_T0>(ITokenizerFactory<_T0> factory, string[] testStrings, string[][] resultsStrings)
    where _T0 : IHasWord
{
    for (int caseIdx = 0; caseIdx < testStrings.Length; ++caseIdx)
    {
        ITokenizer<IHasWord> tokenizer = factory.GetTokenizer(new StringReader(testStrings[caseIdx]));
        IList<IHasWord> actual = tokenizer.Tokenize();
        string[] expected = resultsStrings[caseIdx];
        // Token counts must agree before comparing element-wise.
        NUnit.Framework.Assert.AreEqual(expected.Length, actual.Count);
        for (int tokIdx = 0; tokIdx < expected.Length; ++tokIdx)
        {
            NUnit.Framework.Assert.AreEqual(expected[tokIdx], actual[tokIdx].Word());
        }
    }
}
/// <summary>
/// Converts a raw input line to IOB-labeled tokens, tags them with the
/// current domain, and runs the classifier over them.
/// </summary>
private IList<CoreLabel> SegmentStringToIOB(string line)
{
    IList<CoreLabel> iobTokens;
    if (tf == null)
    {
        // No tokenizer factory configured: fall back to whitespace tokenization.
        iobTokens = IOBUtils.StringToIOB(line);
    }
    else
    {
        IList<CoreLabel> rawTokens = tf.GetTokenizer(new StringReader(line)).Tokenize();
        iobTokens = IOBUtils.StringToIOB(rawTokens, null, false, tf, line);
    }
    IOBUtils.LabelDomain(iobTokens, domain);
    return classifier.Classify(iobTokens);
}
/// <summary>
/// Tokenizes each untokenized input with the ATB factory (marker-removal
/// options enabled) and compares against the reference tokenizations.
/// </summary>
public virtual void TestArabicTokenizer()
{
    System.Diagnostics.Debug.Assert((untokInputs.Length == tokReferences.Length));
    ITokenizerFactory<CoreLabel> factory = ArabicTokenizer.AtbFactory();
    factory.SetOptions("removeProMarker");
    factory.SetOptions("removeSegMarker");
    factory.SetOptions("removeMorphMarker");
    for (int i = 0; i < untokInputs.Length; ++i)
    {
        ITokenizer<CoreLabel> tokenizer = factory.GetTokenizer(new StringReader(untokInputs[i]));
        string actual = SentenceUtils.ListToString(tokenizer.Tokenize());
        NUnit.Framework.Assert.AreEqual("Tokenization deviates from reference", tokReferences[i], actual);
    }
}
/// <summary>
/// Verifies that character begin/end offsets survive ATB tokenization with
/// the marker-removal options enabled.
/// </summary>
public virtual void TestCharOffsets()
{
    string untokInput = "إِنَّ- -نا هادِئ+ُونَ .";
    int[] expectedBegins = new int[] { 0, 7, 11, 22 };
    int[] expectedEnds = new int[] { 6, 10, 21, 23 };
    ITokenizerFactory<CoreLabel> factory = ArabicTokenizer.AtbFactory();
    factory.SetOptions("removeProMarker");
    factory.SetOptions("removeSegMarker");
    factory.SetOptions("removeMorphMarker");
    IList<CoreLabel> tokens = factory.GetTokenizer(new StringReader(untokInput)).Tokenize();
    NUnit.Framework.Assert.AreEqual("Number of tokens doesn't match reference", tokens.Count, expectedBegins.Length);
    for (int i = 0; i < expectedBegins.Length; i++)
    {
        NUnit.Framework.Assert.AreEqual("Char begin offset deviates from reference", expectedBegins[i], tokens[i].BeginPosition());
        NUnit.Framework.Assert.AreEqual("Char end offset deviates from reference", expectedEnds[i], tokens[i].EndPosition());
    }
}
/// <summary>For debugging.</summary>
/// <param name="args"/>
/// <exception cref="System.IO.IOException"></exception>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file > output%n", typeof(ArabicDocumentReaderAndWriter).FullName);
        System.Environment.Exit(-1);
    }
    string fileName = args[0];
    // ATB tokenizer configured for vocalized input: keep the segmentation
    // marker but strip pronoun and morpheme markers.
    ITokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.AtbFactory();
    string atbVocOptions = "removeProMarker,removeMorphMarker";
    tokFactory.SetOptions(atbVocOptions);
    BufferedReader reader = IOUtils.ReaderFromString(fileName);
    for (string line; (line = reader.ReadLine()) != null;)
    {
        string[] taggedTokens = line.Split("\\s+");
        string delim = Pattern.Quote(tagDelimiter);
        bool first = true;
        foreach (string wordTag in taggedTokens)
        {
            // Each input token is WORD<delim>TAG; only the word part is used.
            string[] pair = wordTag.Split(delim);
            System.Diagnostics.Debug.Assert(pair.Length == 2);
            string word = pair[0];
            if (tokFactory != null)
            {
                IList<CoreLabel> segments = tokFactory.GetTokenizer(new StringReader(word)).Tokenize();
                if (segments.Count == 0)
                {
                    // The tokenizer deleted the token entirely; emit nothing.
                    continue;
                }
                if (segments.Count == 1)
                {
                    word = segments[0].Value();
                }
                else
                {
                    string secondSegment = segments[1].Value();
                    if (secondSegment.Equals(DefaultSegMarker.ToString()))
                    {
                        // Special case for the null marker in the vocalized section
                        word = segments[0].Value() + DefaultSegMarker.ToString();
                    }
                    else
                    {
                        System.Console.Error.Printf("%s: Raw token generates multiple segments: %s%n", typeof(ArabicDocumentReaderAndWriter).FullName, word);
                        word = segments[0].Value();
                    }
                }
            }
            if (!first)
            {
                System.Console.Out.Write(" ");
            }
            System.Console.Out.Write(word);
            first = false;
        }
        System.Console.Out.WriteLine();
    }
}
/// <summary>
/// Tokenizes the given text to populate the list of words this Document
/// represents.
/// </summary>
/// <remarks>
/// The default implementation tokenizes the entire text with the current
/// tokenizer factory and appends every token to this Document. Subclasses
/// should override to handle non-standard document formats and/or to pull
/// a title out of the text, then call base.Parse. The text may be empty
/// ("") but is never null.
/// </remarks>
/// <seealso cref="BasicDocument{L}.SetTokenizerFactory(Edu.Stanford.Nlp.Process.ITokenizerFactory{T})"/>
protected internal virtual void Parse(string text)
{
    ITokenizer<Word> tokenizer = tokenizerFactory.GetTokenizer(new StringReader(text));
    Sharpen.Collections.AddAll(this, tokenizer.Tokenize());
}
/// <summary>Returns a thread-safe tokenizer over the given reader.</summary>
public virtual ITokenizer<CoreLabel> GetTokenizer(Reader r)
{
    // Simply delegates to the wrapped factory.
    return factory.GetTokenizer(r);
}
/// <summary>
/// Builds an English text processor, instantiating the sentence detector,
/// tokenizer, and stemmer from the supplied component factories.
/// </summary>
public EnglishTextProcessor(ISentenceDetectorFactory sdFact, ITokenizerFactory tokenizerFact, IStemmerFactory stemmerFact)
    : base(Language.English)
{
    // Each component is created for this processor's language (English).
    sd = sdFact.GetSentenceDetector(Language);
    tokenizer = tokenizerFact.GetTokenizer(Language);
    stemmer = stemmerFact.GetStemmer(Language);
}
/// <summary>
/// Reads the next &lt;DOC&gt; element from the MUC-formatted file contents,
/// extracting sentences, gold coreference mentions (from &lt;COREF&gt; SGML tags),
/// and NE labels; annotates the document with the Stanford pipeline; and
/// returns the arranged Document, or null when no further document exists.
/// </summary>
/// <exception cref="System.Exception"/>
public override Document NextDoc()
{
    IList<IList<CoreLabel>> allWords = new List<IList<CoreLabel>>();
    IList<Tree> allTrees = new List<Tree>();
    IList<IList<Mention>> allGoldMentions = new List<IList<Mention>>();
    IList<IList<Mention>> allPredictedMentions;
    IList<ICoreMap> allSentences = new List<ICoreMap>();
    Annotation docAnno = new Annotation(string.Empty);
    // Documents and sentences are delimited by SGML tags; matching is
    // case-insensitive and "." spans newlines (DOTALL).
    Pattern docPattern = Pattern.Compile("<DOC>(.*?)</DOC>", Pattern.Dotall + Pattern.CaseInsensitive);
    Pattern sentencePattern = Pattern.Compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.Dotall + Pattern.CaseInsensitive);
    Matcher docMatcher = docPattern.Matcher(fileContents);
    // currentOffset tracks how far into fileContents previous calls have read.
    if (!docMatcher.Find(currentOffset))
    {
        return(null);
    }
    currentOffset = docMatcher.End();
    string doc = docMatcher.Group(1);
    Matcher sentenceMatcher = sentencePattern.Matcher(doc);
    string ner = null;
    //Maintain current document ID.
    Pattern docIDPattern = Pattern.Compile("<DOCNO>(.*?)</DOCNO>", Pattern.Dotall + Pattern.CaseInsensitive);
    Matcher docIDMatcher = docIDPattern.Matcher(doc);
    if (docIDMatcher.Find())
    {
        currentDocumentID = docIDMatcher.Group(1);
    }
    else
    {
        // No <DOCNO>: synthesize an ID relative to the previous document.
        currentDocumentID = "documentAfter " + currentDocumentID;
    }
    while (sentenceMatcher.Find())
    {
        string sentenceString = sentenceMatcher.Group(2);
        IList<CoreLabel> words = tokenizerFactory.GetTokenizer(new StringReader(sentenceString)).Tokenize();
        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.Count; i++)
        {
            CoreLabel w = words[i];
            if (i > 0 && w.Word().Equals("$"))
            {
                // Merge a "$" token back onto a preceding PRP/WP tag token.
                if (!words[i - 1].Word().EndsWith("PRP") && !words[i - 1].Word().EndsWith("WP"))
                {
                    continue;
                }
                words[i - 1].Set(typeof(CoreAnnotations.TextAnnotation), words[i - 1].Word() + "$");
                // NOTE(review): Remove(i) here appears to be the Sharpen
                // extension for java List.remove(int index) — verify it removes
                // by position, not by value.
                words.Remove(i);
                i--;
            }
            else
            {
                if (w.Word().Equals("\\/"))
                {
                    // Re-join a token split around an escaped slash, e.g. "a \/ b".
                    // NOTE(review): assumes a "\/" token never occurs first or
                    // last in the sentence — confirm against the tokenizer.
                    if (words[i - 1].Word().Equals("</COREF>"))
                    {
                        continue;
                    }
                    w.Set(typeof(CoreAnnotations.TextAnnotation), words[i - 1].Word() + "\\/" + words[i + 1].Word());
                    words.Remove(i + 1);
                    words.Remove(i - 1);
                }
            }
        }
        // END FIXING TOKENIZATION PROBLEMS
        IList<CoreLabel> sentence = new List<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        IList<Mention> mentions = new List<Mention>();
        allWords.Add(sentence);
        allGoldMentions.Add(mentions);
        foreach (CoreLabel word in words)
        {
            string w = word.Get(typeof(CoreAnnotations.TextAnnotation));
            // found regular token: WORD/POS
            if (!w.StartsWith("<") && w.Contains("\\/") && w.LastIndexOf("\\/") != w.Length - 2)
            {
                int i_1 = w.LastIndexOf("\\/");
                string w1 = Sharpen.Runtime.Substring(w, 0, i_1);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.Set(typeof(CoreAnnotations.TextAnnotation), w1);
                word.Remove(typeof(CoreAnnotations.OriginalTextAnnotation));
                sentence.Add(word);
            }
            else
            {
                // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
                if (w.StartsWith("<") && !w.StartsWith("<COREF") && !w.StartsWith("</"))
                {
                    Pattern nerPattern = Pattern.Compile("<(.*?)>");
                    Matcher m = nerPattern.Matcher(w);
                    m.Find();
                    ner = m.Group(1);
                }
                else
                {
                    // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
                    if (w.StartsWith("</") && !w.StartsWith("</COREF"))
                    {
                        Pattern nerPattern = Pattern.Compile("</(.*?)>");
                        Matcher m = nerPattern.Matcher(w);
                        m.Find();
                        string ner1 = m.Group(1);
                        if (ner != null && !ner.Equals(ner1))
                        {
                            throw new Exception("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                        }
                        ner = null;
                    }
                    else
                    {
                        // found the start SGML tag for a coref mention
                        if (w.StartsWith("<COREF"))
                        {
                            Mention mention = new Mention();
                            // position of this mention in the sentence
                            mention.startIndex = sentence.Count;
                            // extract GOLD info about this coref chain. needed for eval
                            Pattern idPattern = Pattern.Compile("ID=\"(.*?)\"");
                            Pattern refPattern = Pattern.Compile("REF=\"(.*?)\"");
                            Matcher m = idPattern.Matcher(w);
                            m.Find();
                            mention.mentionID = System.Convert.ToInt32(m.Group(1));
                            m = refPattern.Matcher(w);
                            if (m.Find())
                            {
                                mention.originalRef = System.Convert.ToInt32(m.Group(1));
                            }
                            // open mention. keep track of all open mentions using the stack
                            stack.Push(mention);
                        }
                        else
                        {
                            // found the end SGML tag for a coref mention
                            if (w.Equals("</COREF>"))
                            {
                                Mention mention = stack.Pop();
                                mention.endIndex = sentence.Count;
                                // this is a closed mention. add it to the final list of mentions
                                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                                mentions.Add(mention);
                            }
                            else
                            {
                                // an ordinary token (no embedded "/POS" tag)
                                word.Remove(typeof(CoreAnnotations.OriginalTextAnnotation));
                                sentence.Add(word);
                            }
                        }
                    }
                }
            }
        }
        // Rebuild the sentence text and assign 1-based token indices.
        StringBuilder textContent = new StringBuilder();
        for (int i_2 = 0; i_2 < sentence.Count; i_2++)
        {
            CoreLabel w = sentence[i_2];
            w.Set(typeof(CoreAnnotations.IndexAnnotation), i_2 + 1);
            w.Set(typeof(CoreAnnotations.UtteranceAnnotation), 0);
            if (i_2 > 0)
            {
                textContent.Append(" ");
            }
            textContent.Append(w.GetString<CoreAnnotations.TextAnnotation>());
        }
        ICoreMap sentCoreMap = new Annotation(textContent.ToString());
        allSentences.Add(sentCoreMap);
        sentCoreMap.Set(typeof(CoreAnnotations.TokensAnnotation), sentence);
    }
    // assign goldCorefClusterID
    IDictionary<int, Mention> idMention = Generics.NewHashMap();
    // temporary use
    foreach (IList<Mention> goldMentions in allGoldMentions)
    {
        foreach (Mention m in goldMentions)
        {
            idMention[m.mentionID] = m;
        }
    }
    // Follow each mention's REF chain until a mention with a resolved cluster
    // (or no antecedent) is found, then propagate that cluster ID.
    foreach (IList<Mention> goldMentions_1 in allGoldMentions)
    {
        foreach (Mention m in goldMentions_1)
        {
            if (m.goldCorefClusterID == -1)
            {
                if (m.originalRef == -1)
                {
                    m.goldCorefClusterID = m.mentionID;
                }
                else
                {
                    int @ref = m.originalRef;
                    while (true)
                    {
                        Mention m2 = idMention[@ref];
                        if (m2.goldCorefClusterID != -1)
                        {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        }
                        else
                        {
                            if (m2.originalRef == -1)
                            {
                                m2.goldCorefClusterID = m2.mentionID;
                                m.goldCorefClusterID = m2.goldCorefClusterID;
                                break;
                            }
                            else
                            {
                                @ref = m2.originalRef;
                            }
                        }
                    }
                }
            }
        }
    }
    // Run the Stanford pipeline over the reconstructed document text.
    docAnno.Set(typeof(CoreAnnotations.SentencesAnnotation), allSentences);
    stanfordProcessor.Annotate(docAnno);
    if (allSentences.Count != allWords.Count)
    {
        throw new InvalidOperationException("allSentences != allWords");
    }
    for (int i_3 = 0; i_3 < allSentences.Count; i_3++)
    {
        IList<CoreLabel> annotatedSent = allSentences[i_3].Get(typeof(CoreAnnotations.TokensAnnotation));
        IList<CoreLabel> unannotatedSent = allWords[i_3];
        IList<Mention> mentionInSent = allGoldMentions[i_3];
        foreach (Mention m in mentionInSent)
        {
            m.dependency = allSentences[i_3].Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
        }
        if (annotatedSent.Count != unannotatedSent.Count)
        {
            throw new InvalidOperationException("annotatedSent != unannotatedSent");
        }
        // NOTE(review): 'sz' is not declared anywhere in this method —
        // presumably a field or a conversion artifact of the original Java's
        // "int sz = ...". Given the count check above, it is expected to equal
        // annotatedSent.Count here; verify against the enclosing class.
        for (int j = 0; j < sz; j++)
        {
            CoreLabel annotatedWord = annotatedSent[j];
            CoreLabel unannotatedWord = unannotatedSent[j];
            if (!annotatedWord.Get(typeof(CoreAnnotations.TextAnnotation)).Equals(unannotatedWord.Get(typeof(CoreAnnotations.TextAnnotation))))
            {
                throw new InvalidOperationException("annotatedWord != unannotatedWord");
            }
        }
        allWords.Set(i_3, annotatedSent);
        allTrees.Add(allSentences[i_3].Get(typeof(TreeCoreAnnotations.TreeAnnotation)));
    }
    // extract predicted mentions
    allPredictedMentions = mentionFinder.ExtractPredictedMentions(docAnno, maxID, dictionaries);
    // add the relevant fields to mentions and order them for coref
    return(Arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true));
}
// todo: give options for document splitting. A line or the whole file or sentence splitting as now
/// <summary>
/// Tokenizes the reader's contents, strips SGML tags (folding them into the
/// surrounding tokens' before/after whitespace annotations), splits the
/// tokens into sentences, and returns an iterator over those sentences.
/// </summary>
public virtual IEnumerator<IList<In>> GetIterator(Reader r)
{
    ITokenizer<In> tokenizer = tokenizerFactory.GetTokenizer(r);
    // PTBTokenizer.newPTBTokenizer(r, false, true);
    IList<In> words = new List<In>();
    IN previous = null;
    StringBuilder prepend = new StringBuilder();
    /*
     * This changes SGML tags into whitespace -- it should maybe be moved elsewhere
     */
    while (tokenizer.MoveNext())
    {
        IN w = tokenizer.Current;
        string word = w.Get(typeof(CoreAnnotations.TextAnnotation));
        Matcher m = sgml.Matcher(word);
        if (m.Matches())
        {
            // An SGML tag: do not emit it as a token. Its text is appended to
            // the previous token's After annotation and buffered so it can be
            // prepended to the next real token's Before annotation.
            string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation)));
            string after = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation)));
            prepend.Append(before).Append(word);
            if (previous != null)
            {
                string previousTokenAfter = StringUtils.GetNotNullString(previous.Get(typeof(CoreAnnotations.AfterAnnotation)));
                previous.Set(typeof(CoreAnnotations.AfterAnnotation), previousTokenAfter + word + after);
            }
        }
        else
        {
            // previous.appendAfter(w.word() + w.after());
            string before = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.BeforeAnnotation)));
            if (prepend.Length > 0)
            {
                // Flush buffered SGML text into this token's Before annotation.
                prepend.Append(before);
                w.Set(typeof(CoreAnnotations.BeforeAnnotation), prepend.ToString());
                prepend = new StringBuilder();
            }
            words.Add(w);
            previous = w;
        }
    }
    // wts splits the flat token list into sentences.
    IList<IList<In>> sentences = wts.Process(words);
    string after_1 = string.Empty;
    IN last = null;
    foreach (IList<In> sentence in sentences)
    {
        int pos = 0;
        foreach (IN w in sentence)
        {
            // NOTE(review): 'pos' is never incremented, so every token receives
            // position "0" — looks like a dropped "pos++" in conversion; verify
            // against the original implementation before relying on
            // PositionAnnotation values.
            w.Set(typeof(CoreAnnotations.PositionAnnotation), int.ToString(pos));
            after_1 = StringUtils.GetNotNullString(w.Get(typeof(CoreAnnotations.AfterAnnotation)));
            w.Remove(typeof(CoreAnnotations.AfterAnnotation));
            last = w;
        }
    }
    // Restore the trailing whitespace on the very last token only.
    if (last != null)
    {
        last.Set(typeof(CoreAnnotations.AfterAnnotation), after_1);
    }
    return(sentences.GetEnumerator());
}
/// <summary>A fast, rule-based tokenizer for Spanish based on AnCora.</summary>
/// <remarks>
/// A fast, rule-based tokenizer for Spanish based on AnCora.
/// Performs punctuation splitting and light tokenization by default.
/// <p>
/// Currently, this tokenizer does not do line splitting. It assumes that the input
/// file is delimited by the system line separator. The output will be equivalently
/// delimited.
/// </p>
/// </remarks>
/// <param name="args"/>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    // Lexer options
    ITokenizerFactory<CoreLabel> tf = SpanishTokenizer.CoreLabelFactory();
    string orthoOptions = options.Contains("ancora") ? AncoraOptions : string.Empty;
    if (options.Contains("options"))
    {
        // BUG FIX: previously this appended the Properties object itself
        // (i.e. its ToString()) to the option string, producing garbage lexer
        // options; append the value of the "options" property instead.
        orthoOptions = orthoOptions.IsEmpty() ? options.GetProperty("options") : orthoOptions + ',' + options.GetProperty("options");
    }
    bool tokens = PropertiesUtils.GetBool(options, "tokens", false);
    if (!tokens)
    {
        // Newlines become pseudo-tokens so that output lines mirror input lines.
        orthoOptions = orthoOptions.IsEmpty() ? "tokenizeNLs" : orthoOptions + ",tokenizeNLs";
    }
    tf.SetOptions(orthoOptions);
    // Other options
    string encoding = options.GetProperty("encoding", "UTF-8");
    bool toLower = PropertiesUtils.GetBool(options, "lowerCase", false);
    Locale es = new Locale("es");
    bool onePerLine = PropertiesUtils.GetBool(options, "onePerLine", false);
    // Read the file from stdin
    int nLines = 0;
    int nTokens = 0;
    long startTime = Runtime.NanoTime();
    try
    {
        ITokenizer<CoreLabel> tokenizer = tf.GetTokenizer(new BufferedReader(new InputStreamReader(Runtime.@in, encoding)));
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.Console.Out, encoding));
        bool printSpace = false;
        while (tokenizer.MoveNext())
        {
            ++nTokens;
            string word = tokenizer.Current.Word();
            if (word.Equals(SpanishLexer.NewlineToken))
            {
                ++nLines;
                if (!onePerLine)
                {
                    writer.NewLine();
                    printSpace = false;
                }
            }
            else
            {
                string outputToken = toLower ? word.ToLower(es) : word;
                if (onePerLine)
                {
                    writer.Write(outputToken);
                    writer.NewLine();
                }
                else
                {
                    if (printSpace)
                    {
                        writer.Write(" ");
                    }
                    writer.Write(outputToken);
                    printSpace = true;
                }
            }
        }
        // BUG FIX: flush the buffered writer so trailing buffered output is
        // not lost when the process exits.
        writer.Flush();
    }
    catch (UnsupportedEncodingException e)
    {
        throw new RuntimeIOException("Bad character encoding", e);
    }
    catch (IOException e)
    {
        throw new RuntimeIOException(e);
    }
    long elapsedTime = Runtime.NanoTime() - startTime;
    double linesPerSec = (double)nLines / (elapsedTime / 1e9);
    System.Console.Error.Printf("Done! Tokenized %d lines (%d tokens) at %.2f lines/sec%n", nLines, nTokens, linesPerSec);
}
/// <summary>
/// Compares the Arabic tokenizer's output against the DefaultLexicalMapper.
/// arg[0] := tokenizer options
/// args[1] := file to tokenize
/// </summary>
/// <param name="args"/>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        System.Console.Out.Printf("Usage: java %s OPTS filename%n", typeof(ArabicTokenizerTester).FullName);
        System.Environment.Exit(-1);
    }
    string tokOptions = args[0];
    File path = new File(args[1]);
    log.Info("Reading from: " + path.GetPath());
    try
    {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        ITokenizerFactory<CoreLabel> factory = ArabicTokenizer.Factory();
        factory.SetOptions(tokOptions);
        IMapper mapper = new DefaultLexicalMapper();
        mapper.Setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");
        int lineId = 0;
        for (string line; (line = reader.ReadLine()) != null; lineId++)
        {
            line = line.Trim();
            // Tokenize with the tokenizer
            IList<CoreLabel> tokenized = factory.GetTokenizer(new StringReader(line)).Tokenize();
            System.Console.Out.WriteLine(SentenceUtils.ListToString(tokenized));
            // Tokenize with the mapper
            StringBuilder mapped = new StringBuilder();
            string[] rawToks = line.Split("\\s+");
            foreach (string rawTok in rawToks)
            {
                string mappedTok = mapper.Map(null, rawTok);
                mapped.Append(mappedTok).Append(" ");
            }
            IList<string> mappedToks = Arrays.AsList(mapped.ToString().Trim().Split("\\s+"));
            // Evaluate the output
            if (mappedToks.Count != tokenized.Count)
            {
                System.Console.Error.Printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenized), SentenceUtils.ListToString(mappedToks));
                continue;
            }
            bool mismatch = false;
            for (int i = 0; i < mappedToks.Count; ++i)
            {
                string mappedTok = mappedToks[i];
                string tokenizedTok = tokenized[i].Word();
                if (!mappedTok.Equals(tokenizedTok))
                {
                    System.Console.Error.Printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
                    mismatch = true;
                }
            }
            if (mismatch)
            {
                System.Console.Error.Printf("orig: %s%ntok: %s%nmap: %s%n%n", line, SentenceUtils.ListToString(tokenized), SentenceUtils.ListToString(mappedToks));
            }
        }
        System.Console.Error.Printf("Read %d lines.%n", lineId);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Returns true when the tokenizer produced by the given factory deletes the
/// character entirely (tokenizing it yields no tokens).
/// </summary>
private static bool IsDeletedCharacter(char ch, ITokenizerFactory<CoreLabel> tf)
{
    string single = char.ToString(ch);
    IList<CoreLabel> tokens = tf.GetTokenizer(new StringReader(single)).Tokenize();
    return tokens.IsEmpty();
}