/// <summary>
/// Creates a <c>CoreLabel</c> for a single token and fills in its original text,
/// word, value and character offsets.
/// </summary>
/// <param name="tokenText">
/// Raw token text. It is always stored unchanged as the label's original text.
/// </param>
/// <param name="doNormalization">
/// When true (and the <c>normalizeSpace</c> field is also true), ordinary spaces in the
/// word/value are replaced by U+00A0.
/// </param>
/// <param name="charOffsetBegin">Value stored under <c>CharacterOffsetBeginAnnotation</c>.</param>
/// <param name="charOffsetEnd">Value stored under <c>CharacterOffsetEndAnnotation</c>.</param>
/// <returns>The newly created label.</returns>
private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
{
    CoreLabel label = new CoreLabel();
    // The original text is recorded before any rewriting of the surface form.
    label.SetOriginalText(tokenText);

    string surface;
    if (separatorPattern.Matcher(tokenText).Matches())
    {
        // Map to CoreNLP newline token
        surface = AbstractTokenizer.NewlineToken;
    }
    else if (doNormalization && normalizeSpace)
    {
        // change space to non-breaking space
        surface = tokenText.Replace(' ', '\u00A0');
    }
    else
    {
        surface = tokenText;
    }

    label.SetWord(surface);
    label.SetValue(surface);
    label.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
    label.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);

    if (Verbose)
    {
        log.Info("Adding token " + label.ToShorterString());
    }
    return label;
}
/// <summary>
/// Tokenizes everything read from <paramref name="r"/> with a <c>PTBTokenizer</c> and writes the
/// tokens to <paramref name="writer"/>.
/// </summary>
/// <param name="r">Source of the text to tokenize.</param>
/// <param name="writer">Destination for the tokens.</param>
/// <param name="parseInsidePattern">
/// If non-null, tokens are only written while "printing" is on. A token matching this pattern
/// switches printing on when its group 1 is empty and off otherwise; the matching token itself
/// is never written. If null, printing is on from the start.
/// </param>
/// <param name="filterPattern">If non-null, tokens fully matching this pattern are not written.</param>
/// <param name="options">Options string passed to the tokenizer constructor.</param>
/// <param name="preserveLines">
/// When true, a token equal to <c>NewlineToken</c> produces a line break and other tokens are
/// written separated by single spaces.
/// </param>
/// <param name="oneLinePerElement">
/// When true (and <paramref name="preserveLines"/> is false), tokens are space-separated and a
/// line break is written each time printing is switched off.
/// </param>
/// <param name="dump">When true, the output form of a token is its <c>ToShorterString()</c>.</param>
/// <param name="lowerCase">
/// When true, the token text is lower-cased (and stored back on the token) before being written.
/// </param>
/// <returns>The number of tokens read from the tokenizer, whether or not they were written.</returns>
/// <exception cref="System.IO.IOException"/>
private static int TokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern, Pattern filterPattern, string options, bool preserveLines, bool oneLinePerElement, bool dump, bool lowerCase)
{
    // start off printing, unless you're looking for a start entity
    bool printing = (parseInsidePattern == null);
    // create once as performance hack
    Matcher insideMatcher = printing ? null : parseInsidePattern.Matcher(string.Empty);
    bool atLineStart = true;
    int count = 0;

    Edu.Stanford.Nlp.Process.PTBTokenizer<CoreLabel> tokenizer = new Edu.Stanford.Nlp.Process.PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options);
    while (tokenizer.MoveNext())
    {
        // Every token is counted, including those that end up not being written.
        count++;

        CoreLabel label = tokenizer.Current;
        string original = label.Get(typeof(CoreAnnotations.TextAnnotation));
        string output = original;
        if (lowerCase)
        {
            output = original.ToLower(Locale.English);
            label.Set(typeof(CoreAnnotations.TextAnnotation), output);
        }

        // Pattern checks always run against the original (not lower-cased) text.
        if (insideMatcher != null && insideMatcher.Reset(original).Matches())
        {
            // turn on printing if no end element slash, turn it off if there is
            printing = insideMatcher.Group(1).IsEmpty();
            if (!printing)
            {
                // true only if matched a stop
                atLineStart = true;
                if (oneLinePerElement)
                {
                    writer.NewLine();
                }
            }
            continue;
        }

        if (!printing)
        {
            continue;
        }

        if (dump)
        {
            // after having checked for tags, change the output to be exhaustive
            output = label.ToShorterString();
        }

        if (filterPattern != null && filterPattern.Matcher(original).Matches())
        {
            // skip
            continue;
        }

        if (preserveLines && NewlineToken.Equals(original))
        {
            atLineStart = true;
            writer.NewLine();
            continue;
        }

        if (preserveLines || oneLinePerElement)
        {
            // Space-separated output: no separator before the first token on a line.
            if (atLineStart)
            {
                atLineStart = false;
            }
            else
            {
                writer.Write(' ');
            }
            writer.Write(output);
        }
        else
        {
            // Default layout: one token per line.
            writer.Write(output);
            writer.NewLine();
        }
    }
    return count;
}
/// <summary>
/// Segments the text of <paramref name="annotation"/> into words and stores the resulting
/// token list under <c>CoreAnnotations.TokensAnnotation</c>.
/// </summary>
/// <remarks>
/// The segmenter is run on the text string. Its output words are then aligned, in order, with
/// the per-character labels stored under <c>SegmenterCoreAnnotations.CharactersAnnotation</c>
/// to recover character offsets. Characters whose <c>XMLCharAnnotation</c> is not <c>"0"</c>
/// are accumulated into a buffer and emitted as a single XML token.
/// </remarks>
/// <param name="annotation">
/// Map providing <c>TextAnnotation</c> and <c>CharactersAnnotation</c>; receives
/// <c>TokensAnnotation</c>.
/// </param>
private void RunSegmentation(ICoreMap annotation)
{
    //0 2
    // A BC D E
    // 1 10 1 1
    // 0 12 3 4
    // 0, 0+1 ,
    // the original text String
    string text = annotation.Get(typeof(CoreAnnotations.TextAnnotation));
    // the way it was divided by splitCharacters
    IList<CoreLabel> sentChars = annotation.Get(typeof(SegmenterCoreAnnotations.CharactersAnnotation));
    if (Verbose)
    {
        log.Info("sentChars (length " + sentChars.Count + ") is " + SentenceUtils.ListToString(sentChars, StringUtils.EmptyStringArray));
    }
    // The (initially empty) list is registered first and filled in below.
    IList<CoreLabel> tokens = new List<CoreLabel>();
    annotation.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

    // Run the segmenter! On the whole String. It knows not about the splitting into chars.
    // Can we change this to have it run directly on the already existing list of tokens. That would help, no?
    IList<string> words;
    if (!tokenizeNewline)
    {
        text = text.ReplaceAll("[\r\n]", string.Empty);
        words = segmenter.SegmentString(text);
    }
    else
    {
        // remove leading and trailing newlines
        text = text.ReplaceAll("^[\\r\\n]+", string.Empty);
        text = text.ReplaceAll("[\\r\\n]+$", string.Empty);
        // if using the sentence split on two newlines option, replace single newlines
        // single newlines should be ignored for segmenting
        if (sentenceSplitOnTwoNewlines)
        {
            text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
            // do a second pass to handle corner case of consecutive isolated newlines
            // x \n x \n x
            text = text.ReplaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
        }
        // Run the segmenter on each line so that we don't get tokens that cross line boundaries
        // Neat trick to keep delimiters from: http://stackoverflow.com/a/2206432
        // The regex is built by concatenation: .NET string.Format only substitutes {n} items, so a
        // Java-style "%1$s" placeholder would be left in place and the separator never inserted.
        string keepDelimiterRegex = "((?<=" + separator + ")|(?=" + separator + "))";
        string[] lines = text.Split(keepDelimiterRegex);
        words = new List<string>();
        foreach (string line in lines)
        {
            if (separatorPattern.Matcher(line).Matches())
            {
                // Don't segment newline tokens, keep them as-is
                words.Add(line);
            }
            else
            {
                Sharpen.Collections.AddAll(words, segmenter.SegmentString(line));
            }
        }
    }
    if (Verbose)
    {
        log.Info(text + "\n--->\n" + words + " (length " + words.Count + ')');
    }

    // Go through everything again and make the final tokens list; for loop is over segmented words
    // This is used to index sentChars, the output from splitCharacters
    int pos = 0;
    StringBuilder xmlBuffer = new StringBuilder();
    // Begin offset of the XML token currently being buffered; -1 while none is open.
    int xmlBegin = -1;
    foreach (string w in words)
    {
        CoreLabel fl = sentChars[pos];
        string xmlCharAnnotation = fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation));
        if (Verbose)
        {
            log.Info("Working on word " + w + ", sentChar " + fl.ToShorterString() + " (sentChars index " + pos + ')');
        }
        if ("0".Equals(xmlCharAnnotation) || "beginning".Equals(xmlCharAnnotation))
        {
            // Beginnings of plain text and other XML tags are good places to end an XML tag
            if (xmlBuffer.Length > 0)
            {
                // Form the XML token
                string xmlTag = xmlBuffer.ToString();
                CoreLabel fl1 = sentChars[pos - 1];
                int end = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
                // Clean up and prepare for the next XML tag
                xmlBegin = -1;
                xmlBuffer = new StringBuilder();
            }
        }
        if (!"0".Equals(xmlCharAnnotation))
        {
            // found an XML character; fl changes inside this loop!
            while (fl.Get(typeof(SegmenterCoreAnnotations.XMLCharAnnotation)).Equals("whitespace"))
            {
                // Print whitespaces into the XML buffer and move on until the next non-whitespace character is found
                // and we're in sync with segmenter output again
                xmlBuffer.Append(' ');
                pos += 1;
                fl = sentChars[pos];
            }
            xmlBuffer.Append(w);
            pos = AdvancePos(sentChars, pos, w);
            if (xmlBegin < 0)
            {
                xmlBegin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            }
            // The word was absorbed into the XML buffer; no separate token is emitted for it.
            continue;
        }
        // remember that fl may be more than one char long (non-BMP chars like emoji), so use advancePos()
        fl.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
        if (w.IsEmpty())
        {
            // [cdm 2016:] surely this shouldn't happen!
            if (Verbose)
            {
                log.Warn("Encountered an empty word. Shouldn't happen?");
            }
            continue;
        }
        int begin = fl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
        pos = AdvancePos(sentChars, pos, w);
        if (pos - 1 >= sentChars.Count)
        {
            // The word ran past the end of sentChars: log it and emit no token for this word.
            log.Error("Error: on word " + w + " at position " + (pos - w.Length) + " trying to get at position " + (pos - 1));
            log.Error("last element of sentChars is " + sentChars[sentChars.Count - 1]);
        }
        else
        {
            // The end offset comes from the last character label covered by the word.
            fl = sentChars[pos - 1];
            int end = fl.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
            tokens.Add(MakeXmlToken(w, false, begin, end));
        }
    }
    // end for (go through everything again)
    if (xmlBuffer.Length > 0)
    {
        // Form the last XML token, if any
        string xmlTag = xmlBuffer.ToString();
        CoreLabel fl1 = sentChars[pos - 1];
        int end = fl1.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        tokens.Add(MakeXmlToken(xmlTag, true, xmlBegin, end));
    }
    if (Verbose)
    {
        foreach (CoreLabel token in tokens)
        {
            log.Info(token.ToShorterString());
        }
    }
}