/// <summary>
/// Converts one line of text into a list of per-character <see cref="CoreLabel"/>s.
/// </summary>
/// <remarks>
/// The line is trimmed, then normalized through the enclosing reader's <c>cdtos</c>.
/// Every character of the normalized line that is neither whitespace nor a control
/// character yields one label carrying the normalized character, the matching
/// character of the un-normalized line, optional shape / Unicode features (depending on
/// the enclosing flags), a segmentation flag and a running position. Dictionary
/// features are added afterwards when a dictionary is configured in the flags.
/// </remarks>
/// <param name="line">The raw input line; may be <c>null</c>.</param>
/// <returns>
/// The list of labels (possibly empty), or <c>null</c> when <paramref name="line"/> is <c>null</c>.
/// </returns>
public virtual IList<CoreLabel> Apply(string line)
{
    if (line == null)
    {
        return null;
    }

    line = line.Trim();
    IList<CoreLabel> lwi = new List<CoreLabel>();
    string origLine = line;
    line = this._enclosing.cdtos.Normalization(origLine);

    int origIndex = 0;
    int position = 0;
    StringBuilder nonspaceLineSB = new StringBuilder();

    // NOTE(review): char.IsISOControl and char.GetType(ch) mirror Java's Character API
    // and are kept exactly as the rest of this port spells them -- confirm they resolve.
    for (int index = 0; index < line.Length; index++)
    {
        char ch = line[index];
        if (char.IsWhiteSpace(ch) || char.IsISOControl(ch))
        {
            // Whitespace and control characters never produce a label.
            continue;
        }

        CoreLabel wi = new CoreLabel();
        string wordString = char.ToString(ch);
        wi.Set(typeof(CoreAnnotations.CharAnnotation), Sighan2005DocumentReaderAndWriter.Intern(wordString));
        nonspaceLineSB.Append(wordString);

        // Advance through the un-normalized line to the next kept character;
        // non-breaking space is skipped as well.
        // NOTE(review): this presumes normalization keeps the kept characters aligned
        // one-to-one with origLine -- confirm against cdtos.Normalization.
        while (char.IsWhiteSpace(origLine[origIndex])
               || char.IsISOControl(origLine[origIndex])
               || origLine[origIndex] == '\u00A0')
        {
            origIndex++;
        }

        wordString = char.ToString(origLine[origIndex]);
        wi.Set(typeof(CoreAnnotations.OriginalCharAnnotation), Sighan2005DocumentReaderAndWriter.Intern(wordString));

        // Put in a word shape (computed from the original character).
        if (this._enclosing.flags.useShapeStrings)
        {
            wi.Set(typeof(CoreAnnotations.ShapeAnnotation), this._enclosing.ShapeOf(wordString));
        }

        if (this._enclosing.flags.useUnicodeType
            || this._enclosing.flags.useUnicodeType4gram
            || this._enclosing.flags.useUnicodeType5gram)
        {
            wi.Set(typeof(CoreAnnotations.UTypeAnnotation), char.GetType(ch));
        }

        if (this._enclosing.flags.useUnicodeBlock)
        {
            wi.Set(typeof(CoreAnnotations.UBlockAnnotation), Characters.UnicodeBlockStringOf(ch));
        }

        origIndex++;

        // "1" for the first character of the line or a character preceded by
        // whitespace / a control character in the normalized line; "0" otherwise.
        bool afterBreak = index == 0
                          || char.IsWhiteSpace(line[index - 1])
                          || char.IsISOControl(line[index - 1]);
        string boundaryFlag = afterBreak ? "1" : "0";
        wi.Set(typeof(CoreAnnotations.AnswerAnnotation), boundaryFlag);
        wi.Set(typeof(CoreAnnotations.SpaceBeforeAnnotation), boundaryFlag);
        wi.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), boundaryFlag);

        // Position counts emitted labels only, not skipped characters.
        wi.Set(typeof(CoreAnnotations.PositionAnnotation), Sighan2005DocumentReaderAndWriter.Intern((position).ToString()));
        position++;
        lwi.Add(wi);
    }

    if (this._enclosing.flags.dictionary != null || this._enclosing.flags.serializedDictionary != null)
    {
        string nonspaceLine = nonspaceLineSB.ToString();
        Sighan2005DocumentReaderAndWriter.AddDictionaryFeatures(
            this._enclosing.cdict,
            typeof(CoreAnnotations.LBeginAnnotation),
            typeof(CoreAnnotations.LMiddleAnnotation),
            typeof(CoreAnnotations.LEndAnnotation),
            nonspaceLine,
            lwi);
    }

    if (this._enclosing.flags.dictionary2 != null)
    {
        string nonspaceLine = nonspaceLineSB.ToString();
        Sighan2005DocumentReaderAndWriter.AddDictionaryFeatures(
            this._enclosing.cdict2,
            typeof(CoreAnnotations.D2_LBeginAnnotation),
            typeof(CoreAnnotations.D2_LMiddleAnnotation),
            typeof(CoreAnnotations.D2_LEndAnnotation),
            nonspaceLine,
            lwi);
    }

    return lwi;
}
/// <summary>
/// Creates a parser tied to the given reader/writer instance and keeps the
/// reference in <c>_enclosing</c> for later use by this parser's methods.
/// </summary>
/// <param name="_enclosing">The enclosing <see cref="Sighan2005DocumentReaderAndWriter"/>.</param>
internal CTBDocumentParser(Sighan2005DocumentReaderAndWriter _enclosing)
{
    this._enclosing = _enclosing;
}
/// <summary>
/// Command-line entry point: reads lines from standard input, segments each line with a
/// <see cref="MaxMatchSegmenter"/> and prints the result through a
/// <see cref="Sighan2005DocumentReaderAndWriter"/>.
/// </summary>
/// <remarks>
/// Properties parsed from <paramref name="args"/>:
/// <c>lexicon</c> (required; the process exits with code 1 when missing),
/// <c>greedy</c> (if present, use greedy segmentation),
/// <c>maxwords</c> (if present and <c>greedy</c> is not, use the max-words heuristic);
/// otherwise the max-match segmentation is used.
/// </remarks>
/// <param name="args">Command-line arguments converted to properties.</param>
public static void Main(string[] args)
{
    Properties props = StringUtils.ArgsToProperties(args);
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    MaxMatchSegmenter seg = new MaxMatchSegmenter();

    string lexiconFile = props.GetProperty("lexicon");
    if (lexiconFile != null)
    {
        seg.AddLexicon(lexiconFile);
    }
    else
    {
        logger.Error("Error: no lexicon file!");
        System.Environment.Exit(1);
    }

    Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
    sighanRW.Init(flags);

    BufferedReader br = new BufferedReader(new InputStreamReader(Runtime.@in));
    PrintWriter stdoutW = new PrintWriter(System.Console.Out);

    // The segmentation mode does not change between lines, so look it up once.
    bool useGreedy = props.GetProperty("greedy") != null;
    bool useMaxWords = props.GetProperty("maxwords") != null;

    int lineNb = 0;
    try
    {
        while (true)
        {
            ++lineNb;
            logger.Info("line: " + lineNb);
            try
            {
                string line = br.ReadLine();
                if (line == null)
                {
                    break;
                }

                string outputLine;
                if (useGreedy)
                {
                    List<Word> sentence = seg.GreedilySegmentWords(line);
                    outputLine = SentenceUtils.ListToString(sentence);
                }
                else
                {
                    // Both lattice-based strategies need the lattice built first.
                    seg.BuildSegmentationLattice(line);
                    outputLine = useMaxWords
                        ? SentenceUtils.ListToString(seg.SegmentWords(MaxMatchSegmenter.MatchHeuristic.Maxwords))
                        : SentenceUtils.ListToString(seg.MaxMatchSegmentation());
                }

                StringReader strR = new StringReader(outputLine);
                IEnumerator<IList<CoreLabel>> itr = sighanRW.GetIterator(strR);
                while (itr.MoveNext())
                {
                    sighanRW.PrintAnswers(itr.Current, stdoutW);
                }
            }
            catch (IOException e)
            {
                // Stop processing on an I/O failure, but report it instead of hiding it.
                logger.Error("Error: I/O failure at line " + lineNb + ": " + e.Message);
                break;
            }
        }
    }
    finally
    {
        // Make sure already-produced output reaches stdout even if processing aborts.
        stdoutW.Flush();
    }
}