/// <summary>
/// Creates the reader state from an enclosing reader/writer, an input reader and two mode switches.
/// </summary>
/// <param name="_enclosing">The enclosing <c>ColumnTabDocumentReaderWriter</c> instance, stored as-is.</param>
/// <param name="br">The buffered reader, stored as-is.</param>
/// <param name="returnSegmentsAsDocs">
/// When true: <c>keepBoundaries</c> and <c>hasDocStart</c> are cleared and
/// <c>returnTokensOnEmptyLine</c> is set. When false: the exact opposite.
/// </param>
/// <param name="includeText">Stored as-is in <c>includeText</c>.</param>
public ColumnDocBufferedGetNext(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br, bool returnSegmentsAsDocs, bool includeText)
{
    this._enclosing = _enclosing;
    this.br = br;
    this.includeText = includeText;

    // All three mode flags are fully determined by returnSegmentsAsDocs:
    // two of them are its negation, one is the value itself.
    bool wholeDocumentMode = !returnSegmentsAsDocs;
    this.keepBoundaries = wholeDocumentMode;
    this.returnTokensOnEmptyLine = returnSegmentsAsDocs;
    this.hasDocStart = wholeDocumentMode;
}
/// <summary>
/// Builds an <c>Annotation</c> for one document from its tokens and, optionally, its sentence boundaries.
/// </summary>
/// <param name="docId">Value stored under <c>CoreAnnotations.DocIDAnnotation</c>.</param>
/// <param name="tokens">
/// Tokens of the document; stored on the document under the annotation type named by
/// <c>_enclosing.tokensAnnotationClassName</c>.
/// </param>
/// <param name="sentenceBoundaries">
/// Pairs of (first token index, last token index), both inclusive, one per sentence.
/// When null, no sentence annotations are created.
/// </param>
/// <param name="includeText">
/// When true, the document and sentence text are built by joining token texts with a single space,
/// and character begin/end offsets into that text are written onto every token.
/// When false, the document and sentence text are null and token offsets are not written.
/// </param>
/// <returns>
/// The populated document, or null when the tokens annotation type cannot be loaded
/// (the exception's stack trace is printed to standard error in that case).
/// </returns>
private Annotation CreateDoc(string docId, IList <IN> tokens, IList <IntPair> sentenceBoundaries, bool includeText)
{
    try
    {
        string docText = includeText ? ColumnTabDocumentReaderWriter.Join(tokens, typeof(CoreAnnotations.TextAnnotation), " ") : null;
        Annotation doc = new Annotation(docText);
        doc.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
        Type tokensClass = Sharpen.Runtime.GetType(this._enclosing.tokensAnnotationClassName);
        doc.Set(tokensClass, tokens);

        // Character offsets are only meaningful relative to docText, so they are
        // only assigned when the text is actually built.
        bool setTokenCharOffsets = includeText;
        if (setTokenCharOffsets)
        {
            int i = 0;
            foreach (IN token in tokens)
            {
                string tokenText = token.Get(typeof(CoreAnnotations.TextAnnotation));
                token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), i);
                i += tokenText.Length;
                token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), i);
                System.Diagnostics.Debug.Assert((i <= docText.Length));
                // Skip the single space that separates this token from the next one in docText.
                i++;
            }
        }

        if (sentenceBoundaries != null)
        {
            IList <ICoreMap> sentences = new List <ICoreMap>(sentenceBoundaries.Count);
            foreach (IntPair p in sentenceBoundaries)
            {
                // Copy the tokens of this sentence; the boundary's target index is inclusive.
                IList <IN> sentenceTokens = new List <IN>(tokens.SubList(p.GetSource(), p.GetTarget() + 1));

                // Sentence character span = begin offset of its first token .. end offset of its last token.
                // NOTE(review): these offsets are only written above when includeText is true;
                // confirm what Get(...) yields here when includeText is false.
                int begin = sentenceTokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                int last = sentenceTokens.Count - 1;
                int end = sentenceTokens[last].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                string sentenceText = includeText ? ColumnTabDocumentReaderWriter.Join(sentenceTokens, typeof(CoreAnnotations.TextAnnotation), " ") : null;

                // Create a sentence annotation with text, character offsets and token offsets.
                Annotation sentence = new Annotation(sentenceText);
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), begin);
                sentence.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
                sentence.Set(tokensClass, sentenceTokens);
                sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), p.GetSource());
                // TokenEnd is stored as an exclusive index (inclusive target + 1).
                sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), p.GetTarget() + 1);
                int sentenceIndex = sentences.Count;
                sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex);

                // Add the sentence to the list.
                sentences.Add(sentence);
            }

            // Attach the sentence annotations to the document.
            doc.Set(typeof(CoreAnnotations.SentencesAnnotation), sentences);
        }

        return(doc);
    }
    catch (TypeLoadException e)
    {
        // Best effort: report the unloadable tokens annotation type and fall through to return null.
        Sharpen.Runtime.PrintStackTrace(e, System.Console.Error);
    }
    return(null);
}
/// <summary>
/// Convenience constructor: delegates to the four-argument constructor with
/// <c>returnSegmentsAsDocs = true</c> and <c>includeText = false</c>.
/// </summary>
/// <param name="_enclosing">The enclosing <c>ColumnTabDocumentReaderWriter</c> instance.</param>
/// <param name="br">The buffered reader to read from.</param>
public ColumnDocBufferedGetNext(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br)
    : this(_enclosing, br, true, false)
{
    // The enclosing instance must be forwarded explicitly: every overload takes it as the
    // first argument, and the four-argument constructor is the one that stores it.
}
/// <summary>
/// Convenience constructor: delegates to the four-argument constructor with
/// <c>includeText = false</c>.
/// </summary>
/// <param name="_enclosing">The enclosing <c>ColumnTabDocumentReaderWriter</c> instance.</param>
/// <param name="br">The buffered reader to read from.</param>
/// <param name="returnSegmentsAsDocs">Forwarded unchanged to the four-argument constructor.</param>
public ColumnDocBufferedGetNext(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br, bool returnSegmentsAsDocs)
    : this(_enclosing, br, returnSegmentsAsDocs, false)
{
    // The enclosing instance must be forwarded explicitly: every overload takes it as the
    // first argument, and the four-argument constructor is the one that stores it.
}
/// <summary>
/// Creates a token iterator backed by a <c>ColumnDocBufferedGetNext</c> that is constructed
/// with <c>returnSegmentsAsDocs = true</c>.
/// </summary>
/// <param name="_enclosing">The enclosing <c>ColumnTabDocumentReaderWriter</c> instance.</param>
/// <param name="br">The buffered reader handed to the underlying document reader.</param>
public ColumnDocBufferedGetNextTokens(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br)
{
    this._enclosing = _enclosing;
    // The inner reader's first argument is the enclosing reader/writer, not this iterator,
    // so pass _enclosing rather than `this`.
    this.docGetNext = new ColumnTabDocumentReaderWriter.ColumnDocBufferedGetNext(_enclosing, br, true);
}