Beispiel #1
0
 /// <summary>
 /// Stores the enclosing reader/writer, the input reader and the text flag,
 /// then derives the three mode flags from <paramref name="returnSegmentsAsDocs"/>.
 /// </summary>
 /// <param name="_enclosing">Outer instance this object is bound to.</param>
 /// <param name="br">Reader kept in the <c>br</c> field.</param>
 /// <param name="returnSegmentsAsDocs">
 /// When true: keepBoundaries = false, returnTokensOnEmptyLine = true, hasDocStart = false.
 /// When false: the three flags take the opposite values.
 /// </param>
 /// <param name="includeText">Copied verbatim into the <c>includeText</c> field.</param>
 public ColumnDocBufferedGetNext(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br, bool returnSegmentsAsDocs, bool includeText)
 {
     this._enclosing  = _enclosing;
     this.br          = br;
     this.includeText = includeText;

     // Every mode flag is fully determined by returnSegmentsAsDocs:
     // two of them are its negation, one is the value itself.
     bool notSegmentMode = !returnSegmentsAsDocs;
     this.keepBoundaries          = notSegmentMode;
     this.returnTokensOnEmptyLine = returnSegmentsAsDocs;
     this.hasDocStart             = notSegmentMode;
 }
Beispiel #2
0
            /// <summary>
            /// Builds a document <see cref="Annotation"/> from a token list and, when
            /// boundaries are supplied, one sentence annotation per boundary pair.
            /// </summary>
            /// <param name="docId">Value stored under DocIDAnnotation.</param>
            /// <param name="tokens">
            /// Tokens stored on the document under the type resolved from the enclosing
            /// instance's <c>tokensAnnotationClassName</c>.
            /// </param>
            /// <param name="sentenceBoundaries">
            /// Token index pairs, one per sentence; the target index is treated as the last
            /// token of the sentence (SubList is called with target + 1). Pass null to skip
            /// sentence annotations entirely.
            /// </param>
            /// <param name="includeText">
            /// When true, document and sentence text are the tokens' TextAnnotation values
            /// joined with a single space, and character offsets are written onto every token.
            /// When false, the text is null and token offsets are left untouched.
            /// </param>
            /// <returns>
            /// The populated document, or null when a TypeLoadException is caught
            /// (its stack trace is printed to standard error).
            /// </returns>
            private Annotation CreateDoc(string docId, IList <IN> tokens, IList <IntPair> sentenceBoundaries, bool includeText)
            {
                try
                {
                    string documentText = null;
                    if (includeText)
                    {
                        documentText = ColumnTabDocumentReaderWriter.Join(tokens, typeof(CoreAnnotations.TextAnnotation), " ");
                    }

                    Annotation document = new Annotation(documentText);
                    document.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);

                    // Key under which token lists are stored, resolved from its class name.
                    Type tokenListKey = Sharpen.Runtime.GetType(this._enclosing.tokensAnnotationClassName);
                    document.Set(tokenListKey, tokens);

                    // Character offsets only make sense relative to the joined text.
                    if (includeText)
                    {
                        int offset = 0;
                        foreach (IN token in tokens)
                        {
                            string surface = token.Get(typeof(CoreAnnotations.TextAnnotation));
                            token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offset);
                            offset += surface.Length;
                            token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offset);
                            System.Diagnostics.Debug.Assert((offset <= documentText.Length));
                            // Step over the single-space separator inserted by Join.
                            offset++;
                        }
                    }

                    if (sentenceBoundaries == null)
                    {
                        return(document);
                    }

                    IList <ICoreMap> sentenceList = new List <ICoreMap>(sentenceBoundaries.Count);
                    foreach (IntPair span in sentenceBoundaries)
                    {
                        int firstTokenIndex = span.GetSource();
                        int endTokenIndex   = span.GetTarget() + 1;

                        // Copy the sentence's tokens and read its character span from the
                        // first and last token.
                        IList <IN> spanTokens = new List <IN>(tokens.SubList(firstTokenIndex, endTokenIndex));
                        int        charBegin  = spanTokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                        int        charEnd    = spanTokens[spanTokens.Count - 1].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));

                        string spanText = null;
                        if (includeText)
                        {
                            spanText = ColumnTabDocumentReaderWriter.Join(spanTokens, typeof(CoreAnnotations.TextAnnotation), " ");
                        }

                        // Sentence annotation carrying text, character offsets and token offsets.
                        Annotation sentenceAnnotation = new Annotation(spanText);
                        sentenceAnnotation.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charBegin);
                        sentenceAnnotation.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charEnd);
                        sentenceAnnotation.Set(tokenListKey, spanTokens);
                        sentenceAnnotation.Set(typeof(CoreAnnotations.TokenBeginAnnotation), firstTokenIndex);
                        sentenceAnnotation.Set(typeof(CoreAnnotations.TokenEndAnnotation), endTokenIndex);
                        // The index is the sentence's position in the list, taken before it is appended.
                        sentenceAnnotation.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceList.Count);
                        sentenceList.Add(sentenceAnnotation);
                    }

                    // Attach all sentence annotations to the document.
                    document.Set(typeof(CoreAnnotations.SentencesAnnotation), sentenceList);
                    return(document);
                }
                catch (TypeLoadException e)
                {
                    Sharpen.Runtime.PrintStackTrace(e, System.Console.Error);
                    return(null);
                }
            }
Beispiel #3
0
 /// <summary>
 /// Convenience overload: delegates to the four-argument constructor with
 /// returnSegmentsAsDocs = true and includeText = false.
 /// </summary>
 /// <param name="_enclosing">Outer instance this object is bound to.</param>
 /// <param name="br">Reader to consume.</param>
 public ColumnDocBufferedGetNext(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br)
     : this(_enclosing, br, true, false)
 {
     // _enclosing must be forwarded explicitly: every overload takes it as the
     // first argument, and the four-argument constructor stores it in the field.
 }
Beispiel #4
0
 /// <summary>
 /// Convenience overload: delegates to the four-argument constructor with
 /// includeText = false.
 /// </summary>
 /// <param name="_enclosing">Outer instance this object is bound to.</param>
 /// <param name="br">Reader to consume.</param>
 /// <param name="returnSegmentsAsDocs">Forwarded unchanged to the four-argument constructor.</param>
 public ColumnDocBufferedGetNext(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br, bool returnSegmentsAsDocs)
     : this(_enclosing, br, returnSegmentsAsDocs, false)
 {
     // _enclosing must be forwarded explicitly: every overload takes it as the
     // first argument, and the four-argument constructor stores it in the field.
 }
Beispiel #5
0
 /// <summary>
 /// Stores the enclosing reader/writer and creates the underlying
 /// ColumnDocBufferedGetNext over <paramref name="br"/> with
 /// returnSegmentsAsDocs = true.
 /// </summary>
 /// <param name="_enclosing">Outer instance this object is bound to.</param>
 /// <param name="br">Reader handed to the underlying ColumnDocBufferedGetNext.</param>
 public ColumnDocBufferedGetNextTokens(ColumnTabDocumentReaderWriter <In> _enclosing, BufferedReader br)
 {
     this._enclosing = _enclosing;
     // The nested getter's first parameter is the outer reader/writer, so pass
     // _enclosing; `this` is the token getter itself, not the outer instance.
     this.docGetNext = new ColumnTabDocumentReaderWriter.ColumnDocBufferedGetNext(_enclosing, br, true);
 }