示例#1
0
        public static void ReadLine(DataInput input, BytesRef scratch)
        {
            var upto = 0;

            while (true)
            {
                var b = input.ReadByte();
                if (scratch.Bytes.Length == upto)
                {
                    scratch.Grow(1 + upto);
                }
                if (b == ESCAPE)
                {
                    scratch.Bytes[upto++] = input.ReadByte();
                }
                else
                {
                    if (b == NEWLINE)
                    {
                        break;
                    }

                    scratch.Bytes[upto++] = b;
                }
            }
            scratch.Offset = 0;
            scratch.Length = upto;
        }
示例#2
0
 public static void ReadLine(DataInput input, BytesRef scratch)
 {
     var upto = 0;
     while (true)
     {
         var b = input.ReadSByte();
         if (scratch.Bytes.Length == upto)
         {
             scratch.Grow(1 + upto);
         }
         if (b == ESCAPE)
         {
             scratch.Bytes[upto++] = input.ReadSByte();
         }
         else
         {
             if (b == NEWLINE)
             {
                 break;
             }
             
             scratch.Bytes[upto++] = b;
         }
     }
     scratch.Offset = 0;
     scratch.Length = upto;
 }
示例#3
0
            public override int NextPosition()
            {
                int pos;

                if (_readPositions)
                {
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS), "got line=" + _scratch.Utf8ToString());
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.POS.Length, _scratch.Length - SimpleTextFieldsWriter.POS.Length,
                                            _scratchUtf162);
                    pos = ArrayUtil.ParseInt(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                }
                else
                {
                    pos = -1;
                }

                if (_readOffsets)
                {
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET), "got line=" + _scratch.Utf8ToString());
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length,
                                            _scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, _scratchUtf162);
                    _startOffset = ArrayUtil.ParseInt(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                    SimpleTextUtil.ReadLine(_in, _scratch);
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET), "got line=" + _scratch.Utf8ToString());
                    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length,
                                            _scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, _scratchUtf162);
                    _endOffset = ArrayUtil.ParseInt(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
                }

                long fp = _in.FilePointer;

                SimpleTextUtil.ReadLine(_in, _scratch);
                if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD))
                {
                    int len = _scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length;
                    if (_scratch2.Bytes.Length < len)
                    {
                        _scratch2.Grow(len);
                    }
                    Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.PAYLOAD.Length, _scratch2.Bytes, 0, len);
                    _scratch2.Length = len;
                    _payload         = _scratch2;
                }
                else
                {
                    _payload = null;
                    _in.Seek(fp);
                }
                return(pos);
            }
            private void LoadTerms()
            {
                var posIntOutputs = PositiveIntOutputs.Singleton;
                var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
                var outputs = new PairOutputs<long?, PairOutputs<long?,long?>.Pair>(posIntOutputs, outputsInner);

                // honestly, wtf kind of generic mess is this.
                var b = new Builder<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                var input = (IndexInput) _outerInstance._input.Clone();
                input.Seek(_termsStart);

                var lastTerm = new BytesRef(10);
                long lastDocsStart = -1;
                int docFreq = 0;
                long totalTermFreq = 0;
                var visitedDocs = new FixedBitSet(_maxDoc);

                var scratchIntsRef = new IntsRef();
                while (true)
                {
                    SimpleTextUtil.ReadLine(input, _scratch);
                    if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                            _sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        _sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
                            _scratchUtf16);
                        int docId = ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                        visitedDocs.Set(docId);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                            _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
                        totalTermFreq += ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                        }
                        lastDocsStart = input.FilePointer;
                        int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length = len;
                        docFreq = 0;
                        _sumTotalTermFreq += totalTermFreq;
                        totalTermFreq = 0;
                        _termCount++;
                    }
                }
                _docCount = visitedDocs.Cardinality();
                _fst = b.Finish();
            }
        public override Fields Get(int doc)
        {
            var fields = new SortedDictionary<string, SimpleTVTerms>();

            _input.Seek(_offsets[doc]);
            ReadLine();
            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS));
            var numFields = ParseIntAt(SimpleTextTermVectorsWriter.NUMFIELDS.Length);
            if (numFields == 0)
            {
                return null; // no vectors for this doc
            }
            for (var i = 0; i < numFields; i++)
            {
                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD));
                // skip fieldNumber:
                ParseIntAt(SimpleTextTermVectorsWriter.FIELD.Length);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME));
                var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS));
                var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch));

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS));
                var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch));

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS));
                var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch));

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT));
                var termCount = ParseIntAt(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length);

                var terms = new SimpleTVTerms(offsets, positions, payloads);
                fields.Add(fieldName, terms);

                for (var j = 0; j < termCount; j++)
                {
                    ReadLine();
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT));
                    var term = new BytesRef();
                    var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length;
                    term.Grow(termLength);
                    term.Length = termLength;
                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength);

                    var postings = new SimpleTVPostings();
                    terms.TERMS.Add(term, postings);

                    ReadLine();
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ));
                    postings.FREQ = ParseIntAt(SimpleTextTermVectorsWriter.TERMFREQ.Length);

                    if (!positions && !offsets) continue;

                    if (positions)
                    {
                        postings.POSITIONS = new int[postings.FREQ];
                        if (payloads)
                        {
                            postings.PAYLOADS = new BytesRef[postings.FREQ];
                        }
                    }

                    if (offsets)
                    {
                        postings.START_OFFSETS = new int[postings.FREQ];
                        postings.END_OFFSETS = new int[postings.FREQ];
                    }

                    for (var k = 0; k < postings.FREQ; k++)
                    {
                        if (positions)
                        {
                            ReadLine();
                            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION));
                            postings.POSITIONS[k] = ParseIntAt(SimpleTextTermVectorsWriter.POSITION.Length);
                            if (payloads)
                            {
                                ReadLine();
                                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.PAYLOAD));
                                if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0)
                                {
                                    postings.PAYLOADS[k] = null;
                                }
                                else
                                {
                                    var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length];
                                    Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0,
                                        payloadBytes.Length);
                                    postings.PAYLOADS[k] = new BytesRef(payloadBytes);
                                }
                            }
                        }

                        if (!offsets) continue;

                        ReadLine();
                        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.STARTOFFSET));
                        postings.START_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.STARTOFFSET.Length);

                        ReadLine();
                        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET));
                        postings.END_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.ENDOFFSET.Length);
                    }
                }
            }
            return new SimpleTVFields(this, fields);
        }
示例#6
0
            private void LoadTerms()
            {
                var posIntOutputs = PositiveIntOutputs.Singleton;
                var outputsInner  = new PairOutputs <long?, long?>(posIntOutputs, posIntOutputs);
                var outputs       = new PairOutputs <long?, PairOutputs <long?, long?> .Pair>(posIntOutputs, outputsInner);

                // honestly, wtf kind of generic mess is this.
                var b     = new Builder <PairOutputs <long?, PairOutputs <long?, long?> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                var input = (IndexInput)_outerInstance._input.Clone();

                input.Seek(_termsStart);

                var  lastTerm      = new BytesRef(10);
                long lastDocsStart = -1;
                int  docFreq       = 0;
                long totalTermFreq = 0;
                var  visitedDocs   = new FixedBitSet(_maxDoc);

                var scratchIntsRef = new IntsRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(input, _scratch);
                    if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                            _sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        _sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
                                                _scratchUtf16);
                        int docId = ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                        visitedDocs.Set(docId);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                                                _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
                        totalTermFreq += ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                        }
                        lastDocsStart = input.FilePointer;
                        int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length    = len;
                        docFreq            = 0;
                        _sumTotalTermFreq += totalTermFreq;
                        totalTermFreq      = 0;
                        _termCount++;
                    }
                }
                _docCount = visitedDocs.Cardinality();
                _fst      = b.Finish();
            }