Example #1
0
        /// <summary> Inverts the indexed fields of a document: every term is handed to
        /// AddPosition together with its position, and the per-field length,
        /// position and boost accumulators are updated.</summary>
        /// <param name="doc">The document whose fields are inverted
        /// </param>
        private void  InvertDocument(Document doc)
        {
            foreach (Field field in doc.Fields())
            {
                System.String name   = field.Name();
                int           number = fieldInfos.FieldNumber(name);

                // Start from the totals currently stored for this field number.
                int termCount    = fieldLengths[number];
                int nextPosition = fieldPositions[number];

                if (!field.IsIndexed())
                {
                    continue; // nothing is recorded for a field that is not indexed
                }

                if (field.IsTokenized())
                {
                    // Prefer the field's Reader; otherwise wrap its String value.
                    System.IO.TextReader source = field.ReaderValue();
                    if (source == null)
                    {
                        System.String text = field.StringValue();
                        if (text == null)
                        {
                            throw new System.ArgumentException("Field must have either String or Reader value");
                        }
                        source = new System.IO.StringReader(text);
                    }

                    // Feed each token produced by the analyzer into the posting table.
                    TokenStream tokens = analyzer.TokenStream(name, source);
                    try
                    {
                        Token token = tokens.Next();
                        while (token != null)
                        {
                            // An increment of 1 leaves the position unchanged here;
                            // other increments shift it before the term is added.
                            nextPosition += token.GetPositionIncrement() - 1;
                            AddPosition(name, token.TermText(), nextPosition);
                            nextPosition++;

                            termCount++;
                            if (termCount > maxFieldLength)
                            {
                                break; // stop once the count exceeds maxFieldLength
                            }

                            token = tokens.Next();
                        }
                    }
                    finally
                    {
                        tokens.Close();
                    }
                }
                else
                {
                    // Un-tokenized field: the whole string value is added as one term.
                    AddPosition(name, field.StringValue(), nextPosition);
                    nextPosition++;
                    termCount++;
                }

                // Write the updated totals back for this field number.
                fieldLengths[number]   = termCount;
                fieldPositions[number] = nextPosition;
                fieldBoosts[number]   *= field.GetBoost();
            }
        }
Example #2
0
        /// <summary> Writes the stored fields of a document to the fields stream, after
        /// recording the fields stream's current file pointer in the index stream.</summary>
        /// <param name="doc">The document whose stored fields are written
        /// </param>
        internal void  AddDocument(Document doc)
        {
            // Record where this document's entry begins in the fields stream.
            long entryStart = fieldsStream.GetFilePointer();
            indexStream.WriteLong(entryStart);

            // First pass: the number of stored fields is written ahead of the fields.
            int stored = 0;
            foreach (Field candidate in doc.Fields())
            {
                stored += candidate.IsStored() ? 1 : 0;
            }
            fieldsStream.WriteVInt(stored);

            // Second pass: field number, flag byte, then the string value of each stored field.
            foreach (Field field in doc.Fields())
            {
                if (!field.IsStored())
                {
                    continue;
                }

                int number = fieldInfos.FieldNumber(field.Name());
                fieldsStream.WriteVInt(number);

                // Lowest bit of the flag byte is set when the field is tokenized.
                byte flags = (byte) (field.IsTokenized() ? 1 : 0);
                fieldsStream.WriteByte(flags);

                fieldsStream.WriteString(field.StringValue());
            }
        }
Example #3
0
        /// <summary> Writes a term to the output as a delta against the previously written
        /// term: the shared prefix length, the remaining characters, then the field number.
        /// The term then becomes the new reference term.</summary>
        /// <param name="term">The term to write
        /// </param>
        private void  WriteTerm(Term term)
        {
            System.String text = term.text;

            // Length of the prefix shared with the previously written term's text.
            int prefixLength = StringHelper.StringDifference(lastTerm.text, text);
            int suffixLength = text.Length - prefixLength;

            output.WriteVInt(prefixLength);
            output.WriteVInt(suffixLength);
            output.WriteChars(text, prefixLength, suffixLength); // only the non-shared characters

            int fieldNumber = fieldInfos.FieldNumber(term.field);
            output.WriteVInt(fieldNumber);

            lastTerm = term;
        }
Example #4
0
        /// <summary> Retrieve the term vector for the given document and field</summary>
        /// <param name="docNum">The document number to retrieve the vector for
        /// </param>
        /// <param name="field">The field within the document to retrieve
        /// </param>
        /// <returns> The TermFreqVector for the document and field, or null when there is
        /// no tvx stream, the field is not listed for the document, or any exception
        /// is raised while reading
        /// </returns>
        public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
        {
            lock (this)
            {
                int wantedNumber = fieldInfos.FieldNumber(field);

                // Without a tvx stream there is nothing to look up.
                if (tvx == null)
                {
                    System.Console.Out.WriteLine("No tvx file");
                    return null;
                }

                TermFreqVector vector = null;
                try
                {
                    // The tvx entries are 8 bytes each and start after FORMAT_SIZE bytes.
                    // Later seeks use pointers read from the files, so they need no adjustment.
                    tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
                    long tvdPointer = tvx.ReadLong();

                    tvd.Seek(tvdPointer);
                    int fieldCount = tvd.ReadVInt();

                    // Field numbers are stored as deltas. Every one of them is read, both
                    // to find the requested field and to advance to the tvf pointers that follow.
                    int currentNumber = 0;
                    int matchIndex    = -1;
                    for (int index = 0; index < fieldCount; index++)
                    {
                        currentNumber += tvd.ReadVInt();
                        if (currentNumber == wantedNumber)
                        {
                            matchIndex = index;
                        }
                    }

                    // A negative index means the field is not listed for this document.
                    if (matchIndex >= 0)
                    {
                        // The tvf pointers are summed up to and including the matching field.
                        long tvfPointer = 0;
                        for (int remaining = matchIndex + 1; remaining > 0; remaining--)
                        {
                            tvfPointer += tvd.ReadVLong();
                        }
                        vector = ReadTermVector(field, tvfPointer);
                    }
                }
                catch (System.Exception)
                {
                    // NOTE(review): every failure is swallowed here and surfaces only as a null result.
                }

                return vector;
            }
        }