Beispiel #1
0
        /// <summary>
        /// Sort input to output, explicit hint for the buffer size. The amount of allocated
        /// memory may deviate from the hint (may be smaller or larger).
        /// </summary>
        public SortInfo Sort(FileInfo input, FileInfo output)
        {
            sortInfo = new SortInfo(this)
            {
                TotalTime = Environment.TickCount
            };

            // LUCENENET NOTE: Can't do this because another thread could recreate the file before we are done here.
            // and cause this to bomb. We use the existence of the file as an indicator that we are done using it.
            //output.Delete();

            var  merges   = new List <FileInfo>();
            bool success2 = false;

            try
            {
                var  inputStream = new ByteSequencesReader(input);
                bool success     = false;
                try
                {
                    int lines = 0;
                    while ((lines = ReadPartition(inputStream)) > 0)
                    {
                        merges.Add(SortPartition(lines));
                        sortInfo.TempMergeFiles++;
                        sortInfo.Lines += lines;

                        // Handle intermediate merges.
                        if (merges.Count == MaxTempFiles)
                        {
                            var intermediate = new FileInfo(Path.GetTempFileName());
                            try
                            {
                                MergePartitions(merges, intermediate);
                            }
                            finally
                            {
                                foreach (var file in merges)
                                {
                                    file.Delete();
                                }
                                merges.Clear();
                                merges.Add(intermediate);
                            }
                            sortInfo.TempMergeFiles++;
                        }
                    }
                    success = true;
                }
                finally
                {
                    if (success)
                    {
                        IOUtils.Close(inputStream);
                    }
                    else
                    {
                        IOUtils.CloseWhileHandlingException(inputStream);
                    }
                }

                // One partition, try to rename or copy if unsuccessful.
                if (merges.Count == 1)
                {
                    FileInfo single = merges[0];
                    Copy(single, output);
                    try
                    {
                        File.Delete(single.FullName);
                    }
                    catch (Exception)
                    {
                        // ignored
                    }
                }
                else
                {
                    // otherwise merge the partitions with a priority queue.
                    MergePartitions(merges, output);
                }
                success2 = true;
            }
            finally
            {
                foreach (FileInfo file in merges)
                {
                    file.Delete();
                }
                if (!success2)
                {
                    output.Delete();
                }
            }

            sortInfo.TotalTime = (Environment.TickCount - sortInfo.TotalTime);
            return(sortInfo);
        }
Beispiel #2
0
        /// <summary>
        /// Creates a query from the analysis chain.
        /// <p>
        /// Expert: this is more useful for subclasses such as queryparsers.
        /// If using this class directly, just use <seealso cref="#createBooleanQuery(String, String)"/>
        /// and <seealso cref="#createPhraseQuery(String, String)"/> </summary>
        /// <param name="analyzer"> analyzer used for this query </param>
        /// <param name="operator"> default boolean operator used for this query </param>
        /// <param name="field"> field to create queries against </param>
        /// <param name="queryText"> text to be passed to the analysis chain </param>
        /// <param name="quoted"> true if phrases should be generated when terms occur at more than one position </param>
        /// <param name="phraseSlop"> slop factor for phrase/multiphrase queries </param>
        protected Query CreateFieldQuery(Analyzer analyzer, Occur @operator, string field, string queryText, bool quoted, int phraseSlop)
        {
            Debug.Assert(@operator == Occur.SHOULD || @operator == Occur.MUST);
            // Use the analyzer to get all the tokens, and then build a TermQuery,
            // PhraseQuery, or nothing based on the term count
            CachingTokenFilter          buffer     = null;
            ITermToBytesRefAttribute    termAtt    = null;
            IPositionIncrementAttribute posIncrAtt = null;
            int  numTokens     = 0;
            int  positionCount = 0;
            bool severalTokensAtSamePosition = false;
            bool hasMoreTokens = false;

            TokenStream source = null;

            try
            {
                source = analyzer.GetTokenStream(field, new StringReader(queryText));
                source.Reset();
                buffer = new CachingTokenFilter(source);
                buffer.Reset();

                if (buffer.HasAttribute <ITermToBytesRefAttribute>())
                {
                    termAtt = buffer.GetAttribute <ITermToBytesRefAttribute>();
                }
                if (buffer.HasAttribute <IPositionIncrementAttribute>())
                {
                    posIncrAtt = buffer.GetAttribute <IPositionIncrementAttribute>();
                }

                if (termAtt != null)
                {
                    try
                    {
                        hasMoreTokens = buffer.IncrementToken();
                        while (hasMoreTokens)
                        {
                            numTokens++;
                            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.PositionIncrement : 1;
                            if (positionIncrement != 0)
                            {
                                positionCount += positionIncrement;
                            }
                            else
                            {
                                severalTokensAtSamePosition = true;
                            }
                            hasMoreTokens = buffer.IncrementToken();
                        }
                    }
                    catch (System.IO.IOException)
                    {
                        // ignore
                    }
                }
            }
            catch (System.IO.IOException e)
            {
                throw new Exception("Error analyzing query text", e);
            }
            finally
            {
                IOUtils.CloseWhileHandlingException(source);
            }

            // rewind the buffer stream
            buffer.Reset();

            BytesRef bytes = termAtt == null ? null : termAtt.BytesRef;

            if (numTokens == 0)
            {
                return(null);
            }
            else if (numTokens == 1)
            {
                try
                {
                    bool hasNext = buffer.IncrementToken();
                    Debug.Assert(hasNext == true);
                    termAtt.FillBytesRef();
                }
                catch (System.IO.IOException)
                {
                    // safe to ignore, because we know the number of tokens
                }
                return(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))));
            }
            else
            {
                if (severalTokensAtSamePosition || (!quoted))
                {
                    if (positionCount == 1 || (!quoted))
                    {
                        // no phrase query:

                        if (positionCount == 1)
                        {
                            // simple case: only one position, with synonyms
                            BooleanQuery q = NewBooleanQuery(true);
                            for (int i = 0; i < numTokens; i++)
                            {
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    Debug.Assert(hasNext == true);
                                    termAtt.FillBytesRef();
                                }
                                catch (System.IO.IOException)
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                Query currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                                q.Add(currentQuery, Occur.SHOULD);
                            }
                            return(q);
                        }
                        else
                        {
                            // multiple positions
                            BooleanQuery q            = NewBooleanQuery(false);
                            Query        currentQuery = null;
                            for (int i = 0; i < numTokens; i++)
                            {
                                try
                                {
                                    bool hasNext = buffer.IncrementToken();
                                    Debug.Assert(hasNext == true);
                                    termAtt.FillBytesRef();
                                }
                                catch (System.IO.IOException)
                                {
                                    // safe to ignore, because we know the number of tokens
                                }
                                if (posIncrAtt != null && posIncrAtt.PositionIncrement == 0)
                                {
                                    if (!(currentQuery is BooleanQuery))
                                    {
                                        Query t = currentQuery;
                                        currentQuery = NewBooleanQuery(true);
                                        ((BooleanQuery)currentQuery).Add(t, Occur.SHOULD);
                                    }
                                    ((BooleanQuery)currentQuery).Add(NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes))), Occur.SHOULD);
                                }
                                else
                                {
                                    if (currentQuery != null)
                                    {
                                        q.Add(currentQuery, @operator);
                                    }
                                    currentQuery = NewTermQuery(new Term(field, BytesRef.DeepCopyOf(bytes)));
                                }
                            }
                            q.Add(currentQuery, @operator);
                            return(q);
                        }
                    }
                    else
                    {
                        // phrase query:
                        MultiPhraseQuery mpq = NewMultiPhraseQuery();
                        mpq.Slop = phraseSlop;
                        IList <Term> multiTerms = new List <Term>();
                        int          position   = -1;
                        for (int i = 0; i < numTokens; i++)
                        {
                            int positionIncrement = 1;
                            try
                            {
                                bool hasNext = buffer.IncrementToken();
                                Debug.Assert(hasNext == true);
                                termAtt.FillBytesRef();
                                if (posIncrAtt != null)
                                {
                                    positionIncrement = posIncrAtt.PositionIncrement;
                                }
                            }
                            catch (System.IO.IOException)
                            {
                                // safe to ignore, because we know the number of tokens
                            }

                            if (positionIncrement > 0 && multiTerms.Count > 0)
                            {
                                if (enablePositionIncrements)
                                {
                                    mpq.Add(multiTerms.ToArray(), position);
                                }
                                else
                                {
                                    mpq.Add(multiTerms.ToArray());
                                }
                                multiTerms.Clear();
                            }
                            position += positionIncrement;
                            multiTerms.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                        if (enablePositionIncrements)
                        {
                            mpq.Add(multiTerms.ToArray(), position);
                        }
                        else
                        {
                            mpq.Add(multiTerms.ToArray());
                        }
                        return(mpq);
                    }
                }
                else
                {
                    PhraseQuery pq = NewPhraseQuery();
                    pq.Slop = phraseSlop;
                    int position = -1;

                    for (int i = 0; i < numTokens; i++)
                    {
                        int positionIncrement = 1;

                        try
                        {
                            bool hasNext = buffer.IncrementToken();
                            Debug.Assert(hasNext == true);
                            termAtt.FillBytesRef();
                            if (posIncrAtt != null)
                            {
                                positionIncrement = posIncrAtt.PositionIncrement;
                            }
                        }
                        catch (System.IO.IOException)
                        {
                            // safe to ignore, because we know the number of tokens
                        }

                        if (enablePositionIncrements)
                        {
                            position += positionIncrement;
                            pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)), position);
                        }
                        else
                        {
                            pq.Add(new Term(field, BytesRef.DeepCopyOf(bytes)));
                        }
                    }
                    return(pq);
                }
            }
        }