Example #1
0
        /// <summary>
        /// 查找指定数目的Term
        /// </summary>
        /// <param name="num"></param>
        /// <returns></returns>
        public TermModel[] FindTerms(int num)
        {
            num++;
            TermInfoQueue queue = new TermInfoQueue(num);
            TermEnum      enum2 = open.Reader.Terms();
            int           count = 0;

            while (enum2.Next())
            {
                string str = enum2.Term().Field();
                if ((currentField != null) && (!str.Equals(currentField)))
                {
                    continue;
                }
                if (enum2.DocFreq() > count)
                {
                    queue.Put(new TermModel(enum2.Term(), enum2.DocFreq()));
                    if (queue.Size() < num)
                    {
                        continue;
                    }
                    queue.Pop();
                    count = ((TermModel)queue.Top()).Count;
                }
            }
            enum2.Close();
            TermModel[] modleArray = new TermModel[queue.Size()];
            for (int i = 0; i < modleArray.Length; i++)
            {
                modleArray[(modleArray.Length - i) - 1] = (TermModel)queue.Pop();
            }
            return(modleArray);
        }
Example #2
0
        public IEnumerable <TermInfo> GetTerms()
        {
            var         directory   = _openIndexModel.Directory;
            IndexReader indexReader = null;
            TermEnum    terms       = null;

            try
            {
                indexReader = IndexReader.Open(directory, true); // ToDo should i open this only once
                terms       = indexReader.Terms();

                while (terms.Next())
                {
                    System.Threading.Thread.Sleep(2);
                    var term = terms.Term();
                    yield return(new TermInfo {
                        Term = term.Text(), Field = term.Field(), Frequency = terms.DocFreq()
                    });
                }
            }
            finally
            {
                if (indexReader != null)
                {
                    indexReader.Close();
                }

                if (terms != null)
                {
                    terms.Close();
                }
            }

            yield break;
        }
 /// <summary> Returns the docFreq of the current Term in the enumeration.
 /// Returns -1 if no Term matches or all terms have been enumerated.
 /// </summary>
 public override int DocFreq()
 {
     if (currentTerm == null)
     {
         return(-1);
     }
     System.Diagnostics.Debug.Assert(actualEnum != null);
     return(actualEnum.DocFreq());
 }
Example #4
0
        /// <summary>
        /// 查找指定字段Term
        /// </summary>
        /// <param name="field"></param>
        /// <param name="text"></param>
        /// <param name="current"></param>
        /// <returns></returns>
        public TermModel FindTerm(string field, string text, bool current)
        {
            TermEnum enum2 = open.Reader.Terms();

            if (enum2.SkipTo(new Term(field, text)))
            {
                TermModel modle2 = null;
                while ((!current && enum2.Next() && field.Equals(enum2.Term().Field())) ||
                       current)
                {
                    modle2 = new TermModel(enum2.Term(), enum2.DocFreq());
                    break;
                }
                enum2.Close();
                return(modle2);
            }
            return(null);
        }
Example #5
0
        private OpenBitSet FastBits(IndexReader reader)
        {
            OpenBitSet bits = new OpenBitSet(reader.MaxDoc);

            bits.Set(0, reader.MaxDoc); //assume all are valid
            Term     startTerm = new Term(fieldName);
            TermEnum te        = reader.Terms(startTerm);

            if (te != null)
            {
                Term currTerm = te.Term;

                while ((currTerm != null) && (currTerm.Field == startTerm.Field)) //term fieldnames are interned
                {
                    if (te.DocFreq() > 1)
                    {
                        int lastDoc = -1;
                        //unset potential duplicates
                        TermDocs td = reader.TermDocs(currTerm);
                        td.Next();
                        if (keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            td.Next();
                        }
                        do
                        {
                            lastDoc = td.Doc;
                            bits.Clear(lastDoc);
                        } while (td.Next());
                        if (keepMode == KM_USE_LAST_OCCURRENCE)
                        {
                            //restore the last bit
                            bits.Set(lastDoc);
                        }
                    }
                    if (!te.Next())
                    {
                        break;
                    }
                    currTerm = te.Term;
                }
            }
            return(bits);
        }
        /*
         * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
         *
         * @param reader     The {@link IndexReader} which will be consulted to identify potential stop words that
         *                   exceed the required document frequency
         * @param fieldName  The field for which stopwords will be added
         * @param maxDocFreq The maximum number of index documents which
         *                   can contain a term, after which the term is considered to be a stop word.
         * @return The number of stop words identified.
         * @throws IOException
         */
        public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
        {
            var      stopWords         = Support.Compatibility.SetFactory.CreateHashSet <string>();
            String   internedFieldName = StringHelper.Intern(fieldName);
            TermEnum te   = reader.Terms(new Term(fieldName));
            Term     term = te.Term;

            while (term != null)
            {
                if (term.Field != internedFieldName)
                {
                    break;
                }
                if (te.DocFreq() > maxDocFreq)
                {
                    stopWords.Add(term.Text);
                }
                if (!te.Next())
                {
                    break;
                }
                term = te.Term;
            }
            stopWordsPerField.Add(fieldName, stopWords);

            /* if the stopwords for a field are changed,
             * then saved streams for that field are erased.
             */
            IDictionary <String, SavedStreams> streamMap = (IDictionary <String, SavedStreams>)PreviousTokenStream;

            if (streamMap != null)
            {
                streamMap.Remove(fieldName);
            }

            return(stopWords.Count);
        }
Example #7
0
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString == null)
            {
                return;
            }
            TokenStream   ts      = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
            TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

            int       corpusNumDocs            = reader.NumDocs();
            Term      internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
            Hashtable processedTerms           = new Hashtable();

            while (ts.IncrementToken())
            {
                String term = termAtt.Term();
                if (!processedTerms.Contains(term))
                {
                    processedTerms.Add(term, term);
                    ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                    float          minScore  = 0;
                    Term           startTerm = internSavingTemplateTerm.CreateTerm(term);
                    FuzzyTermEnum  fe        = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
                    TermEnum       origEnum  = reader.Terms(startTerm);
                    int            df        = 0;
                    if (startTerm.Equals(origEnum.Term()))
                    {
                        df = origEnum.DocFreq(); //store the df so all variants use same idf
                    }
                    int numVariants          = 0;
                    int totalVariantDocFreqs = 0;
                    do
                    {
                        Term possibleMatch = fe.Term();
                        if (possibleMatch != null)
                        {
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq();
                            float score = fe.Difference();
                            if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                                variantsQ.Insert(st);
                                minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                            }
                        }
                    }while (fe.Next());
                    if (numVariants > 0)
                    {
                        int avgDf = totalVariantDocFreqs / numVariants;
                        if (df == 0)    //no direct match we can use as df for all variants
                        {
                            df = avgDf; //use avg df of all variants
                        }

                        // take the top variants (scored by edit distance) and reset the score
                        // to include an IDF factor then add to the global queue for ranking
                        // overall top query terms
                        int size = variantsQ.Size();
                        for (int i = 0; i < size; i++)
                        {
                            ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                            st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                            q.Insert(st);
                        }
                    }
                }
            }
        }
Example #8
0
        public static TermInfo[] GetHighFreqTerms(Directory dir,
                                                  Hashtable junkWords,
                                                  int numTerms,
                                                  String[] fields)
        {
            if (dir == null || fields == null)
            {
                return(new TermInfo[0]);
            }

            IndexReader   reader = IndexReader.Open(dir, true);
            TermInfoQueue tiq    = new TermInfoQueue(numTerms);
            TermEnum      terms  = reader.Terms();

            int minFreq = 0;

            while (terms.Next())
            {
                String field = terms.Term().Field();

                if (fields != null && fields.Length > 0)
                {
                    bool skip = true;

                    for (int i = 0; i < fields.Length; i++)
                    {
                        if (field.Equals(fields[i]))
                        {
                            skip = false;
                            break;
                        }
                    }
                    if (skip)
                    {
                        continue;
                    }
                }

                if (junkWords != null && junkWords[terms.Term().Text()] != null)
                {
                    continue;
                }

                if (terms.DocFreq() > minFreq)
                {
                    TermInfo top = (TermInfo)tiq.Add(new TermInfo(terms.Term(), terms.DocFreq()));
                    if (tiq.Size() >= numTerms)                        // if tiq overfull
                    {
                        tiq.Pop();                                     // remove lowest in tiq
                        minFreq = top.DocFreq;                         // reset minFreq
                    }
                }
            }

            TermInfo[] res = new TermInfo[tiq.Size()];

            for (int i = 0; i < res.Length; i++)
            {
                res[res.Length - i - 1] = (TermInfo)tiq.Pop();
            }

            reader.Close();

            return(res);
        }