/// <summary>
/// Finds the specified number of highest-frequency Terms.
/// </summary>
/// <param name="num">Number of terms to return.</param>
/// <returns>The top terms, ordered by descending document frequency.</returns>
public TermModel[] FindTerms(int num)
{
    num++;
    TermInfoQueue queue = new TermInfoQueue(num);
    TermEnum termEnum = open.Reader.Terms();
    int minFreq = 0;
    while (termEnum.Next())
    {
        string field = termEnum.Term().Field();
        if ((currentField != null) && (!field.Equals(currentField)))
        {
            continue;
        }
        if (termEnum.DocFreq() > minFreq)
        {
            queue.Put(new TermModel(termEnum.Term(), termEnum.DocFreq()));
            if (queue.Size() < num)
            {
                continue;
            }
            queue.Pop(); // queue is full: drop the lowest-frequency entry
            minFreq = ((TermModel)queue.Top()).Count;
        }
    }
    termEnum.Close();
    TermModel[] modelArray = new TermModel[queue.Size()];
    for (int i = 0; i < modelArray.Length; i++)
    {
        // pop ascending, fill back-to-front so the result is descending
        modelArray[modelArray.Length - i - 1] = (TermModel)queue.Pop();
    }
    return modelArray;
}
public IEnumerable<TermInfo> GetTerms()
{
    var directory = _openIndexModel.Directory;
    IndexReader indexReader = null;
    TermEnum terms = null;
    try
    {
        indexReader = IndexReader.Open(directory, true); // TODO: should this be opened only once?
        terms = indexReader.Terms();
        while (terms.Next())
        {
            System.Threading.Thread.Sleep(2); // throttle the enumeration slightly
            var term = terms.Term();
            yield return new TermInfo
            {
                Term = term.Text(),
                Field = term.Field(),
                Frequency = terms.DocFreq()
            };
        }
    }
    finally
    {
        // close the enumerator before the reader that produced it
        if (terms != null)
        {
            terms.Close();
        }
        if (indexReader != null)
        {
            indexReader.Close();
        }
    }
}
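// Consumption sketch for GetTerms() above (hypothetical caller: "termReader"
// stands in for whatever object hosts the method; requires System.Linq).
// Because the method is a lazy iterator, Take(10) stops enumeration early,
// and the finally block still closes the enumerator and reader on dispose.
var firstTen = termReader.GetTerms().Take(10).ToList();
foreach (var info in firstTen)
{
    Console.WriteLine("{0}:{1} df={2}", info.Field, info.Term, info.Frequency);
}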
/// <summary> Returns the docFreq of the current Term in the enumeration.
/// Returns -1 if no Term matches or all terms have been enumerated.
/// </summary>
public override int DocFreq()
{
    if (currentTerm == null)
    {
        return -1;
    }
    System.Diagnostics.Debug.Assert(actualEnum != null);
    return actualEnum.DocFreq();
}
/// <summary>
/// Finds a Term in the specified field.
/// </summary>
/// <param name="field">Field name to match.</param>
/// <param name="text">Term text to seek to.</param>
/// <param name="current">If true, return the term at the seeked position; otherwise advance to the next term within the same field.</param>
/// <returns>The matching term, or null if none is found.</returns>
public TermModel FindTerm(string field, string text, bool current)
{
    TermEnum termEnum = open.Reader.Terms();
    if (termEnum.SkipTo(new Term(field, text)))
    {
        TermModel model = null;
        while ((!current && termEnum.Next() && field.Equals(termEnum.Term().Field())) || current)
        {
            model = new TermModel(termEnum.Term(), termEnum.DocFreq());
            break;
        }
        termEnum.Close();
        return model;
    }
    return null;
}
private OpenBitSet FastBits(IndexReader reader)
{
    OpenBitSet bits = new OpenBitSet(reader.MaxDoc);
    bits.Set(0, reader.MaxDoc); // assume all docs are valid
    Term startTerm = new Term(fieldName);
    TermEnum te = reader.Terms(startTerm);
    if (te != null)
    {
        Term currTerm = te.Term;
        while ((currTerm != null) && (currTerm.Field == startTerm.Field)) // term field names are interned
        {
            if (te.DocFreq() > 1)
            {
                // unset potential duplicates
                int lastDoc = -1;
                TermDocs td = reader.TermDocs(currTerm);
                td.Next();
                if (keepMode == KM_USE_FIRST_OCCURRENCE)
                {
                    td.Next();
                }
                do
                {
                    lastDoc = td.Doc;
                    bits.Clear(lastDoc);
                } while (td.Next());
                if (keepMode == KM_USE_LAST_OCCURRENCE)
                {
                    // restore the last bit
                    bits.Set(lastDoc);
                }
            }
            if (!te.Next())
            {
                break;
            }
            currTerm = te.Term;
        }
    }
    return bits;
}
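// FastBits() above is the core of the contrib DuplicateFilter; a hedged usage
// sketch, assuming that class and an existing "searcher"/"query" pair (both
// hypothetical here): keep only the last occurrence of each "id" value.
Filter dedupe = new DuplicateFilter("id",
                                    DuplicateFilter.KM_USE_LAST_OCCURRENCE,
                                    DuplicateFilter.PM_FULL_VALIDATION);
TopDocs hits = searcher.Search(query, dedupe, 100);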
/*
 * Automatically adds stop words for the given field, using terms whose
 * document frequency exceeds maxDocFreq.
 *
 * @param reader The {@link IndexReader} which will be consulted to identify
 * potential stop words that exceed the required document frequency
 * @param fieldName The field for which stopwords will be added
 * @param maxDocFreq The maximum number of index documents which can contain a
 * term, after which the term is considered to be a stop word
 * @return The number of stop words identified.
 * @throws IOException
 */
public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
{
    var stopWords = Support.Compatibility.SetFactory.CreateHashSet<string>();
    String internedFieldName = StringHelper.Intern(fieldName);
    TermEnum te = reader.Terms(new Term(fieldName));
    Term term = te.Term;
    while (term != null)
    {
        if (term.Field != internedFieldName)
        {
            break;
        }
        if (te.DocFreq() > maxDocFreq)
        {
            stopWords.Add(term.Text);
        }
        if (!te.Next())
        {
            break;
        }
        term = te.Term;
    }
    stopWordsPerField.Add(fieldName, stopWords);

    /* if the stopwords for a field are changed,
     * then saved streams for that field are erased.
     */
    IDictionary<String, SavedStreams> streamMap = (IDictionary<String, SavedStreams>)PreviousTokenStream;
    if (streamMap != null)
    {
        streamMap.Remove(fieldName);
    }
    return stopWords.Count;
}
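// This method belongs to the contrib QueryAutoStopWordAnalyzer; a minimal
// usage sketch, assuming Lucene.Net 3.0.x. The index path, "body" field, and
// the 500-document threshold are illustrative values only.
using (IndexReader reader = IndexReader.Open(
           FSDirectory.Open(new System.IO.DirectoryInfo("/path/to/index")), true))
{
    var analyzer = new QueryAutoStopWordAnalyzer(
        Lucene.Net.Util.Version.LUCENE_30,
        new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
    int added = analyzer.AddStopWords(reader, "body", 500);
}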
private void AddTerms(IndexReader reader, FieldVals f)
{
    if (f.queryString == null)
    {
        return;
    }
    TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
    TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

    int corpusNumDocs = reader.NumDocs();
    Term internSavingTemplateTerm = new Term(f.fieldName); // optimization to avoid constructing new Term() objects
    Hashtable processedTerms = new Hashtable();
    while (ts.IncrementToken())
    {
        String term = termAtt.Term();
        if (!processedTerms.Contains(term))
        {
            processedTerms.Add(term, term);
            ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); // max variants considered for any one term
            float minScore = 0;
            Term startTerm = internSavingTemplateTerm.CreateTerm(term);
            FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
            TermEnum origEnum = reader.Terms(startTerm);
            int df = 0;
            if (startTerm.Equals(origEnum.Term()))
            {
                df = origEnum.DocFreq(); // store the df so all variants use the same idf
            }
            int numVariants = 0;
            int totalVariantDocFreqs = 0;
            do
            {
                Term possibleMatch = fe.Term();
                if (possibleMatch != null)
                {
                    numVariants++;
                    totalVariantDocFreqs += fe.DocFreq();
                    float score = fe.Difference();
                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
                    {
                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
                        variantsQ.Insert(st);
                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
                    }
                }
            } while (fe.Next());
            if (numVariants > 0)
            {
                int avgDf = totalVariantDocFreqs / numVariants;
                if (df == 0) // no direct match we can use as df for all variants
                {
                    df = avgDf; // use avg df of all variants
                }
                // take the top variants (scored by edit distance) and reset the
                // score to include an IDF factor, then add to the global queue
                // for ranking overall top query terms
                int size = variantsQ.Size();
                for (int i = 0; i < size; i++)
                {
                    ScoreTerm st = (ScoreTerm)variantsQ.Pop();
                    st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
                    q.Insert(st);
                }
            }
        }
    }
}
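// AddTerms() above is a private helper of the contrib FuzzyLikeThisQuery; a
// hedged sketch of the public API around it, with "searcher" assumed to exist
// and all literals (query text, field name, similarity, prefix) illustrative.
var flt = new FuzzyLikeThisQuery(32, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30));
flt.AddTerms("smith", "authorName", 0.7f, 2); // queryString, fieldName, minSimilarity, prefixLength
TopDocs results = searcher.Search(flt, 10);   // Rewrite() (and thus AddTerms) runs during search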
public static TermInfo[] GetHighFreqTerms(Directory dir, Hashtable junkWords, int numTerms, String[] fields)
{
    if (dir == null || fields == null)
    {
        return new TermInfo[0];
    }
    IndexReader reader = IndexReader.Open(dir, true);
    TermInfoQueue tiq = new TermInfoQueue(numTerms);
    TermEnum terms = reader.Terms();
    int minFreq = 0;
    while (terms.Next())
    {
        String field = terms.Term().Field();
        if (fields != null && fields.Length > 0)
        {
            bool skip = true;
            for (int i = 0; i < fields.Length; i++)
            {
                if (field.Equals(fields[i]))
                {
                    skip = false;
                    break;
                }
            }
            if (skip)
            {
                continue;
            }
        }
        if (junkWords != null && junkWords[terms.Term().Text()] != null)
        {
            continue;
        }
        if (terms.DocFreq() > minFreq)
        {
            TermInfo top = (TermInfo)tiq.Add(new TermInfo(terms.Term(), terms.DocFreq()));
            if (tiq.Size() >= numTerms) // if tiq is overfull
            {
                tiq.Pop();             // remove the lowest entry
                minFreq = top.DocFreq; // reset minFreq
            }
        }
    }
    TermInfo[] res = new TermInfo[tiq.Size()];
    for (int i = 0; i < res.Length; i++)
    {
        res[res.Length - i - 1] = (TermInfo)tiq.Pop();
    }
    reader.Close();
    return res;
}
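// A hedged calling sketch for the helper above; the index path, junk words,
// and field list are placeholders, and TermInfo is assumed to expose a Term
// property alongside the DocFreq used in the method body.
var junkWords = new Hashtable { { "the", "the" }, { "and", "and" } };
TermInfo[] top = GetHighFreqTerms(
    FSDirectory.Open(new System.IO.DirectoryInfo("/path/to/index")),
    junkWords,
    25,                             // keep the 25 highest-frequency terms
    new String[] { "title", "body" });
foreach (TermInfo ti in top)
{
    Console.WriteLine("{0} df={1}", ti.Term, ti.DocFreq);
}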