/// <summary> Find words for a more-like-this query former.</summary>
/// <param name="docNum">the id of the lucene document from which to find terms</param>
/// <returns> a priority queue of the document's terms ordered by score </returns>
private PriorityQueue RetrieveTerms(int docNum)
{
    System.Collections.IDictionary termFreqMap = new System.Collections.Hashtable();
    foreach (System.String fieldName in fieldNames)
    {
        TermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);
        if (vector != null)
        {
            // Term vector is stored: read term frequencies straight from it.
            AddTermFrequencies(termFreqMap, vector);
        }
        else
        {
            // No term vector for this field: fall back to re-analyzing the stored field values.
            Document d = ir.Document(docNum);
            System.String[] text = d.GetValues(fieldName);
            if (text != null)
            {
                foreach (System.String value in text)
                {
                    AddTermFrequencies(new System.IO.StringReader(value), termFreqMap, fieldName);
                }
            }
        }
    }
    return CreateQueue(termFreqMap);
}
/// <summary> Adds terms and frequencies found in vector into the Map termFreqMap</summary>
/// <param name="termFreqMap">a Map of terms and their frequencies</param>
/// <param name="vector">List of terms and their frequencies for a doc/field</param>
private void AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
{
    System.String[] terms = vector.GetTerms();
    int[] freqs = vector.GetTermFrequencies();
    for (int i = 0; i < terms.Length; i++)
    {
        System.String term = terms[i];
        if (IsNoiseWord(term))
        {
            continue; // skip stop words / terms outside the configured length bounds
        }
        // Accumulate the term's frequency, creating a counter on first sight.
        Int cnt = (Int) termFreqMap[term];
        if (cnt == null)
        {
            cnt = new Int();
            cnt.x = freqs[i];
            termFreqMap[term] = cnt;
        }
        else
        {
            cnt.x += freqs[i];
        }
    }
}
/// <summary>
/// Times retrieval of term vectors for every document in the index
/// (accumulating into timeElapsed) and verifies the vectors' contents.
/// </summary>
private void TestTermVectors()
{
    // check:
    int docCount = reader.NumDocs();
    long before = 0L;
    for (int doc = 0; doc < docCount; doc++)
    {
        // Time the multi-field term vector fetch.
        before = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
        TermFreqVector[] vectors = reader.GetTermFreqVectors(doc);
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - before;

        // verify vectors result
        VerifyVectors(vectors, doc);

        // Time the single-field fetch and verify it the same way.
        before = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
        TermFreqVector vector = reader.GetTermFreqVector(doc, "field");
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - before;

        vectors = new TermFreqVector[] { vector };
        VerifyVectors(vectors, doc);
    }
}
/// <summary>
/// Rebuilds a TokenStream from the stored term position vector of the given
/// document field.
/// </summary>
/// <param name="reader">index reader to fetch the term vector from</param>
/// <param name="docId">document id within the reader</param>
/// <param name="field">field whose term position vector is read</param>
/// <returns> a TokenStream built from the field's term position data </returns>
/// <exception cref="System.ArgumentException">
/// if the field has no term vector, or has one without position data
/// </exception>
public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
{
    TermFreqVector tfv = reader.GetTermFreqVector(docId, field);
    // Only a TermPositionVector carries the position information needed to
    // reconstruct a token stream; a plain TermFreqVector (or null) is unusable.
    // Reuse the vector already fetched instead of hitting the reader a second time.
    if (tfv is TermPositionVector)
    {
        return GetTokenStream((TermPositionVector) tfv);
    }
    // Fix: the original message lacked a space after the doc id ("doc #5does not ...").
    throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
}
/// <summary> A convenience method that tries a number of approaches to getting a token stream.
/// The cost of finding there are no termVectors in the index is minimal (1000 invocations still
/// registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
/// </summary>
/// <param name="reader">index reader to fetch term vectors / stored content from</param>
/// <param name="docId">document id within the reader</param>
/// <param name="field">field to build the token stream for</param>
/// <param name="analyzer">analyzer used for the fallback re-analysis of raw content</param>
/// <returns> null if field not stored correctly </returns>
/// <throws> IOException </throws>
public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer)
{
    TokenStream ts = null;
    TermFreqVector tfv = (TermFreqVector) reader.GetTermFreqVector(docId, field);
    // `is` is null-safe, so a separate null check is unnecessary.
    if (tfv is TermPositionVector)
    {
        ts = GetTokenStream((TermPositionVector) tfv);
    }
    // No token info stored (or position-based reconstruction yielded nothing):
    // fall back to analyzing the raw stored content.
    if (ts == null)
    {
        ts = GetTokenStream(reader, docId, field, analyzer);
    }
    return ts;
}
/// <summary> Adds terms and frequencies found in vector into the Map termFreqMap</summary>
/// <param name="termFreqMap">a Map of terms and their frequencies</param>
/// <param name="vector">List of terms and their frequencies for a doc/field</param>
private void AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
{
    System.String[] terms = vector.GetTerms();
    int[] freqs = vector.GetTermFrequencies();
    for (int idx = 0; idx < terms.Length; idx++)
    {
        System.String term = terms[idx];
        // Only count terms that pass the noise-word filter.
        if (!IsNoiseWord(term))
        {
            // Increment the term's frequency, creating a counter on first sight.
            Int cnt = (Int) termFreqMap[term];
            if (cnt == null)
            {
                cnt = new Int();
                termFreqMap[term] = cnt;
                cnt.x = freqs[idx];
            }
            else
            {
                cnt.x += freqs[idx];
            }
        }
    }
}
/// <summary>
/// Checks that the concatenated terms of all vectors match the English
/// spelling of the document number; prints a diagnostic on mismatch.
/// </summary>
/// <param name="vectors">term vectors to check</param>
/// <param name="num">document number whose English form is the expected text</param>
private void VerifyVectors(TermFreqVector[] vectors, int num)
{
    System.Text.StringBuilder temp = new System.Text.StringBuilder();
    foreach (TermFreqVector vector in vectors)
    {
        System.String[] terms = vector.GetTerms();
        foreach (System.String term in terms)
        {
            temp.Append(term);
        }
    }
    if (!English.IntToEnglish(num).Trim().Equals(temp.ToString().Trim()))
    {
        System.Console.Out.WriteLine("wrong term result");
    }
}
/// <summary>
/// Times retrieval of term vectors for every document in the index
/// (accumulating into timeElapsed) and verifies the vectors' contents.
/// </summary>
private void TestTermVectors()
{
    // check:
    int totalDocs = reader.NumDocs();
    long stamp = 0L;
    for (int id = 0; id < totalDocs; id++)
    {
        // Time the multi-field term vector fetch.
        stamp = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
        TermFreqVector[] vectors = reader.GetTermFreqVectors(id);
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - stamp;

        // verify vectors result
        VerifyVectors(vectors, id);

        // Time the single-field fetch and verify it the same way.
        stamp = (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond);
        TermFreqVector single = reader.GetTermFreqVector(id, "field");
        timeElapsed += (DateTime.Now.Ticks / TimeSpan.TicksPerMillisecond) - stamp;

        vectors = new TermFreqVector[] { single };
        VerifyVectors(vectors, id);
    }
}