Esempio n. 1
0
        /// <summary> Find words for a more-like-this query former.
        /// For every name in <c>fieldNames</c> the term vector of the document is used when
        /// one is available; otherwise the stored values of that field are read and fed
        /// through the reader-based AddTermFrequencies overload.
        /// </summary>
        /// <param name="docNum">the id of the lucene document from which to find terms
        /// </param>
        /// <returns> the queue built by CreateQueue from the collected term frequencies
        /// </returns>
        private PriorityQueue RetrieveTerms(int docNum)
        {
            System.Collections.IDictionary termFreqMap = new System.Collections.Hashtable();

            // Stored document, loaded on first use only: several fields may lack
            // term vectors and there is no need to fetch the same document again for each.
            Document storedDoc = null;

            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String  fieldName = fieldNames[i];
                TermFreqVector vector    = ir.GetTermFreqVector(docNum, fieldName);

                if (vector != null)
                {
                    AddTermFrequencies(termFreqMap, vector);
                    continue;
                }

                // field does not store term vector info: fall back to the stored values
                if (storedDoc == null)
                {
                    storedDoc = ir.Document(docNum);
                }

                System.String[] text = storedDoc.GetValues(fieldName);
                if (text != null)
                {
                    for (int j = 0; j < text.Length; j++)
                    {
                        AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
                    }
                }
            }

            return CreateQueue(termFreqMap);
        }
Esempio n. 2
0
        /// <summary> Merges the terms of a term vector, together with their frequencies,
        /// into termFreqMap. Terms for which IsNoiseWord returns true are skipped.
        /// </summary>
        /// <param name="termFreqMap">map from term to its accumulated frequency counter
        /// </param>
        /// <param name="vector">terms and their frequencies for a doc/field
        /// </param>
        private void AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
        {
            System.String[] vectorTerms = vector.GetTerms();
            int[]           vectorFreqs = vector.GetTermFrequencies();

            for (int idx = 0; idx < vectorTerms.Length; idx++)
            {
                System.String word = vectorTerms[idx];

                if (!IsNoiseWord(word))
                {
                    Int counter = (Int)termFreqMap[word];
                    if (counter != null)
                    {
                        // term already seen: accumulate
                        counter.x += vectorFreqs[idx];
                    }
                    else
                    {
                        // first occurrence: register a fresh counter, then set its value
                        counter = new Int();
                        termFreqMap[word] = counter;
                        counter.x = vectorFreqs[idx];
                    }
                }
            }
        }
        /// <summary> Reads the term vectors of every document of <c>reader</c>, once through
        /// GetTermFreqVectors(docId) and once through GetTermFreqVector(docId, "field"),
        /// passes each result to VerifyVectors and adds the milliseconds spent in the two
        /// lookups to <c>timeElapsed</c>.
        /// </summary>
        private void  TestTermVectors()
        {
            // check:
            int  numDocs = reader.NumDocs();
            long start   = 0L;

            for (int docId = 0; docId < numDocs; docId++)
            {
                // UTC is used for the timestamps: local time can jump on DST or
                // time-zone changes and would then distort the measured interval.
                start = (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond);
                TermFreqVector[] vectors = reader.GetTermFreqVectors(docId);
                timeElapsed += (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond) - start;

                // verify vectors result
                VerifyVectors(vectors, docId);

                start = (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond);
                TermFreqVector vector = reader.GetTermFreqVector(docId, "field");
                timeElapsed += (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond) - start;

                // wrap the single vector so the same verification routine can be reused
                vectors    = new TermFreqVector[1];
                vectors[0] = vector;

                VerifyVectors(vectors, docId);
            }
        }
Esempio n. 4
0
        /// <summary> Builds a token stream from the term position vector stored for a field of a document.
        /// </summary>
        /// <param name="reader">index reader used to look up the term vector
        /// </param>
        /// <param name="docId">id of the document
        /// </param>
        /// <param name="field">name of the field whose term vector is read
        /// </param>
        /// <returns> the result of GetTokenStream(TermPositionVector) for the stored vector
        /// </returns>
        /// <exception cref="System.ArgumentException">no term vector is stored for the field,
        /// or the stored vector is not a TermPositionVector
        /// </exception>
        public static TokenStream GetTokenStream(IndexReader reader, int docId, System.String field)
        {
            TermFreqVector tfv = (TermFreqVector)reader.GetTermFreqVector(docId, field);

            // A single lookup is enough: "as" yields null both when nothing is stored
            // and when the stored vector carries no position data.
            TermPositionVector tpv = tfv as TermPositionVector;
            if (tpv == null)
            {
                throw new System.ArgumentException(field + " in doc #" + docId + " does not have any term position data stored");
            }

            return GetTokenStream(tpv);
        }
Esempio n. 5
0
        /// <summary> A convenience method that tries a number of approaches to getting a token stream.
        /// A stored term position vector is used when the field has one; otherwise (or when
        /// that yields no stream) the analyzer-based GetTokenStream overload is used.
        /// </summary>
        /// <param name="reader">index reader used to look up the term vector
        /// </param>
        /// <param name="docId">id of the document
        /// </param>
        /// <param name="field">name of the field to get a token stream for
        /// </param>
        /// <param name="analyzer">analyzer handed to the fallback overload
        /// </param>
        /// <returns> the token stream; per the original notes, null if field not stored correctly
        /// </returns>
        public static TokenStream GetAnyTokenStream(IndexReader reader, int docId, System.String field, Analyzer analyzer)
        {
            TermFreqVector     storedVector = (TermFreqVector)reader.GetTermFreqVector(docId, field);
            TermPositionVector positional   = storedVector as TermPositionVector;

            TokenStream stream = (positional != null) ? GetTokenStream(positional) : null;
            if (stream != null)
            {
                return stream;
            }

            // No token info stored so fall back to analyzing raw content
            return GetTokenStream(reader, docId, field, analyzer);
        }
Esempio n. 6
0
        /// <summary> Folds every non-noise term of the given term vector into termFreqMap,
        /// adding the term's frequency to the counter kept for it.
        /// </summary>
        /// <param name="termFreqMap">a Map of terms and their frequency counters
        /// </param>
        /// <param name="vector">List of terms and their frequencies for a doc/field
        /// </param>
        private void  AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
        {
            System.String[] termTexts = vector.GetTerms();
            int[] termCounts = vector.GetTermFrequencies();

            int position = 0;
            while (position < termTexts.Length)
            {
                System.String current = termTexts[position];

                if (!IsNoiseWord(current))
                {
                    Int tally = (Int) termFreqMap[current];
                    bool firstSighting = (tally == null);
                    if (firstSighting)
                    {
                        // store the new counter before giving it its initial value
                        tally = new Int();
                        termFreqMap[current] = tally;
                        tally.x = termCounts[position];
                    }
                    else
                    {
                        tally.x += termCounts[position];
                    }
                }

                position++;
            }
        }
		/// <summary> Concatenates the terms of all given vectors and compares the result with
		/// English.IntToEnglish(num); a mismatch is reported on standard output.
		/// </summary>
		/// <param name="vectors">term vectors whose terms are concatenated in order
		/// </param>
		/// <param name="num">number whose English form is the expected text
		/// </param>
		private void  VerifyVectors(TermFreqVector[] vectors, int num)
		{
			System.Text.StringBuilder joined = new System.Text.StringBuilder();
			foreach (TermFreqVector current in vectors)
			{
				foreach (System.String term in current.GetTerms())
				{
					joined.Append(term);
				}
			}
			
			System.String expected = English.IntToEnglish(num).Trim();
			System.String actual = joined.ToString().Trim();
			if (expected.Equals(actual))
			{
				return;
			}
			
			System.Console.Out.WriteLine("wrong term result");
		}
		/// <summary> For each document of <c>reader</c>, fetches the term vectors both with
		/// GetTermFreqVectors(docId) and with GetTermFreqVector(docId, "field"), hands each
		/// result to VerifyVectors, and accumulates the lookup time in milliseconds in
		/// <c>timeElapsed</c>.
		/// </summary>
		private void  TestTermVectors()
		{
			// check:
			int numDocs = reader.NumDocs();
			long start = 0L;
			for (int docId = 0; docId < numDocs; docId++)
			{
				// Timestamps are taken in UTC so that a DST or time-zone shift of the
				// local clock cannot distort the measured interval.
				start = (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond);
				TermFreqVector[] vectors = reader.GetTermFreqVectors(docId);
				timeElapsed += (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond) - start;
				
				// verify vectors result
				VerifyVectors(vectors, docId);
				
				start = (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond);
				TermFreqVector vector = reader.GetTermFreqVector(docId, "field");
				timeElapsed += (DateTime.UtcNow.Ticks / TimeSpan.TicksPerMillisecond) - start;
				
				// wrap the single vector in an array to reuse the same verification
				vectors = new TermFreqVector[1];
				vectors[0] = vector;
				
				VerifyVectors(vectors, docId);
			}
		}