Beispiel #1
0
        /// <summary>
        /// Adds an entry to the inverted index, registering it under each of its query terms.
        /// </summary>
        /// <remarks>
        /// A posting set is created the first time a term is seen. Adding an entry that is already
        /// present in a term's set leaves that set unchanged, because <c>HashSet.Add</c> ignores duplicates.
        /// </remarks>
        /// <param name="indexEntry">The entry to index; its <c>QueryTerms</c> supply the index keys.</param>
        public void AddBaseIndexEntry(BaseIndexEntry indexEntry)
        {
            foreach (string term in indexEntry.QueryTerms)
            {
                // Single lookup per term: fetch the posting set, creating it on first sight of the term
                HashSet<BaseIndexEntry> postings;
                if (!_entries.TryGetValue(term, out postings))
                {
                    postings = new HashSet<BaseIndexEntry>();
                    _entries[term] = postings;
                }

                // No Contains() pre-check needed: Add() is a no-op when the entry is already in the set
                postings.Add(indexEntry);
            }
        }
        /// <summary>
        /// Scores a candidate completion by feeding its NTB feature vector to the regression model (<c>_lrModel</c>).
        /// </summary>
        /// <param name="indexEntry">The candidate completion whose features are built.</param>
        /// <param name="partialQuery">The prefix, passed through to <c>BuildFeaturePackage</c>.</param>
        /// <returns>The value returned by the model for the candidate's NTB features.</returns>
        private double MakePrediction(BaseIndexEntry indexEntry, string partialQuery)
        {
            // Only the NTB features of the package are needed by the model
            return _lrModel.MakePrediction(BuildFeaturePackage(indexEntry, partialQuery).NtbFeatures);
        }
        /// <summary>
        /// Builds the feature package for one candidate query: one feature per NTB, holding the
        /// candidate's frequency in that NTB's bucket for the given prefix. The target likelihood is not set here.
        /// </summary>
        /// <param name="prefixEntry">The candidate completion; its <c>Query</c> is looked up in each bucket.</param>
        /// <param name="partialQuery">The prefix used to select the bucket within each NTB.</param>
        /// <param name="includeQueryCountsFeature">
        /// Accepted for compatibility but currently has no effect: the query-counts feature is not implemented.
        /// </param>
        /// <returns>A package carrying the candidate's query and its raw (un-normalised) per-NTB frequencies.</returns>
        private FeaturePackage BuildFeaturePackage(BaseIndexEntry prefixEntry, string partialQuery, bool includeQueryCountsFeature = false)
        {
            int bucketCount = _ntbs.Length;

            FeaturePackage package = new FeaturePackage
            {
                Query = prefixEntry.Query,
                NtbFeatures = new double[bucketCount]
            };

            for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++)
            {
                // Raw frequency is used as the feature; no normalisation is applied for now
                double frequencyInBucket = _ntbs[bucketIndex][partialQuery].GetQueryFrequency(prefixEntry.Query);
                package.NtbFeatures[bucketIndex] = frequencyInBucket;
            }

            return package;
        }
        /// <summary>
        /// Builds the ranked auto-completion list for a prefix, then records the submitted query
        /// in the per-prefix buckets.
        /// </summary>
        /// <remarks>
        /// The steps run in a fixed order: per-prefix state is created if missing, any training that
        /// is due is performed, candidates are scored from the buckets as they stood before this
        /// query, and only then is <paramref name="fullQuery"/> added to the buckets.
        /// </remarks>
        /// <param name="queryTime">Time of the query; not used by this implementation.</param>
        /// <param name="partialQuery">The typed prefix; the key for all per-prefix state.</param>
        /// <param name="fullQuery">The full query, added to the buckets after the suggestions are built.</param>
        /// <returns>The list produced by <c>CreateAutoCompletionList</c> from the scored candidates.</returns>
        protected override AutoCompletionList AutoCompleteQuery(DateTime queryTime, string partialQuery, string fullQuery)
        {
            // Global count of queries seen; stamped onto training packages when they are used
            _currentQueryCount++;

            EnsurePrefixState(partialQuery);
            TrainIfDueForPrefix(partialQuery);

            // Score and rank before the new query is recorded
            List<BaseIndexEntry> scoredCompletions = ScoreCompletionsForPrefix(partialQuery);
            AutoCompletionList autoCompletionListOutput = CreateAutoCompletionList(scoredCompletions);

            RecordQueryForPrefix(partialQuery, fullQuery);

            return autoCompletionListOutput;
        }

        /// <summary>
        /// Creates the buckets and the observed-query counter for a prefix the first time it is seen.
        /// </summary>
        /// <param name="partialQuery">The prefix whose state must exist.</param>
        private void EnsurePrefixState(string partialQuery)
        {
            if (!_ntbsCreated.Contains(partialQuery))
            {
                for (int i = 0; i < _multipleNs.Length; i++)
                {
                    // Create a fresh bucket for this prefix in NTB i
                    _ntbs[i][partialQuery] = new NonTemporalBucket<BaseIndexEntry>(_multipleNs[i], _multipleMaxSingleQueryN[i]);

                    // Chain the buckets: a query removed from bucket i-1 is handed to bucket i
                    if (i > 0)
                    {
                        _ntbs[i - 1][partialQuery].OnQueryRemovedFromNTB += _ntbs[i][partialQuery].AddQueryEvent;
                    }
                }

                // Overall bucket, sized to the total across all NTBs
                _overallNtb[partialQuery] = new NonTemporalBucket<BaseIndexEntry>(_totalQueriesAcrossAllNTBs, _totalQueriesAcrossAllNTBs);

                // Base bucket, used for scoring while the overall bucket is still filling
                _baseNtb[partialQuery] = new NonTemporalBucket<BaseIndexEntry>(_baseNtbSize, _baseNtbSize);

                // Marked last, so a failure above leaves the prefix unmarked
                _ntbsCreated.Add(partialQuery);
            }

            // Training bucket sized for the training horizon (in queries)
            if (!_trainingNtb.ContainsKey(partialQuery))
            {
                _trainingNtb[partialQuery] = new NonTemporalBucket<BaseIndexEntry>(_trainAfterNQueriesForPrefix, _trainAfterNQueriesForPrefix);
            }

            // Counter of queries observed since the last training package was built
            if (!_queriesObservedForPrefix.ContainsKey(partialQuery))
            {
                _queriesObservedForPrefix[partialQuery] = 0;
            }
        }

        /// <summary>
        /// When the prefix has observed enough queries, trains on the pending training package
        /// (if any), builds the next package and resets the observed-query counter.
        /// </summary>
        /// <param name="partialQuery">The prefix to check; its counter must already exist.</param>
        private void TrainIfDueForPrefix(string partialQuery)
        {
            bool hasStartedTraining = _prefixHasStartedTraining.Contains(partialQuery);

            // The first package waits until the biggest NTB is full; later ones use the training horizon
            bool trainingDue = hasStartedTraining
                ? _queriesObservedForPrefix[partialQuery] == _trainAfterNQueriesForPrefix
                : _queriesObservedForPrefix[partialQuery] == _totalQueriesAcrossAllNTBs;

            if (!trainingDue)
            {
                return;
            }

            // Apply the package built at the previous trigger, if there is one
            TrainingPackage trainingPackage;
            if (_prefixTrainingPackages.TryGetValue(partialQuery, out trainingPackage))
            {
                trainingPackage.QueryCountAtTrain = _currentQueryCount;

                // Set each query's target from its frequency in the training bucket;
                // queries with a zero count keep whatever target they already had
                foreach (FeaturePackage queryFeaturePackage in trainingPackage.TrainingPackageQueries)
                {
                    double queryCount = _trainingNtb[partialQuery].GetQueryFrequency(queryFeaturePackage.Query);
                    if (queryCount > 0)
                    {
                        queryFeaturePackage.TargetLikelihood = queryCount; // raw count, not normalised
                    }
                }

                UseTrainingPackage(trainingPackage);
                _firstTrainingHasHappened = true;
                _trainCount++;
            }

            // Build the next package from the current bucket contents
            // NOTE(review): assumes UseTrainingPackage does not modify _prefixHasStartedTraining - confirm
            _prefixTrainingPackages[partialQuery] = BuildTrainingPackage(partialQuery, !hasStartedTraining);
            _prefixHasStartedTraining.Add(partialQuery);

            // Start counting towards the next training trigger
            _queriesObservedForPrefix[partialQuery] = 0;
        }

        /// <summary>
        /// Scores every sufficiently frequent query in the prefix's overall bucket.
        /// </summary>
        /// <remarks>
        /// Candidates whose <c>QueryLogFrequency</c> is below 2 are skipped. When the overall bucket is
        /// full the score is 2.0 plus the model prediction, rounded to 5 decimals; otherwise it is the
        /// candidate's frequency in the base bucket, and candidates scoring below 2 are dropped.
        /// </remarks>
        /// <param name="partialQuery">The prefix whose buckets are read.</param>
        /// <returns>New entries carrying each kept candidate's query and score.</returns>
        private List<BaseIndexEntry> ScoreCompletionsForPrefix(string partialQuery)
        {
            NonTemporalBucket<BaseIndexEntry> overallNtb = _overallNtb[partialQuery];

            // Predictions are only used once the overall bucket is full. Checked once up front:
            // the scoring loop below does not add to or remove from the bucket.
            // NOTE(review): assumes these two properties are side-effect free - confirm
            bool overallNtbIsFull = overallNtb.TotalQueriesInBucket == overallNtb.QMaxSum;

            List<BaseIndexEntry> outputPrefixEntries = new List<BaseIndexEntry>();

            foreach (BaseIndexEntry prefixEntry in overallNtb.AllBucketQueries)
            {
                // Ignore low frequency completions - increases speed and removes junk
                if (prefixEntry.QueryLogFrequency < 2)
                {
                    continue;
                }

                BaseIndexEntry outputIndexEntry = new BaseIndexEntry();
                outputIndexEntry.Query = prefixEntry.Query;

                if (overallNtbIsFull)
                {
                    // Rank by predicted value, offset by 2.0
                    // NOTE(review): the offset presumably keeps predictions above the frequency cut-off of 2 - confirm
                    outputIndexEntry.QueryLogFrequency = Math.Round(2.0 + MakePrediction(prefixEntry, partialQuery), 5);
                }
                else
                {
                    // Fall back to the frequency in the base bucket
                    outputIndexEntry.QueryLogFrequency = _baseNtb[partialQuery].GetQueryFrequency(outputIndexEntry.Query);
                    if (outputIndexEntry.QueryLogFrequency < 2)
                    {
                        continue;
                    }
                }

                outputPrefixEntries.Add(outputIndexEntry);
            }

            return outputPrefixEntries;
        }

        /// <summary>
        /// Counts the query towards the prefix's next training trigger and adds it to the prefix's buckets.
        /// </summary>
        /// <param name="partialQuery">The prefix; its counter and buckets must already exist.</param>
        /// <param name="fullQuery">The full query to record.</param>
        private void RecordQueryForPrefix(string partialQuery, string fullQuery)
        {
            _queriesObservedForPrefix[partialQuery] += 1;

            // Only the first NTB is fed directly; the later ones receive queries through the removal event chain
            _ntbs[0][partialQuery].AddQuery(fullQuery, null);
            _overallNtb[partialQuery].AddQuery(fullQuery, null);
            _baseNtb[partialQuery].AddQuery(fullQuery, null);

            // The training bucket supplies the target counts at the next training trigger
            _trainingNtb[partialQuery].AddQuery(fullQuery, null);
        }