/// <summary>
        /// Runs the term-weight kernel selected by <c>weightKernelName</c> over the supplied lemmas
        /// and then adds every lemma to the table.
        /// </summary>
        /// <param name="table">Table that receives the lemmas; its <c>name</c> is used in log messages.</param>
        /// <param name="logger">Receives retry, abandonment and summary messages.</param>
        /// <param name="forSingleWebSite">Forwarded unchanged to the kernel task.</param>
        /// <param name="lemmas">Lemmas handed to the kernel task and afterwards added to <paramref name="table"/>.</param>
        /// <returns>The same <paramref name="table"/> instance that was passed in.</returns>
        /// <exception cref="aceGeneralException">
        /// Thrown when the number of failed <c>Add()</c> calls, summed over all lemmas, exceeds <c>retry_global_limit</c>.
        /// </exception>
        public virtual webLemmaTermTable recompute(webLemmaTermTable table, ILogBuilder logger, Boolean forSingleWebSite, List <webLemmaTerm> lemmas)
        {
            kernelComputeWeightTask kernelTask = new kernelComputeWeightTask(lemmas, logger, forSingleWebSite, settings);

            ITermWeightKernel kernel = kernelManager.GetKernel(weightKernelName);

            kernel.compute(kernelTask);

            // SAVING THE RESULTS
            // Two failure budgets are tracked: [retry] is reset for every lemma, [globalRetry] is shared by the whole call.
            Int32 globalRetry = retry_global_limit;
            foreach (webLemmaTerm lemma in lemmas)
            {
                Int32 retry = retry_limit;
                while (retry > 0)
                {
                    try
                    {
                        table.Add(lemma);
                        retry = 0; // success: leave the retry loop
                    }
                    catch (Exception ex)
                    {
                        retry--;
                        globalRetry--;

                        if (doBeep)
                        {
                            logger.log("WFT [" + table.name + "] add lemma [" + lemma.name + "]  retries left [" + retry + "] global[" + globalRetry + "]");
                            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);
                        }

                        // Fail fast: once the shared budget is exhausted there is no point in waiting before giving up.
                        if (globalRetry < 0)
                        {
                            throw new aceGeneralException("Permanent Add() lemma problem at [" + table.name + "]", ex, this, "Permanent Lemma TF-IDF Add(Lemma) failure");
                        }

                        // The lemma is about to be skipped; make the loss visible even when beeping is disabled.
                        if (retry <= 0)
                        {
                            logger.log("WFT [" + table.name + "] add lemma [" + lemma.name + "] abandoned after [" + retry_limit + "] failed attempts");
                        }

                        // Back off before the next attempt (for this lemma or for the following one).
                        Thread.Sleep(250);
                    }
                }
            }

            logger.log("WFT [" + table.name + "] recomputed TFmax[" + kernelTask.weightMax + "] : DFmax[" + kernelTask.documentFrequencyMax + "]  TC[" + lemmas.Count + "]");

            return(table);
        }
예제 #2
0
        /// <summary>
        /// Recomputes the weight of every supplied lemma from its term, document and document-set frequencies,
        /// scales the weights by the largest weight found, and then adds every lemma to the table.
        /// </summary>
        /// <param name="table">Table that receives the lemmas; its <c>name</c> is used in log messages.</param>
        /// <param name="logger">Receives the empty-input, retry, abandonment and summary messages.</param>
        /// <param name="forSingleWebSite">
        /// When <c>true</c>, the document-frequency factor is applied only if <c>settings.doAdjustIDFForCase</c> is set,
        /// and the document-set factor is skipped when the maximum document-set frequency equals 1.
        /// </param>
        /// <param name="lemmas">Lemmas to reweight in place and add to <paramref name="table"/>.</param>
        /// <returns>The same <paramref name="table"/> instance that was passed in.</returns>
        /// <exception cref="aceGeneralException">
        /// Thrown when the number of failed <c>Add()</c> calls, summed over all lemmas, exceeds <c>retry_global_limit</c>.
        /// </exception>
        public virtual webLemmaTermTable recompute(webLemmaTermTable table, ILogBuilder logger, Boolean forSingleWebSite, List <webLemmaTerm> lemmas)
        {
            Double documentSetFrequencyMax = 0;
            Double documentFrequencyMax    = 0;
            Double termFrequencyMax        = 0;

            if (lemmas.Count == 0)
            {
                logger.log("ERROR: NO ENTRIES IN TF-TDF TABLE [" + table.name + "] - is for single web site [" + forSingleWebSite.ToString() + "]");
            }

            // Pass 1: collect the maxima used as reference values below.
            foreach (webLemmaTerm lemma in lemmas)
            {
                documentSetFrequencyMax = Math.Max(documentSetFrequencyMax, lemma.documentSetFrequency);
                documentFrequencyMax    = Math.Max(documentFrequencyMax, lemma.documentFrequency);
                termFrequencyMax        = Math.Max(termFrequencyMax, lemma.termFrequency);
            }

            // The multiplicative factor is skipped only for a single web site with doAdjustIDFForCase switched off;
            // the additive correction is applied in every case.
            if (!forSingleWebSite || settings.doAdjustIDFForCase)
            {
                documentFrequencyMax = (documentFrequencyMax * settings.documentFrequencyMaxFactor) + settings.documentFrequencyMaxCorrection;
            }
            else
            {
                documentFrequencyMax = documentFrequencyMax + settings.documentFrequencyMaxCorrection;
            }

            // Pass 2: compute the non-normalized weights and track the largest one.
            Double weightMax = Double.MinValue;

            foreach (webLemmaTerm lemma in lemmas)
            {
                // GetRatio is defined outside this block; presumably value / argument - confirm its zero handling.
                lemma.termFrequency = lemma.termFrequency.GetRatio(termFrequencyMax);

                if (settings.doUseIDF)
                {
                    if (settings.doUseNaturalLog)
                    {
                        lemma.documentFactor = Math.Log(documentFrequencyMax.GetRatio(lemma.documentFrequency));
                    }
                    else
                    {
                        lemma.documentFactor = Math.Log10(documentFrequencyMax.GetRatio(lemma.documentFrequency));
                    }
                }
                else
                {
                    lemma.documentFactor = 1;
                }

                lemma.weight = lemma.termFrequency * lemma.documentFactor;

                if (settings.doUseDocumentSet)
                {
                    if ((documentSetFrequencyMax != 1) || !forSingleWebSite)
                    {
                        if (lemma.documentSetFrequency == 0)
                        {
                            // Also avoids dividing by zero in the factor below.
                            lemma.weight = 0;
                        }
                        else
                        {
                            Double docSetFactor = (1 - Math.Log10(documentSetFrequencyMax / lemma.documentSetFrequency));
                            lemma.weight = lemma.weight * docSetFactor;
                        }
                    }
                }

                weightMax = Math.Max(weightMax, lemma.weight);
            }

            // Pass 3: scale every weight by the largest weight.
            foreach (webLemmaTerm lemma in lemmas)
            {
                lemma.weight = lemma.weight.GetRatio(weightMax);
            }

            // SAVING THE RESULTS
            // Two failure budgets are tracked: [retry] is reset for every lemma, [globalRetry] is shared by the whole call.
            Int32 globalRetry = retry_global_limit;

            foreach (webLemmaTerm lemma in lemmas)
            {
                Int32 retry = retry_limit;
                while (retry > 0)
                {
                    try
                    {
                        table.Add(lemma);
                        retry = 0; // success: leave the retry loop
                    }
                    catch (Exception ex)
                    {
                        retry--;
                        globalRetry--;

                        if (doBeep)
                        {
                            logger.log("WFT [" + table.name + "] add lemma [" + lemma.name + "]  retries left [" + retry + "] global[" + globalRetry + "]");
                            imbACE.Services.terminal.aceTerminalInput.doBeepViaConsole(1200, 200, 1);
                        }

                        // Fail fast: once the shared budget is exhausted there is no point in waiting before giving up.
                        if (globalRetry < 0)
                        {
                            throw new aceGeneralException("Permanent Add() lemma problem at [" + table.name + "]", ex, this, "Permanent Lemma TF-IDF Add(Lemma) failure");
                        }

                        // The lemma is about to be skipped; make the loss visible even when beeping is disabled.
                        if (retry <= 0)
                        {
                            logger.log("WFT [" + table.name + "] add lemma [" + lemma.name + "] abandoned after [" + retry_limit + "] failed attempts");
                        }

                        // Back off before the next attempt (for this lemma or for the following one).
                        Thread.Sleep(250);
                    }
                }
            }

            logger.log("WFT [" + table.name + "] recomputed TFmax[" + weightMax + "] : DFmax[" + documentFrequencyMax + "]  TC[" + lemmas.Count + "]");

            return(table);
        }