コード例 #1
0
        /// <summary>
        /// Console entry point. Prints the effective configuration, adjusts GC latency and
        /// (optionally) process/thread priority, loads the URL-detector model and then builds
        /// either a single model or a series of models depending on <c>BUILD_MODE</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        private static void Main(string[] args)
        {
            var wasErrors = false;

            try
            {
                #region [.print to console config.]
                Console.WriteLine(Environment.NewLine + "----------------------------------------------");
                Console.WriteLine("USE_BOOST_PRIORITY: '" + USE_BOOST_PRIORITY + "'");
                Console.WriteLine("BUILD_MODE        : '" + BUILD_MODE + "'");
                // Only the settings that are relevant for the selected build mode are echoed.
                switch (BUILD_MODE)
                {
                case BuildModeEnum.single_model:
                    Console.WriteLine("METHOD            : '" + METHOD + "'");
                    Console.WriteLine("NGARMS            : '" + NGARMS + "'");
                    Console.WriteLine("D_PARAM           : '" + D_PARAM + "'");
                    break;

                case BuildModeEnum.all_models_by_method:
                    Console.WriteLine("METHOD            : '" + METHOD + "'");
                    break;
                }
                Console.WriteLine("INPUT_FILES       : '" + string.Join("'; '", INPUT_FILES) + "'");
                Console.WriteLine("INPUT_FOLDER      : '" + INPUT_FOLDER + "'");
                Console.WriteLine("INPUT_ENCODING    : '" + INPUT_ENCODING.WebName + "'");
                Console.WriteLine("OUTPUT_FILE       : '" + OUTPUT_FILE_PATTERN + "'");
                Console.WriteLine("OUTPUT_ENCODING   : '" + OUTPUT_ENCODING.WebName + "'");
                Console.WriteLine("----------------------------------------------" + Environment.NewLine);
                #endregion

                #region [.GC.]
                // Ask for low-latency GC; if the setting does not read back as LowLatency
                // (i.e. the runtime did not apply it), fall back to Batch.
                GCSettings.LatencyMode = GCLatencyMode.LowLatency;
                if (GCSettings.LatencyMode != GCLatencyMode.LowLatency)
                {
                    GCSettings.LatencyMode = GCLatencyMode.Batch;
                }
                #endregion

                #region [.use boost priority.]
                if (USE_BOOST_PRIORITY)
                {
                    // Process is IDisposable; release it as soon as the priority has been applied.
                    using (var currentProcess = Process.GetCurrentProcess())
                    {
                        currentProcess.PriorityClass        = ProcessPriorityClass.RealTime;
                        currentProcess.PriorityBoostEnabled = true;
                    }
                    Thread.CurrentThread.Priority = ThreadPriority.Highest;
                }
                #endregion

                #region [.url-detector.]
                // One model instance is created here and passed to every build below.
                var urlDetectorModel = new UrlDetectorModel(URL_DETECTOR_RESOURCES_XML_FILENAME);
                #endregion

                #region [.build model's.]
                if (BUILD_MODE == BuildModeEnum.single_model)
                {
                    // Single model: a failure here propagates to the outer catch.
                    var bp = new build_params_t()
                    {
                        UrlDetectorModel      = urlDetectorModel,
                        InputFolder           = INPUT_FOLDER,
                        InputFilenames        = INPUT_FILES,
                        Method                = METHOD,
                        Ngrams                = NGARMS,
                        D_param               = D_PARAM,
                        OutputFilenamePattern = OUTPUT_FILE_PATTERN,
                    };
                    var sw = Stopwatch.StartNew();
                    Build(bp);
                    sw.Stop();

                    Console.WriteLine("'" + METHOD + "; " + NGARMS + "; " + D_PARAM + "' - success, elapsed: " + sw.Elapsed);
                }
                else
                {
                    // Either every parameter combination for METHOD, or every combination at all.
                    var tuples = (BUILD_MODE == BuildModeEnum.all_models_by_method)
                                 ? GetProcessParams(METHOD)
                                 : GetProcessParams();

                    #region [.build model's.]
                    var sw_total = Stopwatch.StartNew();
                    foreach (var t in tuples)
                    {
                        var bp = new build_params_t()
                        {
                            UrlDetectorModel      = urlDetectorModel,
                            InputFolder           = INPUT_FOLDER,
                            InputFilenames        = INPUT_FILES,
                            Method                = t.Item1,
                            Ngrams                = t.Item2,
                            D_param               = t.Item3,
                            OutputFilenamePattern = OUTPUT_FILE_PATTERN,
                        };
                        try
                        {
                            var sw = Stopwatch.StartNew();
                            Build(bp);
                            sw.Stop();

                            Console.WriteLine("'" + bp.Method + "; " + bp.Ngrams + "; " + bp.D_param + "' - success, elapsed: " + sw.Elapsed);
                        }
                        catch (Exception ex)
                        {
                            // A failed model must not stop the remaining ones: report it and go on.
                            var previousColor = Console.ForegroundColor;
                            Console.ForegroundColor = ConsoleColor.Red;
                            try
                            {
                                Console.WriteLine("'" + bp.Method + "; " + bp.Ngrams + "; " + bp.D_param + "' - " + ex.GetType() + ": " + ex.Message);
                            }
                            finally
                            {
                                // Restore the colour even if writing the message itself fails.
                                Console.ForegroundColor = previousColor;
                            }
                            wasErrors = true;
                        }
                    }
                    sw_total.Stop();

                    Console.WriteLine("total elapsed: " + sw_total.Elapsed);
                    #endregion
                }
                #endregion
            }
            catch (Exception ex)
            {
                Console.ForegroundColor = ConsoleColor.Red;
                try
                {
                    Console.WriteLine(Environment.NewLine + ex + Environment.NewLine);
                }
                finally
                {
                    // Never leave the console red, even if the write above throws.
                    Console.ResetColor();
                }
                wasErrors = true;
            }

            // The closing banner is always printed, but the process only blocks on ENTER when
            // something failed (presumably so that error output stays visible; a clean run
            // terminates on its own).
            Console.WriteLine(Environment.NewLine + "[.....finita fusking comedy (push ENTER 4 exit).....]");
            if (wasErrors)
            {
                Console.ReadLine();
            }
        }
コード例 #2
0
        /// <summary>
        /// Builds one model: feeds every input file through the tokenizer into a <c>tfidf</c>
        /// accumulator, runs the weighting selected by <c>bp.Method</c>, and writes the result
        /// as a tab-separated text file.
        /// </summary>
        /// <remarks>
        /// The output file is placed in the directory of <c>bp.OutputFilenamePattern</c> and named
        /// <c>{name}-({Method}-{Ngrams}-{D_param}){extension}</c>. The directory is created when
        /// it does not exist. Input files are read with <c>INPUT_ENCODING</c>; the output is
        /// written with <c>OUTPUT_ENCODING</c>.
        /// </remarks>
        /// <param name="bp">Build parameters: input folder and file names, method, n-grams,
        /// D parameter, URL-detector model and output file pattern.</param>
        /// <exception cref="ArgumentNullException"><paramref name="bp"/> is null.</exception>
        /// <exception cref="InvalidDataException">An input file is empty or white-space only.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><c>bp.Method</c> is not one of the
        /// methods handled by this routine.</exception>
        private static void Build(build_params_t bp)
        {
            if (bp == null)
            {
                throw (new ArgumentNullException("bp"));
            }

            Console.WriteLine("start process: '" + bp.ToString() + "'...");

            #region [.-0-.]
            var _tfidf    = new tfidf(bp.Ngrams, bp.D_param);
            var tokenizer = new ClassifyTokenizer(bp.UrlDetectorModel);
            #endregion

            #region [.-1-.]
            // Each input file is added to the accumulator as one document.
            foreach (var inputFilename in bp.InputFilenames)
            {
                Console.WriteLine("start process file: '" + new FileInfo(inputFilename).Name + "'...");

                var fileName = Path.Combine(bp.InputFolder, inputFilename);
                var text     = File.ReadAllText(fileName, INPUT_ENCODING);
                if (string.IsNullOrWhiteSpace(text))
                {
                    throw (new InvalidDataException("input text is-null-or-white-space, filename: '" + fileName + '\''));
                }

                _tfidf.BeginAddDocument();
                tokenizer.Run(text, (word) =>
                {
                    _tfidf.AddDocumentWord(word);
                });
                _tfidf.EndAddDocument();

                // Drop the reference to the file text before asking for a collection.
                text = null;

                GCCollect();

                Console.WriteLine("end process file");
            }
            #endregion

            #region [.-2-.]
            Console.WriteLine("start process TFiDF...");

            tfidf.result _tfidf_result;
            switch (bp.Method)
            {
            case MethodEnum.tfidf:
                _tfidf_result = _tfidf.Process();
                break;

            case MethodEnum.bm25:
                _tfidf_result = _tfidf.Process_BM25();
                break;

            case MethodEnum.R_tfidf:
                _tfidf_result = _tfidf.Process_R();
                break;

            default:
                // Fail with a clear message instead of a NullReferenceException further down
                // when an unhandled method value is passed in.
                throw (new ArgumentOutOfRangeException("bp", bp.Method, "unsupported method: '" + bp.Method + '\''));
            }
            // The accumulator is no longer needed once the result has been produced.
            _tfidf = null;
            GCCollect();

            Console.WriteLine("end process TFiDF");
            #endregion

            #region [.-3-.]
            Console.WriteLine("start write result...");
            var fi = new FileInfo(bp.OutputFilenamePattern);
            if (!fi.Directory.Exists)
            {
                fi.Directory.Create();
            }
            // Insert "-(method-ngrams-d_param)" between the file name and its extension.
            var outputFile = Path.Combine(fi.DirectoryName, fi.Name.Substring(0, fi.Name.Length - fi.Extension.Length) +
                                          "-(" + bp.Method + "-" + bp.Ngrams + "-" + bp.D_param + ")" + fi.Extension);

            var sb  = new StringBuilder();
            // Values are always written with '.' as decimal separator, whatever the current culture is.
            var nfi = new NumberFormatInfo()
            {
                NumberDecimalSeparator = "."
            };
            using (var sw = new StreamWriter(outputFile, false, OUTPUT_ENCODING))
            {
                // Header columns are the files that were actually processed for this build.
                var header = "#\t'" + string.Join("'\t'", bp.InputFilenames) + '\'';
                sw.WriteLine(header);

                for (int i = 0, len = _tfidf_result.TFiDF.Length; i < len; i++)
                {
                    var values = _tfidf_result.TFiDF[i];
                    // Rows for which AllValuesAreEquals reports true are not written.
                    if (!AllValuesAreEquals(values))
                    {
                        var w = _tfidf_result.Words[i];
                        sb.Clear().Append(w);
                        // The separator goes before each value, so no trailing tab has to be trimmed.
                        for (int j = 0, values_len = values.Length; j < values_len; j++)
                        {
                            sb.Append('\t').Append(values[j].ToString(nfi));
                        }

                        sw.WriteLine(sb.ToString());
                    }
                }
            }
            Console.WriteLine("end write result" + Environment.NewLine);
            #endregion
        }