Пример #1
0
        /// <summary>
        /// Creates a classifier over the given model and allocates the per-class working buffers.
        /// </summary>
        /// <param name="config">Classifier configuration; its <c>UrlDetectorModel</c> is passed to the tokenizer.</param>
        /// <param name="model">Model whose <c>TotalClassCount</c> determines the size of the per-class buffers.</param>
        public Classifier(ClassifierConfig config, IModel model)
        {
            #region [.check config.]
            // ThrowIfNull is a helper defined elsewhere; the string is the name it reports on failure.
            config.ThrowIfNull("config");
            config.UrlDetectorModel.ThrowIfNull("config.UrlDetectorModel");
            model.ThrowIfNull("model");
            #endregion

            _Model     = model;
            _Tokenizer = new ClassifyTokenizer(config.UrlDetectorModel);

            // Read the class count once; both per-class buffers are sized from it.
            var classCount = _Model.TotalClassCount;

            _ScalarProducts   = new double[classCount];
            _TextTFDictionary = new Dictionary<string, int>(TEXT_TF_DICTIONARY_CAPACITY);
            _ClassInfo        = new ClassifyInfo[classCount];
            for (var i = 0; i < classCount; i++)
            {
                // Each slot records its own class index.
                // NOTE(review): writing through the array element assumes ClassifyInfo is a struct - confirm.
                _ClassInfo[i].ClassIndex = i;
            }
        }
Пример #2
0
        /// <summary>
        /// Runs one build: feeds the words of every input file into a <c>tfidf</c> accumulator,
        /// computes the result with the method selected by <c>bp.Method</c>, and writes the
        /// result as a tab-separated table to an output file derived from
        /// <c>bp.OutputFilenamePattern</c>.
        /// </summary>
        /// <param name="bp">
        /// Build parameters. This method reads <c>Ngrams</c>, <c>D_param</c>, <c>UrlDetectorModel</c>,
        /// <c>InputFolder</c>, <c>InputFilenames</c>, <c>Method</c> and <c>OutputFilenamePattern</c>.
        /// </param>
        /// <exception cref="InvalidDataException">An input file is null, empty or whitespace only.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><c>bp.Method</c> is not one of the handled methods.</exception>
        private static void Build(build_params_t bp)
        {
            Console.WriteLine("start process: '" + bp.ToString() + "'...");

            #region [.-0-.]
            var _tfidf    = new tfidf(bp.Ngrams, bp.D_param);
            var tokenizer = new ClassifyTokenizer(bp.UrlDetectorModel);
            #endregion

            #region [.-1-.]
            // Each input file is added to the accumulator as one document.
            foreach (var inputFilename in bp.InputFilenames)
            {
                Console.WriteLine("start process file: '" + new FileInfo(inputFilename).Name + "'...");

                var fileName = Path.Combine(bp.InputFolder, inputFilename);
                var text     = File.ReadAllText(fileName, INPUT_ENCODING);
                if (string.IsNullOrWhiteSpace(text))
                {
                    throw (new InvalidDataException("input text is-null-or-white-space, filename: '" + fileName + '\''));
                }

                _tfidf.BeginAddDocument();
                tokenizer.Run(text, (word) =>
                {
                    _tfidf.AddDocumentWord(word);
                });
                _tfidf.EndAddDocument();

                // Drop the file text before moving on to the next file.
                text = null;

                GCCollect();

                Console.WriteLine("end process file");
            }
            #endregion

            #region [.-2-.]
            Console.WriteLine("start process TFiDF...");

            var _tfidf_result = default(tfidf.result);
            switch (bp.Method)
            {
            case MethodEnum.tfidf:
                _tfidf_result = _tfidf.Process();
                break;

            case MethodEnum.bm25:
                _tfidf_result = _tfidf.Process_BM25();
                break;

            case MethodEnum.R_tfidf:
                _tfidf_result = _tfidf.Process_R();
                break;

            default:
                // Without this branch the result stays at its default value and the write phase
                // below fails with an unrelated null dereference after creating the output file.
                throw (new ArgumentOutOfRangeException("bp", bp.Method, "unsupported build method"));
            }
            // The accumulator is no longer needed once the result has been produced.
            _tfidf = null;
            GCCollect();

            Console.WriteLine("end process TFiDF");
            #endregion

            #region [.-3-.]
            Console.WriteLine("start write result...");
            var fi = new FileInfo(bp.OutputFilenamePattern);
            if (!fi.Directory.Exists)
            {
                fi.Directory.Create();
            }

            // Output name: "<pattern name without extension>-(<method>-<ngrams>-<d>)<extension>".
            var nameWithoutExtension = fi.Name.Substring(0, fi.Name.Length - fi.Extension.Length);
            var outputFile = Path.Combine(fi.DirectoryName,
                                          nameWithoutExtension + "-(" + bp.Method + "-" + bp.Ngrams + "-" + bp.D_param + ")" + fi.Extension);

            var sb  = new StringBuilder();
            // Values are always written with '.' as the decimal separator.
            var nfi = new NumberFormatInfo()
            {
                NumberDecimalSeparator = "."
            };
            using (var sw = new StreamWriter(outputFile, false, OUTPUT_ENCODING))
            {
                // NOTE(review): the header columns come from INPUT_FILES, while the documents were
                // read from bp.InputFilenames - confirm the two always list the same files in the same order.
                var header = "#\t'" + string.Join("'\t'", INPUT_FILES) + '\'';
                sw.WriteLine(header);

                for (int i = 0, len = _tfidf_result.TFiDF.Length; i < len; i++)
                {
                    var values = _tfidf_result.TFiDF[i];
                    // Rows for which AllValuesAreEquals returns true are not written.
                    if (AllValuesAreEquals(values))
                    {
                        continue;
                    }

                    // One row per word: the word followed by its tab-separated values.
                    sb.Clear().Append(_tfidf_result.Words[i]);
                    for (int j = 0, values_len = values.Length; j < values_len; j++)
                    {
                        sb.Append('\t').Append(values[j].ToString(nfi));
                    }

                    sw.WriteLine(sb.ToString());
                }
            }
            Console.WriteLine("end write result" + Environment.NewLine);
            #endregion
        }