/// <summary>
/// Creates a classifier over the given model: wires up the URL-aware tokenizer and
/// pre-allocates the per-class scratch buffers sized by <c>model.TotalClassCount</c>.
/// </summary>
/// <param name="config">Classifier configuration; its <c>UrlDetectorModel</c> must be non-null.</param>
/// <param name="model">Trained model supplying <c>TotalClassCount</c>.</param>
public Classifier(ClassifierConfig config, IModel model)
{
    #region [.check config.]
    config.ThrowIfNull("config");
    // BUGFIX: previously reported "config.UrlDetectorConfig", a member that does not
    // exist; the property actually being validated is UrlDetectorModel.
    config.UrlDetectorModel.ThrowIfNull("config.UrlDetectorModel");
    model.ThrowIfNull("model");
    #endregion

    _Model            = model;
    _Tokenizer        = new ClassifyTokenizer(config.UrlDetectorModel);
    _ScalarProducts   = new double[_Model.TotalClassCount];
    _TextTFDictionary = new Dictionary<string, int>(TEXT_TF_DICTIONARY_CAPACITY);

    // One ClassifyInfo slot per class; only the index is seeded here,
    // the rest is filled during classification.
    _ClassInfo = new ClassifyInfo[_Model.TotalClassCount];
    for (int i = 0, len = _Model.TotalClassCount; i < len; i++)
    {
        _ClassInfo[i].ClassIndex = i;
    }
}
/// <summary>
/// Builds a term-weight table from the input corpus and writes it to disk.
/// Pipeline: (0) create the tfidf accumulator and tokenizer; (1) tokenize each input
/// file into its own "document"; (2) run the weighting method selected by
/// <c>bp.Method</c>; (3) write a tab-separated result file whose name encodes the
/// method and its parameters.
/// </summary>
/// <param name="bp">Build parameters: input files/folder, n-gram order, D-parameter,
/// weighting method and output filename pattern.</param>
/// <exception cref="InvalidDataException">An input file is empty or whitespace-only.</exception>
/// <exception cref="NotSupportedException"><c>bp.Method</c> is not a known method.</exception>
private static void Build(build_params_t bp)
{
    Console.WriteLine("start process: '" + bp.ToString() + "'...");

    #region [.-0-.]
    var _tfidf    = new tfidf(bp.Ngrams, bp.D_param);
    var tokenizer = new ClassifyTokenizer(bp.UrlDetectorModel);
    #endregion

    #region [.-1-.]
    // Each input file becomes one "document" in the tfidf accumulator.
    foreach (var inputFilename in bp.InputFilenames)
    {
        Console.WriteLine("start process file: '" + new FileInfo(inputFilename).Name + "'...");

        var fileName = Path.Combine(bp.InputFolder, inputFilename);
        var text     = File.ReadAllText(fileName, INPUT_ENCODING);
        if (string.IsNullOrWhiteSpace(text))
        {
            throw (new InvalidDataException("input text is-null-or-white-space, filename: '" + fileName + '\''));
        }

        // Stream words straight into the accumulator instead of materializing
        // the whole token array (keeps peak memory low for big corpora).
        _tfidf.BeginAddDocument();
        tokenizer.Run(text, (word) => { _tfidf.AddDocumentWord(word); });
        _tfidf.EndAddDocument();

        // Large input strings are released eagerly between files.
        text = null;
        GCCollect();
        Console.WriteLine("end process file");
    }
    #endregion

    #region [.-2-.]
    Console.WriteLine("start process TFiDF...");
    var _tfidf_result = default(tfidf.result);
    switch (bp.Method)
    {
        case MethodEnum.tfidf:   _tfidf_result = _tfidf.Process();      break;
        case MethodEnum.bm25:    _tfidf_result = _tfidf.Process_BM25(); break;
        case MethodEnum.R_tfidf: _tfidf_result = _tfidf.Process_R();    break;
        default:
            // BUGFIX: previously an unknown method fell through silently, leaving
            // _tfidf_result null and crashing later with an opaque NullReferenceException.
            throw new NotSupportedException("unknown method: '" + bp.Method + '\'');
    }
    _tfidf = null;
    GCCollect();
    Console.WriteLine("end process TFiDF");
    #endregion

    #region [.-3-.]
    Console.WriteLine("start write result...");

    var fi = new FileInfo(bp.OutputFilenamePattern);
    if (!fi.Directory.Exists)
    {
        fi.Directory.Create();
    }
    // Output name = pattern name + "-(method-ngrams-dparam)" + original extension.
    var outputFile = Path.Combine(fi.DirectoryName,
        Path.GetFileNameWithoutExtension(fi.Name) + "-(" + bp.Method + "-" + bp.Ngrams + "-" + bp.D_param + ")" + fi.Extension);

    var sb  = new StringBuilder();
    // Force '.' as decimal separator so the output is culture-independent.
    var nfi = new NumberFormatInfo() { NumberDecimalSeparator = "." };
    using (var sw = new StreamWriter(outputFile, false, OUTPUT_ENCODING))
    {
        var header = "#\t'" + string.Join("'\t'", INPUT_FILES) + '\'';
        sw.WriteLine(header);

        for (int i = 0, len = _tfidf_result.TFiDF.Length; i < len; i++)
        {
            var values = _tfidf_result.TFiDF[i];
            // Skip words whose weight is identical across all documents —
            // they carry no discriminative information.
            if (!AllValuesAreEquals(values))
            {
                var w = _tfidf_result.Words[i];
                sb.Clear().Append(w).Append('\t');
                for (int j = 0, values_len = values.Length; j < values_len; j++)
                {
                    sb.Append(values[j].ToString(nfi)).Append('\t');
                }
                sb.Remove(sb.Length - 1, 1); // drop trailing tab
                sw.WriteLine(sb.ToString());
            }
        }
    }

    Console.WriteLine("end write result" + Environment.NewLine);
    #endregion
}