コード例 #1
0
        /// <summary>
        /// Trims <paramref name="dict"/> in place: entries are visited in the order defined by
        /// <c>word_t_comparer</c> and kept while the running total of their counts stays within
        /// the cut percentage (of the sum of all counts) returned by <c>GetCutPercent</c>.
        /// </summary>
        /// <remarks>
        /// When <c>GetCutPercent</c> returns no value, the dictionary is left untouched.
        /// </remarks>
        /// <param name="dict">Term-to-count map that is rewritten in place.</param>
        /// <param name="dictType">N-gram kind used to look up the cut percentage.</param>
        public void CutDictionaryIfNeed(Dictionary <string, int> dict, NGramsEnum dictType)
        {
            var cutPercent = GetCutPercent(dictType, _CutThreshold);
            if (!cutPercent.HasValue)
            {
                return;
            }

            // Single pass: order the entries with the comparer and total up all counts.
            var ordered = new SortedSet <word_t>(word_t_comparer.Instance);
            var total   = 0;
            foreach (var pair in dict)
            {
                total += pair.Value;
                ordered.Add(new word_t() { Value = pair.Key, Count = pair.Value });
            }

            var limit   = total * cutPercent.Value / 100.0f;
            var running = 0;

            // Refill the dictionary from the ordered entries; stop at the first entry
            // that pushes the running total past the limit.
            dict.Clear();
            foreach (var entry in ordered)
            {
                running += entry.Count;
                if (running > limit)
                {
                    break;
                }

                dict.Add(entry.Value, entry.Count);
            }
        }
コード例 #2
0
        /// <summary>
        /// Hands the n-grams collected for <paramref name="ngram"/> to <c>_TFProcessor</c>.
        /// </summary>
        /// <remarks>
        /// If no portion files are registered for this n-gram kind, the matching in-memory dictionary
        /// is passed on directly. Otherwise the registered portion files are processed in three steps:
        /// (1) the items produced by <c>GroupByMerging_1</c> are regrouped into new portion files of at
        /// most <c>_Bp.MaxPortionSize</c> entries; (2) the old files are deleted and the new ones are
        /// merged through <c>GroupByMerging_2</c> into one sorted set; (3) the merge tuples are disposed,
        /// the new files are deleted and the set is passed to <c>_TFProcessor</c>.
        /// </remarks>
        /// <param name="ngram">N-gram kind to process.</param>
        private void ProcessNgrams_Routine(NGramsEnum ngram)
        {
            var outputFilenames = _OutputFilenames[ngram];

            if (outputFilenames.Count == 0)
            {
                // No portion files were written: use the in-memory dictionary for this kind.
                Dictionary <string, int> dict;
                switch (ngram)
                {
                    case NGramsEnum.ngram_2: dict = _DocumentNgrams_2; break;
                    case NGramsEnum.ngram_3: dict = _DocumentNgrams_3; break;
                    case NGramsEnum.ngram_4: dict = _DocumentNgrams_4; break;
                    default:                 dict = _DocumentNgrams_1; break;
                }

                _TFProcessor.AddDocumentTerms(dict);
                return;
            }

            Console.WriteLine();

            //-1- regroup the existing portion files into a fresh list of portion files
            _OutputFilenames[ngram] = new List <string>();
            var ss = new SortedSet <TermFrequency>(TermFrequencyComparer.Instance);
            foreach (var tf in GroupByMerging_1(outputFilenames))
            {
                ss.Add(tf);
                if (_Bp.MaxPortionSize <= ss.Count)
                {
                    var lst            = _OutputFilenames[ngram];
                    var outputFilename = Write2File(_Fi, lst.Count, ss, ngram, _CurrentProcesId);
                    lst.Add(outputFilename);
                    ss.Clear();
                }
            }
            if (ss.Count != 0)
            {
                // Flush the last, partially filled portion.
                var lst            = _OutputFilenames[ngram];
                var outputFilename = Write2File(_Fi, lst.Count, ss, ngram, _CurrentProcesId);
                lst.Add(outputFilename);
                ss.Clear();
            }

            //-2- drop the old portion files and merge the new ones
            outputFilenames.ForEach(path => File.Delete(path));
            outputFilenames = _OutputFilenames[ngram];
            var tuples = CreateTuples4Merging(outputFilenames);
            try
            {
                ss = TFProcessor.CreateSortedSetAndCutIfNeed(GroupByMerging_2(tuples));
            }
            finally
            {
                // Dispose the merge tuples even when merging throws, so they are not leaked.
                tuples.ForEach(t => t.Dispose());
            }

            //-3- remove the temporary files (after the tuples are disposed) and publish the result
            outputFilenames.ForEach(path => File.Delete(path));
            _TFProcessor.AddDocumentTerms(ss);
        }
コード例 #3
0
 /// <summary>
 /// Once <paramref name="dict"/> holds at least <c>_Bp.MaxPortionSize</c> entries, writes it to a
 /// new portion file, registers the file name under <paramref name="ngram"/> and empties the dictionary.
 /// </summary>
 /// <param name="dict">In-memory term-to-count map to check and possibly flush.</param>
 /// <param name="ngram">N-gram kind whose portion-file list receives the new file name.</param>
 private void CheckPortion(Dictionary <string, int> dict, NGramsEnum ngram)
 {
     if (!(_Bp.MaxPortionSize <= dict.Count))
     {
         return;
     }

     // The current number of registered files serves as the new portion's number.
     var portionFiles = _OutputFilenames[ngram];
     portionFiles.Add(Write2File(_Fi, portionFiles.Count, dict, ngram, _CurrentProcesId));
     dict.Clear();
 }
コード例 #4
0
        /// <summary>
        /// Creates an instance configured with the given n-gram kind and cut threshold;
        /// all internal collections and the string builder start out empty.
        /// </summary>
        /// <param name="ngrams">N-gram kind stored in <c>_Ngrams</c>.</param>
        /// <param name="cutThreshold">Cut threshold stored in <c>_CutThreshold</c>.</param>
        public tfidf(NGramsEnum ngrams, CutThresholdEnum cutThreshold)
        {
            // Empty working state.
            _Sb                  = new StringBuilder();
            _WordsCountByDocList = new List <int>();
            _DocWordsList        = new List <Dictionary <string, int> >();
            _WordsByDocsHashset  = new HashSet <string>();

            // Caller-supplied configuration.
            _CutThreshold = cutThreshold;
            _Ngrams       = ngrams;
        }
コード例 #5
0
        /// <summary>
        /// Creates an instance configured with the given n-gram kind and d-parameter;
        /// all internal collections and the string builder start out empty.
        /// </summary>
        /// <param name="ngrams">N-gram kind stored in <c>_Ngrams</c>.</param>
        /// <param name="d_param">Parameter stored in <c>_D_param</c>.</param>
        public tfidf(NGramsEnum ngrams, D_ParamEnum d_param)
        {
            // Empty working state.
            _Sb                  = new StringBuilder();
            _WordsCountByDocList = new List <int>();
            _DocWordsList        = new List <Dictionary <string, int> >();
            _WordsByDocsHashset  = new HashSet <string>();

            // Caller-supplied configuration.
            _D_param = d_param;
            _Ngrams  = ngrams;
        }
コード例 #6
0
        /// <summary>
        /// Looks up the percentage used for cutting, given the n-gram kind and the cut threshold.
        /// </summary>
        /// <param name="ngrams">N-gram kind.</param>
        /// <param name="cutThreshold">Cut threshold level.</param>
        /// <returns>
        /// For <c>cut_1</c>: 95, 50, 15, 5 for <c>ngram_1</c>..<c>ngram_4</c>;
        /// for <c>cut_2</c>: 50, 15, 5, 2; <c>null</c> for every other combination.
        /// </returns>
        private static float? GetCutPercent(NGramsEnum ngrams, CutThresholdEnum cutThreshold)
        {
            if (cutThreshold == CutThresholdEnum.cut_1)
            {
                switch (ngrams)
                {
                    case NGramsEnum.ngram_1: return 100 - 5;
                    case NGramsEnum.ngram_2: return 100 - 50;
                    case NGramsEnum.ngram_3: return 100 - 85;
                    case NGramsEnum.ngram_4: return 100 - 95;
                    default:                 return null;
                }
            }

            if (cutThreshold == CutThresholdEnum.cut_2)
            {
                switch (ngrams)
                {
                    case NGramsEnum.ngram_1: return 100 - 50;
                    case NGramsEnum.ngram_2: return 100 - 85;
                    case NGramsEnum.ngram_3: return 100 - 95;
                    case NGramsEnum.ngram_4: return 100 - 98;
                    default:                 return null;
                }
            }

            // Any other threshold value: no percentage.
            return null;
        }
コード例 #7
0
        /// <summary>
        /// Looks up the percentage used for cutting, given the n-gram kind and the d-parameter.
        /// </summary>
        /// <param name="ngrams">N-gram kind.</param>
        /// <param name="d_param">D-parameter level.</param>
        /// <returns>
        /// For <c>d1</c>: 95, 50, 15, 5 for <c>ngram_1</c>..<c>ngram_4</c>;
        /// for <c>d2</c>: 50, 15, 5, 2; <c>null</c> for every other combination.
        /// </returns>
        private static float? GetCutPercent(NGramsEnum ngrams, D_ParamEnum d_param)
        {
            if (d_param == D_ParamEnum.d1)
            {
                switch (ngrams)
                {
                    case NGramsEnum.ngram_1: return 100 - 5;
                    case NGramsEnum.ngram_2: return 100 - 50;
                    case NGramsEnum.ngram_3: return 100 - 85;
                    case NGramsEnum.ngram_4: return 100 - 95;
                    default:                 return null;
                }
            }

            if (d_param == D_ParamEnum.d2)
            {
                switch (ngrams)
                {
                    case NGramsEnum.ngram_1: return 100 - 50;
                    case NGramsEnum.ngram_2: return 100 - 85;
                    case NGramsEnum.ngram_3: return 100 - 95;
                    case NGramsEnum.ngram_4: return 100 - 98;
                    default:                 return null;
                }
            }

            // Any other parameter value: no percentage.
            return null;
        }
コード例 #8
0
        /// <summary>
        /// Writes every entry of <paramref name="ss"/> as a "term TAB frequency" line to a portion file
        /// inside the "temp" sub-directory of <paramref name="fi"/>'s directory, then empties the set.
        /// </summary>
        /// <param name="fi">File whose directory and name are used to build the portion-file path.</param>
        /// <param name="portionNumber">Sequence number appended to the portion-file name.</param>
        /// <param name="ss">Entries to write, in the set's order; cleared on return.</param>
        /// <param name="ss_type">N-gram kind embedded in the portion-file name.</param>
        /// <param name="currentProcesId">Process id embedded in the portion-file name.</param>
        /// <returns>Full path of the portion file that was written.</returns>
        private static string Write2File(FileInfo fi, int portionNumber, SortedSet <TermFrequency> ss, NGramsEnum ss_type, int currentProcesId)
        {
            var outputFilename = Path.Combine(fi.DirectoryName, "temp", $"pid_{currentProcesId}, {fi.Name}.ss.{ss_type}.{portionNumber}");

            Console.Write($"start write portion-file: '{outputFilename}'...");

            // Make sure the target directory exists before opening the writer.
            var targetDir = new FileInfo(outputFilename).Directory;
            if (!targetDir.Exists)
            {
                targetDir.Create();
            }

            using (var writer = new StreamWriter(outputFilename, false, Config.Inst.OUTPUT_ENCODING))
            {
                foreach (var item in ss)
                {
                    writer.Write(item.Term);
                    writer.Write('\t');
                    writer.WriteLine(item.Frequency);
                }
            }

            // Release the written entries; the explicit collection is kept from the original
            // (presumably to reclaim memory between portions - TODO confirm it is still needed).
            ss.Clear();
            GC.Collect();

            Console.WriteLine(" => end write portion-file");

            return outputFilename;
        }
コード例 #9
0
        /// <summary>
        /// Sorts the entries of <paramref name="tf_matrix"/> by key using <c>stringComparer</c>, writes them
        /// as "key TAB value" lines to a portion file inside the "temp" sub-directory of
        /// <paramref name="fi"/>'s directory, and empties the dictionary.
        /// </summary>
        /// <param name="fi">File whose directory and name are used to build the portion-file path.</param>
        /// <param name="portionNumber">Sequence number appended to the portion-file name.</param>
        /// <param name="tf_matrix">Term-to-count map to write; cleared once copied into the sorted view.</param>
        /// <param name="tf_matrix_type">N-gram kind embedded in the portion-file name.</param>
        /// <param name="currentProcesId">Process id embedded in the portion-file name.</param>
        /// <returns>Full path of the portion file that was written.</returns>
        private static string Write2File(FileInfo fi, int portionNumber, Dictionary <string, int> tf_matrix, NGramsEnum tf_matrix_type, int currentProcesId)
        {
            var outputFilename = Path.Combine(fi.DirectoryName, "temp", $"pid_{currentProcesId}, {fi.Name}.{tf_matrix_type}.{portionNumber}");

            Console.Write($"start write portion-file: '{outputFilename}'...");

            // Move everything into a key-sorted copy, then free the source dictionary.
            var sorted = new SortedDictionary <string, int>(stringComparer.Instance);
            foreach (var pair in tf_matrix)
            {
                sorted.Add(pair.Key, pair.Value);
            }
            tf_matrix.Clear();

            // Make sure the target directory exists before opening the writer.
            var targetDir = new FileInfo(outputFilename).Directory;
            if (!targetDir.Exists)
            {
                targetDir.Create();
            }

            using (var writer = new StreamWriter(outputFilename, false, Config.Inst.OUTPUT_ENCODING))
            {
                foreach (var pair in sorted)
                {
                    writer.Write(pair.Key);
                    writer.Write('\t');
                    writer.WriteLine(pair.Value);
                }
            }

            // Release the sorted copy; the explicit collection is kept from the original
            // (presumably to reclaim memory between portions - TODO confirm it is still needed).
            sorted.Clear();
            GC.Collect();

            Console.WriteLine(" => end write portion-file");

            return outputFilename;
        }
コード例 #10
0
 /// <summary>
 /// Writes the remaining entries of <paramref name="dict"/> to one more portion file, but only when
 /// the dictionary is non-empty and at least one portion file is already registered for
 /// <paramref name="ngram"/>; otherwise nothing is written and the dictionary is left as is.
 /// </summary>
 /// <param name="dict">In-memory term-to-count map holding the last, partial portion.</param>
 /// <param name="ngram">N-gram kind whose portion-file list is consulted and extended.</param>
 [M(O.AggressiveInlining)] private void CheckLastPortion(Dictionary <string, int> dict, NGramsEnum ngram)
 {
     if (dict.Count == 0)
     {
         return;
     }

     var portionFiles = _OutputFilenames[ngram];
     if (portionFiles.Count <= 0)
     {
         // No earlier portion files for this kind: nothing is written.
         return;
     }

     portionFiles.Add(Write2File(_Fi, portionFiles.Count, dict, ngram, _CurrentProcesId));
     dict.Clear();
 }
コード例 #11
0
        /// <summary>
        /// Builds a sorted set (ordered by <c>word_t_comparer</c>) from <paramref name="words"/>.
        /// When a cut percentage exists for <paramref name="dictType"/>, items are taken in enumeration
        /// order only while the running total of their counts stays within that percentage of
        /// <paramref name="sum"/>; otherwise every item is added.
        /// </summary>
        /// <param name="words">Items to add, consumed in enumeration order.</param>
        /// <param name="dictType">N-gram kind used to look up the cut percentage.</param>
        /// <param name="sum">Total against which the cut percentage is applied.</param>
        /// <returns>The newly created set.</returns>
        public SortedSet <word_t> CreateSortedSetAndCutIfNeed(IEnumerable <word_t> words, NGramsEnum dictType, int sum)
        {
            var result     = new SortedSet <word_t>(word_t_comparer.Instance);
            var cutPercent = GetCutPercent(dictType, _CutThreshold);

            if (!cutPercent.HasValue)
            {
                // No cut configured: keep everything.
                foreach (var item in words)
                {
                    result.Add(item);
                }

                return result;
            }

            var limit   = sum * cutPercent.Value / 100.0f;
            var running = 0;
            foreach (var item in words)
            {
                running += item.Count;
                if (running > limit)
                {
                    // This item would exceed the limit; it and all following items are dropped.
                    break;
                }

                result.Add(item);
            }

            return result;
        }
コード例 #12
0
        /// <summary>
        /// Builds a dictionary of space-joined n-grams over consecutive <paramref name="words"/> for
        /// <c>ngram_2</c>, <c>ngram_3</c> or <c>ngram_4</c>, registering each one through
        /// <c>AddOrUpdate</c>, and then passes the dictionary to <c>CutDictionaryIfNeed</c>.
        /// </summary>
        /// <param name="words">Word sequence to slide the n-gram window over.</param>
        /// <param name="dictType">N-gram kind that selects the window size.</param>
        /// <returns>The n-gram dictionary, or <c>null</c> for any other n-gram kind (e.g. <c>ngram_1</c>).</returns>
        private Dictionary <string, int> CreateNGramsDictionary(IList <string> words, NGramsEnum dictType)
        {
            // Window size for the requested n-gram kind.
            int size;
            switch (dictType)
            {
                case NGramsEnum.ngram_2: size = 2; break;
                case NGramsEnum.ngram_3: size = 3; break;
                case NGramsEnum.ngram_4: size = 4; break;
                default:                 return null;     // ngram_1 and anything else
            }

            var ngramDict = new Dictionary <string, int>();

            // Slide the window over every start position that still leaves `size` words.
            for (int start = 0, lastStart = words.Count - size; start <= lastStart; start++)
            {
                _Sb.Clear().Append(words[start]);
                for (var offset = 1; offset < size; offset++)
                {
                    _Sb.Append(' ').Append(words[start + offset]);
                }

                ngramDict.AddOrUpdate(_Sb.ToString());
            }

            CutDictionaryIfNeed(ngramDict, dictType);

            return ngramDict;
        }