/// <summary>
/// Cuts the least frequent n-grams from <paramref name="dict"/> so that the surviving,
/// most frequent entries cover the share of total occurrences given by GetCutPercent.
/// </summary>
public void CutDictionaryIfNeed(Dictionary<string, int> dict, NGramsEnum dictType)
{
    var percent = GetCutPercent(dictType, _CutThreshold);
    if (!percent.HasValue)
    {
        return;
    }

    // Sort all words by frequency (word_t_comparer order) and accumulate the total count.
    var ss  = new SortedSet<word_t>(word_t_comparer.Instance);
    var sum = 0;
    foreach (var p in dict)
    {
        sum += p.Value;
        ss.Add(new word_t() { Value = p.Key, Count = p.Value });
    }

    // Refill the dictionary with the most frequent words until the cumulative count
    // exceeds the coverage threshold; the remaining tail is dropped.
    var threshold         = sum * percent.Value / 100.0f;
    var threshold_current = 0;
    dict.Clear();
    foreach (var word in ss)
    {
        threshold_current += word.Count;
        if (threshold < threshold_current)
        {
            break;
        }
        dict.Add(word.Value, word.Count);
    }
}
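
// Usage sketch (hypothetical counts, assuming word_t_comparer orders by descending Count):
// given dict = { "a": 60, "b": 30, "c": 10 } and a cut percent of 90, threshold = 100 * 90 / 100 = 90.
// "a" (cumulative 60) and "b" (cumulative 90) are re-added; "c" pushes the cumulative count
// to 100 > 90, so the loop breaks and "c" is dropped.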
private void ProcessNgrams_Routine(NGramsEnum ngram)
{
    var outputFilenames = _OutputFilenames[ngram];
    if (outputFilenames.Count == 0)
    {
        // No portion files were written, so the whole document fits in memory:
        // pick the in-memory dictionary for this n-gram order and add it directly.
        var dict = default(Dictionary<string, int>);
        switch (ngram)
        {
            //case NGramsEnum.ngram_1: dict = _DocumentNgrams_1; break;
            case NGramsEnum.ngram_2: dict = _DocumentNgrams_2; break;
            case NGramsEnum.ngram_3: dict = _DocumentNgrams_3; break;
            case NGramsEnum.ngram_4: dict = _DocumentNgrams_4; break;
            default:                 dict = _DocumentNgrams_1; break;
        }
        _TFProcessor.AddDocumentTerms(dict);
    }
    else
    {
        Console.WriteLine();

        //-1- merge the existing portion files into new, larger sorted portions on disk
        _OutputFilenames[ngram] = new List<string>();
        var ss = new SortedSet<TermFrequency>(TermFrequencyComparer.Instance);
        foreach (var tf in GroupByMerging_1(outputFilenames))
        {
            ss.Add(tf);
            if (_Bp.MaxPortionSize <= ss.Count)
            {
                var lst = _OutputFilenames[ngram];
                var outputFilename = Write2File(_Fi, lst.Count, ss, ngram, _CurrentProcesId);
                lst.Add(outputFilename);
                ss.Clear();
            }
        }
        if (ss.Count != 0)
        {
            var lst = _OutputFilenames[ngram];
            var outputFilename = Write2File(_Fi, lst.Count, ss, ngram, _CurrentProcesId);
            lst.Add(outputFilename);
            ss.Clear();
        }

        //-2- delete the old portions, then merge the new ones into a single frequency-cut set
        outputFilenames.ForEach(outputFilename => File.Delete(outputFilename));
        outputFilenames = _OutputFilenames[ngram];
        var tuples = CreateTuples4Merging(outputFilenames);
        ss = TFProcessor.CreateSortedSetAndCutIfNeed(GroupByMerging_2(tuples));

        //-3- release the merge readers, delete the new portions, add the surviving terms
        tuples.ForEach(t => t.Dispose());
        outputFilenames.ForEach(outputFilename => File.Delete(outputFilename));
        _TFProcessor.AddDocumentTerms(ss);
    }
}
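
// Flow sketch for the portioned branch above (assuming GroupByMerging_1/GroupByMerging_2
// perform a k-way merge over the sorted portion files): step -1- re-merges the existing
// portions into new, larger sorted portions; step -2- deletes the old files and merges the
// new portions into one frequency-cut set; step -3- disposes the merge readers, deletes
// the new files, and hands the surviving terms to _TFProcessor.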
private void CheckPortion(Dictionary<string, int> dict, NGramsEnum ngram)
{
    // Flush the dictionary to a portion file once it reaches the maximum portion size.
    if (_Bp.MaxPortionSize <= dict.Count)
    {
        var lst = _OutputFilenames[ngram];
        var outputFilename = Write2File(_Fi, lst.Count, dict, ngram, _CurrentProcesId);
        lst.Add(outputFilename);
        dict.Clear();
    }
}
public tfidf(NGramsEnum ngrams, CutThresholdEnum cutThreshold)
{
    _Ngrams              = ngrams;
    _CutThreshold        = cutThreshold;
    _WordsByDocsHashset  = new HashSet<string>();
    _DocWordsList        = new List<Dictionary<string, int>>();
    _WordsCountByDocList = new List<int>();
    _Sb                  = new StringBuilder();
}
public tfidf(NGramsEnum ngrams, D_ParamEnum d_param)
{
    _Ngrams              = ngrams;
    _D_param             = d_param;
    _WordsByDocsHashset  = new HashSet<string>();
    _DocWordsList        = new List<Dictionary<string, int>>();
    _WordsCountByDocList = new List<int>();
    _Sb                  = new StringBuilder();
}
// Returns the share of total n-gram occurrences that survives the cut,
// e.g. 100 - 5 = 95% coverage for unigrams under cut_1; null means no cut.
private static float? GetCutPercent(NGramsEnum ngrams, CutThresholdEnum cutThreshold)
{
    switch (cutThreshold)
    {
        //case CutThresholdEnum.cut_0: return null;
        case CutThresholdEnum.cut_1:
            switch (ngrams)
            {
                case NGramsEnum.ngram_1: return 100 - 5;
                case NGramsEnum.ngram_2: return 100 - 50;
                case NGramsEnum.ngram_3: return 100 - 85;
                case NGramsEnum.ngram_4: return 100 - 95;
            }
            break;

        case CutThresholdEnum.cut_2:
            switch (ngrams)
            {
                case NGramsEnum.ngram_1: return 100 - 50;
                case NGramsEnum.ngram_2: return 100 - 85;
                case NGramsEnum.ngram_3: return 100 - 95;
                case NGramsEnum.ngram_4: return 100 - 98;
            }
            break;
    }
    return null;
}
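
// Worked example: GetCutPercent(NGramsEnum.ngram_2, CutThresholdEnum.cut_1) returns
// 100 - 50 = 50, i.e. CutDictionaryIfNeed keeps the most frequent bigrams until they cover
// 50% of all bigram occurrences; under cut_2 the same bigrams keep only 100 - 85 = 15%
// coverage. Higher n-gram orders are cut more aggressively at both levels.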
// Same coverage table as above, keyed by D_ParamEnum instead of CutThresholdEnum.
private static float? GetCutPercent(NGramsEnum ngrams, D_ParamEnum d_param)
{
    switch (d_param)
    {
        //case TDiDF_d_enum.d0: return null;
        case D_ParamEnum.d1:
            switch (ngrams)
            {
                case NGramsEnum.ngram_1: return 100 - 5;
                case NGramsEnum.ngram_2: return 100 - 50;
                case NGramsEnum.ngram_3: return 100 - 85;
                case NGramsEnum.ngram_4: return 100 - 95;
            }
            break;

        case D_ParamEnum.d2:
            switch (ngrams)
            {
                case NGramsEnum.ngram_1: return 100 - 50;
                case NGramsEnum.ngram_2: return 100 - 85;
                case NGramsEnum.ngram_3: return 100 - 95;
                case NGramsEnum.ngram_4: return 100 - 98;
            }
            break;
    }
    return null;
}
private static string Write2File(FileInfo fi, int portionNumber, SortedSet<TermFrequency> ss, NGramsEnum ss_type, int currentProcesId)
{
    var outputFilename = Path.Combine(fi.DirectoryName, "temp", $"pid_{currentProcesId}, {fi.Name}.ss.{ss_type}.{portionNumber}");
    Console.Write($"start write portion-file: '{outputFilename}'...");

    var ofi = new FileInfo(outputFilename);
    if (!ofi.Directory.Exists)
    {
        ofi.Directory.Create();
    }

    // One "<term>\t<frequency>" pair per line, already sorted by the set's comparer.
    using (var sw = new StreamWriter(outputFilename, false, Config.Inst.OUTPUT_ENCODING))
    {
        foreach (var tf in ss)
        {
            sw.Write(tf.Term);
            sw.Write('\t');
            sw.WriteLine(tf.Frequency);
        }
    }

    ss.Clear();
    GC.Collect();
    Console.WriteLine(" => end write portion-file");
    return outputFilename;
}
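
// Example (hypothetical values): for process id 1234, a source file "corpus.txt", bigrams
// and portion 0, the portion file lands in the "temp" subdirectory next to the source as
// "pid_1234, corpus.txt.ss.ngram_2.0", one tab-separated term/frequency pair per line.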
private static string Write2File(FileInfo fi, int portionNumber, Dictionary<string, int> tf_matrix, NGramsEnum tf_matrix_type, int currentProcesId)
{
    var outputFilename = Path.Combine(fi.DirectoryName, "temp", $"pid_{currentProcesId}, {fi.Name}.{tf_matrix_type}.{portionNumber}");
    Console.Write($"start write portion-file: '{outputFilename}'...");

    // Copy into a SortedDictionary so the portion file is written in sorted term order.
    var ss = new SortedDictionary<string, int>(stringComparer.Instance);
    foreach (var p in tf_matrix)
    {
        ss.Add(p.Key, p.Value);
    }
    tf_matrix.Clear();

    var ofi = new FileInfo(outputFilename);
    if (!ofi.Directory.Exists)
    {
        ofi.Directory.Create();
    }

    using (var sw = new StreamWriter(outputFilename, false, Config.Inst.OUTPUT_ENCODING))
    {
        foreach (var p in ss)
        {
            sw.Write(p.Key);
            sw.Write('\t');
            sw.WriteLine(p.Value);
        }
    }

    ss.Clear();
    GC.Collect();
    Console.WriteLine(" => end write portion-file");
    return outputFilename;
}
[M(O.AggressiveInlining)]
private void CheckLastPortion(Dictionary<string, int> dict, NGramsEnum ngram)
{
    // Flush the final, partially filled portion, but only if earlier portions were
    // already written to disk; otherwise the dictionary stays in memory as-is.
    if (dict.Count != 0)
    {
        var lst = _OutputFilenames[ngram];
        if (0 < lst.Count)
        {
            var outputFilename = Write2File(_Fi, lst.Count, dict, ngram, _CurrentProcesId);
            lst.Add(outputFilename);
            dict.Clear();
        }
    }
}
public SortedSet<word_t> CreateSortedSetAndCutIfNeed(IEnumerable<word_t> words, NGramsEnum dictType, int sum)
{
    var ss = new SortedSet<word_t>(word_t_comparer.Instance);
    var percent = GetCutPercent(dictType, _CutThreshold);
    if (percent.HasValue)
    {
        // The cut keeps only the leading words of the sequence, so callers are expected
        // to pass 'words' ordered by descending frequency.
        var threshold         = sum * percent.Value / 100.0f;
        var threshold_current = 0;
        foreach (var word in words)
        {
            threshold_current += word.Count;
            if (threshold < threshold_current)
            {
                break;
            }
            ss.Add(word);
        }
    }
    else
    {
        foreach (var word in words)
        {
            ss.Add(word);
        }
    }
    return ss;
}
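
// Usage sketch (hypothetical, assuming 'words' is ordered by descending Count):
//   var words = new[] { new word_t { Value = "a", Count = 60 }, new word_t { Value = "b", Count = 40 } };
//   var ss = processor.CreateSortedSetAndCutIfNeed(words, NGramsEnum.ngram_1, sum: 100);
// Under cut_1 the unigram percent is 95, so "a" (cumulative 60) survives while "b" pushes
// the cumulative count to 100 > 95 and is cut; with no threshold both words are returned.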
private Dictionary<string, int> CreateNGramsDictionary(IList<string> words, NGramsEnum dictType)
{
    switch (dictType)
    {
        case NGramsEnum.ngram_2:
        {
            // Join every adjacent word pair into a bigram and count occurrences.
            var ngramDict = new Dictionary<string, int>();
            for (int p = 0, len = words.Count - 1; p < len; p++)
            {
                _Sb.Clear().Append(words[p]).Append(' ').Append(words[p + 1]);
                ngramDict.AddOrUpdate(_Sb.ToString());
            }
            CutDictionaryIfNeed(ngramDict, dictType);
            return ngramDict;
        }
        case NGramsEnum.ngram_3:
        {
            // Same as above for word triples.
            var ngramDict = new Dictionary<string, int>();
            for (int p = 0, len = words.Count - 2; p < len; p++)
            {
                _Sb.Clear().Append(words[p]).Append(' ').Append(words[p + 1]).Append(' ').Append(words[p + 2]);
                ngramDict.AddOrUpdate(_Sb.ToString());
            }
            CutDictionaryIfNeed(ngramDict, dictType);
            return ngramDict;
        }
        case NGramsEnum.ngram_4:
        {
            // Same as above for word quadruples.
            var ngramDict = new Dictionary<string, int>();
            for (int p = 0, len = words.Count - 3; p < len; p++)
            {
                _Sb.Clear().Append(words[p]).Append(' ').Append(words[p + 1]).Append(' ').Append(words[p + 2]).Append(' ').Append(words[p + 3]);
                ngramDict.AddOrUpdate(_Sb.ToString());
            }
            CutDictionaryIfNeed(ngramDict, dictType);
            return ngramDict;
        }
        default: //case NGramsEnum.ngram_1:
            return null;
    }
}
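
// Usage sketch: CreateNGramsDictionary(new[] { "to", "be", "or", "not", "to", "be" },
// NGramsEnum.ngram_2) yields { "to be": 2, "be or": 1, "or not": 1, "not to": 1 } before
// any threshold cut; unigrams are counted elsewhere, so ngram_1 falls through to null.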