/// <summary>
/// Flushes every dirty bitmap held in the cache to disk, then clears the dirty flag.
/// No-op when nothing has changed since the last commit.
/// </summary>
/// <param name="freeMemory">When true, the in-memory bitmap cache is discarded after the write.</param>
public void Commit(bool freeMemory)
{
    if (!_isDirty)
        return;

    using (new L(this))
    {
        log.Debug("writing " + _FileName);

        // Persist dirty bitmaps in ascending key order.
        int[] sortedKeys = _cache.Keys();
        Array.Sort(sortedKeys);
        foreach (int key in sortedKeys)
        {
            WAHBitArray bitmap = null;
            if (_cache.TryGetValue(key, out bitmap) == false || bitmap.isDirty == false)
                continue;

            SaveBitmap(key, bitmap);
            bitmap.FreeMemory();
            bitmap.isDirty = false;
        }

        Flush();

        if (freeMemory)
        {
            // Drop the whole cache; bitmaps will be reloaded from disk on demand.
            _cache = new SafeSortedList<int, WAHBitArray>();
            log.Debug(" freeing cache");
        }

        _isDirty = false;
    }
}
/// <summary>
/// Initializes the wrapper with empty (never null) collections.
/// Callers may overwrite these via an object initializer (see usage in
/// CalculateProbabilitiesChangeStateNew).
/// </summary>
internal ProbabilityChangeStateWrapper()
{
    countInitialStrings = new SafeSortedList<string, int>();
    countCombinedStrings = new SafeSortedList<string, int>();
    fragmentsOfModel = new List<LanguageIdentificationModelWrapper>();
}
/// <summary>
/// Loads the full page list from disk, recording both the disk pages that hold it
/// and the page entries themselves.
/// </summary>
/// <param name="PageListDiskPages">Receives the disk page numbers that store the page list (page 0 is always first).</param>
/// <param name="PageList">Receives the key -> PageInfo entries read from each chunk.</param>
/// <param name="lastIndexedRow">Set from the file header (offset 11).</param>
public void GetPageList(List<int> PageListDiskPages, SafeSortedList<T, PageInfo> PageList, out int lastIndexedRow)
{
    lastIndexedRow = Helper.ToInt32(_FileHeader, 11);

    // Disk page 0 always holds the first chunk of the page list; each chunk
    // links to the next one, and -1 marks the end of the chain.
    PageListDiskPages.Add(0);
    for (int next = LoadPageListData(0, PageList); next != -1; )
    {
        next = LoadPageListData(next, PageList);
        if (next != -1)
            PageListDiskPages.Add(next);
    }
}
/// <summary>
/// Writes the in-memory page list to disk as a linked chain of fixed-size pages,
/// Global.PageItemCount entries per page. Each page starts with a block header
/// that carries the entry count and the disk page number of the next chunk
/// (-1 on the last chunk).
/// </summary>
/// <param name="_pages">Key -> PageInfo entries to persist.</param>
/// <param name="diskpages">Disk page numbers backing the list; grown here if more pages are needed.</param>
internal void SavePageList(SafeSortedList<T, PageInfo> _pages, List<int> diskpages)
{
    lock (_fileLock)
    {
        // save page list
        // Pages needed: one per full batch of PageItemCount entries, plus one for the tail
        // (integer division rounds down, so "+ 1" covers the partial last page).
        int c = (_pages.Count / Global.PageItemCount) + 1;
        // allocate pages needed
        while (c > diskpages.Count)
        {
            diskpages.Add(GetNewPageNumber());
        }
        byte[] page = new byte[_PageLength];
        // Write every full page; each header links forward to the next disk page.
        for (int i = 0; i < (diskpages.Count - 1); i++)
        {
            byte[] block = CreateBlockHeader(1, Global.PageItemCount, diskpages[i + 1]);
            Buffer.BlockCopy(block, 0, page, 0, block.Length);
            for (int j = 0; j < Global.PageItemCount; j++)
            {
                CreatePageListData(_pages, i * Global.PageItemCount, block.Length, j, page);
            }
            SeekPage(diskpages[i]);
            _file.Write(page, 0, page.Length);
        }
        // Last (partial) page: header carries the remaining count and -1 = end of chain.
        c = _pages.Count % Global.PageItemCount;
        byte[] lastblock = CreateBlockHeader(1, (ushort)c, -1);
        Buffer.BlockCopy(lastblock, 0, page, 0, lastblock.Length);
        int lastoffset = (_pages.Count / Global.PageItemCount) * Global.PageItemCount;
        for (int j = 0; j < c; j++)
        {
            CreatePageListData(_pages, lastoffset, lastblock.Length, j, page);
        }
        // NOTE(review): "page" is reused from the loop above without being cleared, so
        // bytes past the last entry may hold stale data from the previous page. The
        // reader (LoadPageListData) honours the header count, so this looks benign —
        // confirm no other reader scans past the count.
        // NOTE(review): when _pages.Count is an exact multiple of PageItemCount, the
        // last chunk is written with count 0 — verify readers tolerate an empty tail.
        SeekPage(diskpages[diskpages.Count - 1]);
        _file.Write(page, 0, page.Length);
    }
}
/// <summary>
/// Reads one page-list chunk from disk and appends its entries to
/// <paramref name="PageList"/>.
/// Record layout per entry (starting at _BlockHeader.Length, _rowSize bytes each):
/// [0] key length (1 byte), [1.._maxKeySize] key bytes,
/// then 4-byte page number, then 4-byte unique count.
/// </summary>
/// <param name="page">Disk page number of the chunk to read.</param>
/// <param name="PageList">Receives the decoded key -> PageInfo entries.</param>
/// <returns>The next chunk's disk page number, or -1 when this is the last chunk.</returns>
/// <exception cref="Exception">Thrown when the count exceeds the node size or the block header is invalid.</exception>
private int LoadPageListData(int page, SafeSortedList<T, PageInfo> PageList)
{
    lock (_fileLock)
    {
        // load page list data
        int nextpage = -1;
        SeekPage(page);
        byte[] b = new byte[_PageLength];
        // FIX: Stream.Read may return fewer bytes than requested; loop until the
        // whole page is buffered (a premature EOF leaves zeroed bytes, which the
        // header check below rejects).
        int read = 0;
        while (read < _PageLength)
        {
            int got = _file.Read(b, read, _PageLength - read);
            if (got <= 0)
                break;
            read += got;
        }
        if (b[0] == _BlockHeader[0] && b[1] == _BlockHeader[1] && b[2] == _BlockHeader[2] && b[3] == _BlockHeader[3])
        {
            // entry count lives at offset 5, next-chunk pointer at offset 11
            short count = Helper.ToInt16(b, 5);
            if (count > _PageNodeCount)
            {
                throw new Exception("Count > node size");
            }
            nextpage = Helper.ToInt32(b, 11);
            int index = _BlockHeader.Length;
            for (int i = 0; i < count; i++)
            {
                int idx = index + _rowSize * i;
                byte ks = b[idx];
                T key = _T.GetObject(b, idx + 1, ks);
                int pagenum = Helper.ToInt32(b, idx + 1 + _maxKeySize);
                // add counts
                int unique = Helper.ToInt32(b, idx + 1 + _maxKeySize + 4);
                // FEATURE : add dup count
                PageList.Add(key, new PageInfo(pagenum, unique, 0));
            }
        }
        else
        {
            throw new Exception("Page List header is invalid");
        }
        return (nextpage);
    }
}
/// <summary>
/// Serializes one page-list entry into the given page buffer.
/// Layout (mirrors LoadPageListData): 1 key-length byte, key bytes padded to
/// _maxKeySize, 4-byte page number, 4-byte unique count.
/// </summary>
/// <param name="_pages">Source list of key -> PageInfo entries.</param>
/// <param name="offset">Index of the first entry belonging to this disk page.</param>
/// <param name="index">Byte offset in <paramref name="page"/> where entry data starts (block header length).</param>
/// <param name="counter">Entry slot within this disk page.</param>
/// <param name="page">Destination page buffer.</param>
private void CreatePageListData(SafeSortedList<T, PageInfo> _pages, int offset, int index, int counter, byte[] page)
{
    int idx = index + _rowSize * counter;
    // Fetch the PageInfo once instead of two separate GetValue lookups.
    PageInfo info = _pages.GetValue(offset + counter);

    // key bytes
    byte[] kk = _T.GetBytes(_pages.GetKey(counter + offset));
    byte size = (byte)kk.Length;
    // NOTE(review): keys longer than _maxKeySize are silently truncated here —
    // confirm callers guarantee serialized keys fit the column width.
    if (size > _maxKeySize)
    {
        size = _maxKeySize;
    }
    // key size = 1 byte
    page[idx] = size;
    Buffer.BlockCopy(kk, 0, page, idx + 1, size);

    // offset = 4 bytes (page number, written after the fixed-width key column)
    byte[] b = Helper.GetBytes(info.PageNumber, false);
    Buffer.BlockCopy(b, 0, page, idx + 1 + _maxKeySize, b.Length);

    // add counts (unique count = 4 bytes)
    b = Helper.GetBytes(info.UniqueCount, false);
    Buffer.BlockCopy(b, 0, page, idx + 1 + _maxKeySize + 4, b.Length);
    // FEATURE : add dup counts
}
/// <summary>
/// Builds the vector space model: for every (word, document) pair it computes the
/// binary and term-frequency weights, maps them onto VectorSpaceModel rows and
/// persists them via grpClassDataAccess.AddVectorSpaceModel inside the given
/// transaction. Errors from any stage are propagated through the returned wrapper.
/// </summary>
/// <param name="documents">Documents to weight; each supplies id and wordsFromVocabulary.</param>
/// <param name="trainingId">NOTE(review): not referenced anywhere in this method — confirm whether it is dead.</param>
/// <param name="words">Vocabulary words (id + text) used to resolve word ids for the output rows.</param>
/// <param name="transaction">SQL transaction the insert is enlisted in.</param>
/// <param name="nameForVectorSpaceModelTable">Target table for the generated rows.</param>
/// <returns>Wrapper whose theresError/error describe any failure; data is not used.</returns>
private Return<object> CreateVectorSpaceModel(List<Document> documents, int trainingId, List<Word> words, SqlTransaction transaction, string nameForVectorSpaceModelTable)
{
    Return<object> _answer = new Return<object>();
    {
        try
        {
            //Word,DocumentId,Weight_Binary,Weight_TermFrequency,Weight_WeightedTermFrequency,Weight_AugmentedNormalizedTermFrequency
            //,Weight_Logarithmic,Weight_InverseDocumentFrequency,Weight_TF_IDF,Weight_ProbabilisticInverseFrequency,Weight_NormalFunction
            //,Weight_GF_IDF,Weight_Entropy,Weight_WeightedInverseFrequency
            //the list of words in each document is Distinct
            List<string> _vocabulary = words.Select(w => w.word).ToList();
            // word -> (document id -> 12-slot weight tuple: Item1 binary, Item2 term frequency,
            // Item3..Item12 the remaining weighting schemes listed above; null = not computed).
            SafeSortedList<string, SortedList<int, WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>>> _wordsDocumentWeighted =
                new SafeSortedList<string, SortedList<int, WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>>>();
            // Pre-populate the matrix with an all-null tuple per (word, document) pair.
            foreach (string _word in _vocabulary)
            {//word to word
                SortedList<int, WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>> _listOfDocuments =
                    new SortedList<int, WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>>();
                foreach (Document _document in documents)
                {//info to info
                    _listOfDocuments.Add(_document.id,
                        new WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>
                            (null, null, null, null, null, null, null, null, null, null, null, null)
                    );
                }//info to info
                _wordsDocumentWeighted.Add(_word, _listOfDocuments);
            }//word to word
            // (document id, words of that document) pairs fed to the weighting routines.
            List<Tuple<int, List<string>>> _wordsDocument = documents.Select(i => new Tuple<int, List<string>>(i.id, i.wordsFromVocabulary)).ToList();
            if (!_answer.theresError)
            {//BinaryWeighting
                Return<List<Tuple<int, List<Tuple<string, bool>>>>> _answerWeighting = vectorSpaceModelUtility.BinaryWeighting(_vocabulary, _wordsDocument);
                if (_answerWeighting.theresError)
                {
                    _answer.theresError = true;
                    _answer.error = _answerWeighting.error;
                }
                else
                {
                    // Copy each binary weight into slot Item1 of the matching tuple.
                    foreach (Tuple<int, List<Tuple<string, bool>>> _tupleWeightingForDocument in _answerWeighting.data)
                    {//document to document
                        foreach (Tuple<string, bool> _tupleWeightingForWord in _tupleWeightingForDocument.Item2)
                        {//word to word
                            SortedList<int, WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>> _documents =
                                _wordsDocumentWeighted[_tupleWeightingForWord.Item1];
                            _documents[_tupleWeightingForDocument.Item1].Item1 = _tupleWeightingForWord.Item2;
                        }//word to word
                    }//document to document
                }
            }//BinaryWeighting
            if (!_answer.theresError)
            {//TermFrequencyWeighting
                Return<List<Tuple<int, List<Tuple<string, int>>>>> _answerWeighting = vectorSpaceModelUtility.TermFrequencyWeighting(_vocabulary, _wordsDocument);
                if (_answerWeighting.theresError)
                {
                    _answer.theresError = true;
                    _answer.error = _answerWeighting.error;
                }
                else
                {
                    // Copy each term-frequency count into slot Item2 of the matching tuple.
                    foreach (Tuple<int, List<Tuple<string, int>>> _tupleWeightingForDocument in _answerWeighting.data)
                    {//document to document
                        foreach (Tuple<string, int> _tupleWeightingForWord in _tupleWeightingForDocument.Item2)
                        {//word to word
                            SortedList<int, WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>> _documents =
                                _wordsDocumentWeighted[_tupleWeightingForWord.Item1];
                            _documents[_tupleWeightingForDocument.Item1].Item2 = _tupleWeightingForWord.Item2;
                        }//word to word
                    }//document to document
                }
            }//TermFrequencyWeighting
            if (!_answer.theresError)
            {
                // Flatten the matrix into VectorSpaceModel rows; decimals are narrowed to
                // float (and the term frequency to byte) for storage.
                List<VectorSpaceModel> _vectorSpaceModels = new List<VectorSpaceModel>();
                foreach (Word _word in words)
                {//word to word
                    SortedList<int, WritableTuple<bool?, int?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?, decimal?>> _wordOfDocuments =
                        _wordsDocumentWeighted[_word.word];
                    _vectorSpaceModels.AddRange(_wordOfDocuments.Select(t => new VectorSpaceModel()
                        {
                            documentId = t.Key
                            , wordId = _word.id
                            , weight_Binary = t.Value.Item1
                            , weight_TermFrequency = !t.Value.Item2.HasValue ? (byte?)null : (byte)t.Value.Item2.Value
                            , weight_WeightedTermFrequency = !t.Value.Item3.HasValue ? (float?)null : (float)t.Value.Item3.Value
                            , weight_AugmentedNormalizedTermFrequency = !t.Value.Item4.HasValue ? (float?)null : (float)t.Value.Item4.Value
                            , weight_Logarithmic = !t.Value.Item5.HasValue ? (float?)null : (float)t.Value.Item5.Value
                            , weight_InverseDocumentFrequency = !t.Value.Item6.HasValue ? (float?)null : (float)t.Value.Item6.Value
                            , weight_TF_IDF = !t.Value.Item7.HasValue ? (float?)null : (float)t.Value.Item7.Value
                            , weight_ProbabilisticInverseFrequency = !t.Value.Item8.HasValue ? (float?)null : (float)t.Value.Item8.Value
                            , weight_NormalFunction = !t.Value.Item9.HasValue ? (float?)null : (float)t.Value.Item9.Value
                            , weight_GF_IDF = !t.Value.Item10.HasValue ? (float?)null : (float)t.Value.Item10.Value
                            , weight_Entropy = !t.Value.Item11.HasValue ? (float?)null : (float)t.Value.Item11.Value
                            , weight_WeightedInverseFrequency = !t.Value.Item12.HasValue ? (float?)null : (float)t.Value.Item12.Value
                        }
                    )
                    .ToList());
                }//word to word
                Return<int> _answerAddVectorSpaceModel = this.grpClassDataAccess.AddVectorSpaceModel(_vectorSpaceModels, transaction, nameForVectorSpaceModelTable);
                if (_answerAddVectorSpaceModel.theresError)
                {
                    _answer.theresError = true;
                    _answer.error = _answerAddVectorSpaceModel.error;
                }
            }
        }
        catch (Exception _ex)
        {
            _answer.theresError = true;
            _answer.error = Utility.GetError(_ex, this.GetType());
        }
    }
    return _answer;
}
/// <summary>
/// Makes an ab->abc type model where is registered the probability that the string "ab" would be followed by the string "c".
/// It also returns abc->string.Empty entries carrying the initial-state probabilities (probability that a sentence starts with that string).
/// </summary>
/// <param name="countSubprocesses">Maximum number of parallel subprocesses; must be > 0.</param>
/// <param name="sizeBefore">Size of the initial string (ab); must be > 0.</param>
/// <param name="sizeAfter">Size of the following string (c); must be > 0.</param>
/// <param name="text">The text to be used to learn the model.</param>
/// <returns>(ab, c, probability) entries. If c == string.Empty it means it's an initial probability.</returns>
public Return<List<LanguageIdentificationModelWrapper>> Model(int countSubprocesses, int sizeBefore, int sizeAfter, List<string> text)
{
    Return<List<LanguageIdentificationModelWrapper>> _answer = new Return<List<LanguageIdentificationModelWrapper>>() { data = new List<LanguageIdentificationModelWrapper>() };
    // Argument validation: each failure is reported through the Return wrapper, not thrown.
    if (countSubprocesses <= 0)
    {
        _answer.theresError = true;
        _answer.error = Utility.GetError(new ArgumentException("countSubprocesses >0"), this.GetType());
    }
    else if (sizeBefore <= 0)
    {
        _answer.theresError = true;
        _answer.error = Utility.GetError(new ArgumentException("sizeBefore >0"), this.GetType());
    }
    else if (sizeAfter <= 0)
    {
        _answer.theresError = true;
        _answer.error = Utility.GetError(new ArgumentException("sizeAfter >0"), this.GetType());
    }
    else if (text == null)
    {
        _answer.theresError = true;
        _answer.error = Utility.GetError(new ArgumentNullException("text"), this.GetType());
    }
    else
    {
        try
        {
            Return<List<string>> _answerExtractSentences = MLUtility.ExtractSentences(text);
            if (_answerExtractSentences.theresError)
            {
                _answer.theresError = true;
                _answer.error = _answerExtractSentences.error;
            }
            else
            {//we have sentences
                // Collect sentences (skipping blank / letter-free ones) and their words,
                // stopping once the word cap is exceeded.
                List<string> _sentences = new List<string>();
                List<string> _words = new List<string>();
                foreach (string _sentence in _answerExtractSentences.data.Where(s => !string.IsNullOrWhiteSpace(s) && s.Where(c => char.IsLetter(c)).Any()))
                {//sentence to sentence
                    if (_words.Count > MAXIMUM_NUMBER_WORDS_ANALYZE)
                        break;
                    else
                    {
                        _sentences.Add(_sentence);
                        Return<List<string>> _answerGetWords = MLUtility.GetWords(new List<string>() { _sentence });
                        if (_answerGetWords.theresError)
                        {
                            _answer.theresError = true;
                            _answer.error = _answerGetWords.error;
                        }
                        else
                            _words.AddRange(_answerGetWords.data);
                    }
                    if (_answer.theresError)
                        break;
                }//sentence to sentence
                if (!_answer.theresError)
                    if (_words.Count < MINIMUM_NUMBER_WORDS_ANALYZE)
                    {
                        string _message = string.Format("There's a minimum limit of {0} character to create the model. You just have {1}", Convert.ToString(MINIMUM_NUMBER_WORDS_ANALYZE), Convert.ToString(_words.Count));
                        _answer.theresError = true;
                        _answer.error = Utility.GetError(new Exception(_message), this.GetType());
                    }
                if (!_answer.theresError)
                {//we have words
                    string _entireText = string.Join(GetSeparator(sizeBefore, sizeAfter), _words);
                    List<string> _initialStrings = new List<string>() { string.Empty };
                    List<string> _finalString = new List<string>() { string.Empty };
                    //normalizing
                    characters = characters.Select(c => c.ToLower()).Distinct().ToList();
                    _entireText = _entireText.ToLower().Trim();
                    // First strip the replacement characters, then map originals onto them.
                    charactersChanges.ForEach(t => _entireText = _entireText.Replace(t.Item2, string.Empty));
                    charactersChanges.ForEach(t => _entireText = _entireText.Replace(t.Item1, t.Item2));
                    // Expand string.Empty into every combination of "characters" of the
                    // requested lengths (cartesian growth, one character per iteration).
                    for (int i = 0; i < sizeBefore; i++)
                    {
                        List<string> _addedStrings = _initialStrings.SelectMany(ci => AddCharactersToTheRight(ci)).ToList();
                        _initialStrings = _addedStrings;
                    }
                    for (int i = 0; i < sizeAfter; i++)
                    {
                        List<string> _addedStrings = _finalString.SelectMany(ci => AddCharactersToTheRight(ci)).ToList();
                        _finalString = _addedStrings;
                    }
                    _initialStrings = _initialStrings.OrderBy(c => c).ToList();
                    _finalString = _finalString.OrderBy(c => c).ToList();
                    Return<List<WritableTuple<string, int>>> _answerCountExistingCombinations = CountExistingCombinations(countSubprocesses, _entireText, _initialStrings, _finalString);
                    if (_answerCountExistingCombinations.theresError)
                    {
                        _answer.theresError = true;
                        _answer.error = _answerCountExistingCombinations.error;
                    }
                    else
                    {//let us count the probabilities of all combinations
                        Return<List<WritableTuple<string, int>>> _answerCountExistingStrings = CountExistingStrings(countSubprocesses, _entireText, _initialStrings);
                        if (_answerCountExistingStrings.theresError)
                        {
                            _answer.theresError = true;
                            _answer.error = _answerCountExistingStrings.error;
                        }
                        else
                        {
                            Stopwatch _stopwatchCalculateProbabilitiesChangeState = new Stopwatch();
                            _stopwatchCalculateProbabilitiesChangeState.Start();
                            // Re-index the raw counts by string for O(log n) lookup in the
                            // probability calculation.
                            SafeSortedList<string, int> _countCombinedStrings = new SafeSortedList<string, int>()
                                , _countInitialStrings = new SafeSortedList<string, int>();
                            _answerCountExistingCombinations.data.ForEach(t => _countCombinedStrings.Add(t.Item1, t.Item2));
                            _answerCountExistingStrings.data.ForEach(t => _countInitialStrings.Add(t.Item1, t.Item2));
                            Return<List<LanguageIdentificationModelWrapper>> _answerCalculateProbabilitiesChangeState = CalculateProbabilitiesChangeStateNew(countSubprocesses, _initialStrings, _finalString
                                , _countCombinedStrings, _countInitialStrings);
                            _stopwatchCalculateProbabilitiesChangeState.Stop();
                            if (_answerCalculateProbabilitiesChangeState.theresError)
                            {
                                _answer.theresError = true;
                                _answer.error = _answerCalculateProbabilitiesChangeState.error;
                            }
                            else
                                _answer.data.AddRange(_answerCalculateProbabilitiesChangeState.data);
                        }
                    }//let us count the probabilities of all combinations
                    if (!_answer.theresError)
                    {//save initial probabilities
                        Return<List<WritableTuple<string, int>>> _answerCountBeginningOfSentence = CountBeginningOfSentence(countSubprocesses, _sentences, _initialStrings);
                        if (_answerCountBeginningOfSentence.theresError)
                        {
                            _answer.theresError = true;
                            _answer.error = _answerCountBeginningOfSentence.error;
                        }
                        else
                        {
                            Return<List<LanguageIdentificationModelWrapper>> _answerCalculateProbabilitiesInitialState = CalculateProbabilitiesInitialState(countSubprocesses, _sentences, _initialStrings, _answerCountBeginningOfSentence.data);
                            if (_answerCalculateProbabilitiesInitialState.theresError)
                            {
                                _answer.theresError = true;
                                _answer.error = _answerCalculateProbabilitiesInitialState.error;
                            }
                            else
                                _answer.data.AddRange(_answerCalculateProbabilitiesInitialState.data);
                        }
                    }//save initial probabilities
                }//we have words
            }//we have sentences
        }
        catch (Exception _ex)
        {
            _answer.theresError = true;
            _answer.error = Utility.GetError(_ex, this.GetType());
        }
    }
    return _answer;
}
/// <summary>
/// Weights the text using the model passed as parameter: finds the first initial
/// string with a non-zero probability, then repeatedly multiplies by the transition
/// probability of each (initial string + final string) window, renormalizing the
/// mantissa/exponent pair after every step to avoid decimal underflow.
/// </summary>
/// <param name="text">Text to weight; must not be null/whitespace.</param>
/// <param name="languageModel">The language's model, keyed by initial/combined string.</param>
/// <param name="sizeInitialString">The size of the initial string.</param>
/// <param name="sizeFinalString">The size of the final string (when found).</param>
/// <returns>Weight (as mantissa + base-10 exponent), and the number of times a model match occurred.</returns>
private Return<WritableTuple<FloatingPointBigScale, int>> WeightLanguage(string text, SafeSortedList<string, LanguageIdentificationModelWrapper> languageModel, int sizeInitialString, int sizeFinalString)
{
    Return<WritableTuple<FloatingPointBigScale, int>> _answer = new Return<WritableTuple<FloatingPointBigScale, int>>()
    { data = new WritableTuple<FloatingPointBigScale, int>(new FloatingPointBigScale(0, 0), 0) };
    if (string.IsNullOrWhiteSpace(text))
    {
        _answer.theresError = true;
        _answer.error = Utility.GetError(new ArgumentNullException("text"), this.GetType());
    }
    else if (languageModel == null)
    {
        _answer.theresError = true;
        _answer.error = Utility.GetError(new ArgumentNullException("languageModel"), this.GetType());
    }
    else
    {
        try
        {
            Stopwatch _stopwatch = new Stopwatch();
            _stopwatch.Start();
            int _indexSearchString = 0;
            {//first, we search probability of initial string
                // Slide forward until some window of sizeInitialString has a non-zero
                // probability in the model; that seeds the running weight.
                while (_answer.data.Item1.numericalBase == 0 && (text.Length >= _indexSearchString + sizeInitialString))
                {
                    string _initialString = text.Substring(_indexSearchString, sizeInitialString);
                    LanguageIdentificationModelWrapper _found = languageModel.ContainsKey(_initialString) ? languageModel[_initialString] : null;
                    if (_found != null && _found.probability != 0)
                    {
                        _answer.data.Item1.numericalBase = _found.probability;
                        _answer.data.Item2++;
                    }
                    else
                        _indexSearchString++;
                }
            }//first, we search probability of initial string
            _stopwatch.Stop();
            // NOTE(review): _timeInitialString / _timeCombinedString are computed but never
            // used — presumably leftovers from profiling; confirm before removing.
            string _timeInitialString = string.Format("{0} ticks", _stopwatch.Elapsed.Ticks.ToString());
            _stopwatch.Reset();
            _stopwatch.Start();
            if (_answer.data.Item1.numericalBase > 0)
            {
                // Normalize so the mantissa stays in (0.1, 1]; each *10 shifts one
                // decimal place into the exponent.
                while (_answer.data.Item1.numericalBase <= 0.1M)
                {
                    _answer.data.Item1.numericalBase *= 10;
                    _answer.data.Item1.exponent += -1;
                }
                // Multiply in the transition probability for every remaining window;
                // unknown combinations fall back to the SOFT_PROBABILITY_0 smoothing factor.
                while (text.Length >= _indexSearchString + sizeInitialString + sizeFinalString)
                {
                    string _initialString = text.Substring(_indexSearchString, sizeInitialString), _finalString = text.Substring(_indexSearchString + sizeInitialString, sizeFinalString)
                        , _composedString = string.Format("{0}{1}", _initialString, _finalString);
                    LanguageIdentificationModelWrapper _found = languageModel.ContainsKey(_composedString) ? languageModel[_composedString] : null;
                    decimal _factor = SOFT_PROBABILITY_0;
                    if (_found != null && _found.probability > 0)
                    {//we've found a probability not equal 0
                        _factor = _found.probability;
                        _answer.data.Item2++;
                    }//we've found a probability not equal 0
                    _answer.data.Item1.numericalBase *= _factor;
                    while (_answer.data.Item1.numericalBase <= 0.1M)
                    {
                        _answer.data.Item1.numericalBase *= 10;
                        _answer.data.Item1.exponent += -1;
                    }
                    _indexSearchString++;
                }
            }
            _stopwatch.Stop();
            string _timeCombinedString = string.Format("{0} ticks", _stopwatch.Elapsed.Ticks.ToString());
        }
        catch (Exception _ex)
        {
            _answer.theresError = true;
            _answer.error = Utility.GetError(_ex, this.GetType());
        }
    }
    return _answer;
}
/// <summary>
/// Computes the change-state probabilities for every (initial string, final string)
/// combination: the combinations are partitioned into fragments, wrapped together
/// with the shared count tables, and handed to a Processor that runs
/// CalculateProbabilitiesChangeStateDirtyHands either synchronously (one worker)
/// or in parallel.
/// </summary>
/// <param name="countSubprocesses">Requested worker count; capped at the number of partitions.</param>
/// <param name="initialStrings">All candidate initial strings.</param>
/// <param name="finalStrings">All candidate final strings.</param>
/// <param name="countCombinedStrings">Occurrence counts of combined (initial+final) strings, shared across workers.</param>
/// <param name="countInitialStrings">Occurrence counts of initial strings, shared across workers.</param>
/// <returns>The processed model fragments, or the first error that occurred.</returns>
private Return<List<LanguageIdentificationModelWrapper>> CalculateProbabilitiesChangeStateNew(int countSubprocesses, List<string> initialStrings, List<string> finalStrings
    , SafeSortedList<string, int> countCombinedStrings, SafeSortedList<string, int> countInitialStrings)
{
    Return<List<LanguageIdentificationModelWrapper>> _answer = new Return<List<LanguageIdentificationModelWrapper>>() { data = new List<LanguageIdentificationModelWrapper>() };
    {
        try
        {
            // Cartesian product of initial x final strings, every entry seeded with probability 1.
            List<LanguageIdentificationModelWrapper> _allCombinations = (from _initialString in initialStrings
                                                                        from _finalString in finalStrings
                                                                        select new LanguageIdentificationModelWrapper(_initialString, _finalString, 1)
                )
                .ToList();
            List<List<LanguageIdentificationModelWrapper>> _partitions = _allCombinations.PartitionAccordingSizeOfPartitions(SIZE_PARTITION_STRINGS);
            // One work item per partition; the count tables are shared (same references) across all items.
            List<ProbabilityChangeStateWrapper> _wrappers = _partitions.Select(p => new ProbabilityChangeStateWrapper()
                {
                    countInitialStrings = countInitialStrings
                    , countCombinedStrings = countCombinedStrings
                    , fragmentsOfModel = p
                }
            ).ToList();
            // Never spin up more workers than there are partitions.
            int _countSubprocesses = _wrappers.Count < countSubprocesses ? _wrappers.Count : countSubprocesses;
            errorsMultithreadedProcess.Clear();
            Return<object> _answerProcess = new Return<object>();
            using (Processor<ProbabilityChangeStateWrapper> _processor = new Processor<ProbabilityChangeStateWrapper>(_countSubprocesses, PAUSE_MILISECONDS_PROCESSOR))
            {
                // Subscribe before processing, unsubscribe after, so handlers don't leak.
                _processor.ErrorProcessEntity += processorCalculateProbabilitiesChangeState_ErrorProcessEntity;
                _processor.Progress += processorCalculateProbabilitiesChangeState_Progress;
                _answerProcess = _countSubprocesses == 1 ?
                    _processor.ProcessSynchronously(_wrappers, CalculateProbabilitiesChangeStateDirtyHands) :
                    _processor.Process(_wrappers, CalculateProbabilitiesChangeStateDirtyHands);
                _processor.Progress -= processorCalculateProbabilitiesChangeState_Progress;
                _processor.ErrorProcessEntity -= processorCalculateProbabilitiesChangeState_ErrorProcessEntity;
            }
            if (_answerProcess.theresError)
            {
                _answer.theresError = true;
                _answer.error = _answerProcess.error;
            }
            else if (errorsMultithreadedProcess.Any())
            {
                // Worker-reported errors surface here; only the first one is propagated.
                _answer.theresError = true;
                _answer.error = errorsMultithreadedProcess.First().Item2;
            }
            else
                _answer.data = _wrappers.SelectMany(c => c.fragmentsOfModel).ToList();
        }
        catch (Exception _ex)
        {
            _answer.theresError = true;
            _answer.error = Utility.GetError(_ex, this.GetType());
        }
    }
    return _answer;
}