// not used function public RegExpProcessingResultsCollection <ColRegExpStatisticsProcessingResult> CalcScores(DataTable table, IEnumerable <DataRow> documentRows, ColRegExpStatisticsProcessingParams param) { var columnIndexID = table.Columns.IndexOf("ED_ENC_NUM"); var columnIndexText = table.Columns.IndexOf("NOTE_TEXT"); if (columnIndexID == -1 || columnIndexText == -1) { throw new Exception("Cannot find source columns"); } /////////////////////////////////////////////////////////////////////////////// var docsCount = table.Rows.Count; Parallel_CalcScores(DatabaseHelper.AsDataRecordEnumerable(documentRows), docsCount, columnIndexID, columnIndexText, param); var matchResults = new RegExpProcessingResultsCollection <ColRegExpStatisticsProcessingResult>(_listRegExps.Select(x => new ColRegExpStatisticsProcessingResult { ID = x.ID, TotalMatches = x.TotalMatches, TotalDocuments = x.TotalDocuments })); return(matchResults); }
/// <summary>
/// Copies per-expression match statistics into the matching rows of the ColRegExp table.
/// </summary>
/// <param name="views">Provides access to the main form and its dataset.</param>
/// <param name="matches">Statistics entries; each is paired with the ColRegExp row that has the same ID.</param>
protected void ColRegExp_WriteScores(ViewsManager views, RegExpProcessingResultsCollection<ColRegExpStatisticsProcessingResult> matches)
{
    var dataset = views.MainForm.datasetMain;

    // Documents whose score is set and above zero; held as double so the ratio below is floating-point.
    var positiveTotal = (double)dataset.Documents.Count(doc => !doc.IsScoreNull() && doc.Score > 0);

    // Rows flagged as deleted are left out of the pairing.
    var liveRows = dataset.ColRegExp
        .Cast<MainDataSet.ColRegExpRow>()
        .Where(r => r.RowState != DataRowState.Deleted);

    var pairs = liveRows.Join(
        matches.Items,
        r => r.ID,
        m => m.ID,
        (r, m) => new { RegExpRow = r, Match = m });

    foreach (var pair in pairs)
    {
        var target = pair.RegExpRow;
        var stats = pair.Match;

        target["TotalDocuments"] = stats.TotalDocuments;
        target["TotalMatches"] = stats.TotalMatches;
        target["PosDocuments"] = stats.PostitiveDocuments;

        // Share of positive documents hit by this expression, in percent; 0 when there are no positive documents.
        var positiveShare = positiveTotal > 0
            ? (stats.PostitiveDocuments / positiveTotal) * 100D
            : 0;

        target["PercentPosDocuments"] = positiveShare;
    }
}
/// <summary>
/// Scores every record of the Documents table and serializes both the per-document
/// scores and the per-expression match totals to the output files named in <paramref name="param"/>.
/// </summary>
/// <param name="param">Database location/password, working folder, synergies file name and output file names.</param>
public void CalcScores(RegExpScoreProcessingParams param)
{
    var synergiesPath = Path.Combine(param.WorkingFolder, param.SynergiesFileName);
    var synergies = LoadSynergies(synergiesPath);

    using (var docsConnection = DatabaseHelper.CreateConnection(param.DocumentsDatabaseFilePath, param.Password))
    {
        docsConnection.Open();

        var noteColumnCount = DatabaseHelper.GetNoteColumnCount(docsConnection, "Documents", "NOTE_TEXT");
        var docsCount = DatabaseHelper.GetRowsCount(docsConnection, "Documents");

        // Select list layout: ED_ENC_NUM = 0, Category = 1, NOTE_TEXT = 2, NOTE_TEXT1 = 3, ...
        const int firstNoteOrdinal = 2;
        var selectedColumns = new List<string> { "ED_ENC_NUM", "Category", "NOTE_TEXT" };
        var noteColumnIndexList = new List<int> { firstNoteOrdinal };

        for (var suffix = 1; suffix < noteColumnCount; suffix++)
        {
            selectedColumns.Add("NOTE_TEXT" + suffix.ToString());
            noteColumnIndexList.Add(suffix + firstNoteOrdinal);
        }

        var query = "SELECT " + string.Join(", ", selectedColumns.ToArray()) + " FROM Documents";
        var documentRecords = DatabaseHelper.GetDataRecords(docsConnection, query);

        // Per-document scores; the ID is read from ordinal 0 (ED_ENC_NUM).
        var results = Parallel_CalcScores(documentRecords, docsCount, 0, noteColumnIndexList, synergies);
        results.Serialize(param.GetFullPath(param.ScoreOutputFileName));

        // Per-expression totals, read from _listRegExps after the scoring pass.
        var perExpression = _listRegExps.Select(x => new RegExpMatchProcessingResult
        {
            RegExpID = x.ID,
            TotalMatches = x.TotalMatches,
            TotalDocuments = x.TotalDocuments,
            TotalRecords = x.TotalRecords,
            CategorizedRecords = x.CategorizedRecords
        });
        var matchResults = new RegExpProcessingResultsCollection<RegExpMatchProcessingResult>(perExpression);
        matchResults.Serialize(param.GetFullPath(param.MatchesOutputFileName));
    }
}
/// <summary>
/// Runs value extraction over all supplied records in parallel and collects the results.
/// </summary>
/// <param name="enumerableDocs">Records to process.</param>
/// <param name="docsCount">Expected number of records; used only to scale progress reporting.</param>
/// <param name="columnIndexID">Ordinal of the document ID column (read with GetDouble).</param>
/// <param name="columnIndexText">Ordinal of the text column, forwarded to ExtractDocumentValues.</param>
/// <param name="param">Extraction parameters; a non-empty DocumentsList restricts processing to the listed IDs.</param>
/// <param name="scripts">Extraction scripts keyed by int, or null; forwarded to ExtractDocumentValues.</param>
/// <returns>The collection that ExtractDocumentValues added results to.</returns>
protected RegExpProcessingResultsCollection<ColRegExpExtractProcessingResult> Parallel_ExtractValues(IEnumerable<IDataRecord> enumerableDocs, long docsCount, int columnIndexID, int columnIndexText, ColRegExpExtractProcessingParams param, Dictionary<int, CSScriptManager> scripts)
{
    // NOTE(review): results is filled from several threads at once; this relies on
    // RegExpProcessingResultsCollection.Add being thread-safe - confirm.
    var results = new RegExpProcessingResultsCollection<ColRegExpExtractProcessingResult>();

    long progressStep = _listRegExps.Count;
    long progressMax = docsCount * progressStep;
    long progressValue = 0;

    Parallel.ForEach(enumerableDocs, (record, state) =>
    {
        try
        {
            // Records without an ID cannot be attributed to a document.
            if (record.IsDBNull(columnIndexID))
            {
                return;
            }

            var documentID = record.GetDouble(columnIndexID);
            if (param.DocumentsList.Any() && !param.DocumentsList.Contains(documentID))
            {
                return;
            }

            ExtractDocumentValues(documentID, columnIndexText, record, scripts, param.ColumnID, results, param);

            var threadProgressValue = Interlocked.Add(ref progressValue, progressStep);

            // progressMax is 0 when there are no regular expressions or no documents.
            // Dividing by it would give NaN/Infinity, and casting that to int yields an
            // unspecified value, so report 0 instead.
            var progressPercentage = progressMax > 0
                ? (int)(threadProgressValue / (double)progressMax * 100D)
                : 0;

            // A false return from the logger is treated as a request to stop.
            if (!this.Logger.ReportProgress(progressPercentage, threadProgressValue))
            {
                state.Stop();
            }
        }
        catch (Exception ex)
        {
            // A failure on one record is logged and does not abort the remaining records.
            Logger.HandleException(ex);
        }
    });

    return results;
}
/// <summary>
/// Calculates a score for every supplied record in parallel and collects the non-null results.
/// </summary>
/// <param name="enumerableDocs">Records to score.</param>
/// <param name="docsCount">Expected number of records; used only to scale progress reporting.</param>
/// <param name="columnIndexID">Ordinal of the document ID column, forwarded to CalcDocumentScore.</param>
/// <param name="columnIndexTextList">Ordinals of the text columns, forwarded to CalcDocumentScore.</param>
/// <param name="synergies">Synergies forwarded to CalcDocumentScore.</param>
/// <returns>One entry per record for which CalcDocumentScore returned a non-null result.</returns>
protected RegExpProcessingResultsCollection<RegExpScoreProcessingResult> Parallel_CalcScores(IEnumerable<IDataRecord> enumerableDocs, long docsCount, int columnIndexID, List<int> columnIndexTextList, List<RegExpSynergy> synergies)
{
    // NOTE(review): results is filled from several threads at once; this relies on
    // RegExpProcessingResultsCollection.Add being thread-safe - confirm.
    var results = new RegExpProcessingResultsCollection<RegExpScoreProcessingResult>();

    long progressStep = _listRegExps.Count;
    long progressMax = docsCount * progressStep;
    long progressValue = 0;

    Parallel.ForEach(enumerableDocs, (record, state) =>
    {
        try
        {
            var scoreResult = CalcDocumentScore(columnIndexID, columnIndexTextList, record, synergies);
            if (scoreResult != null)
            {
                results.Add(scoreResult);
            }

            var threadProgressValue = Interlocked.Add(ref progressValue, progressStep);

            // progressMax is 0 when there are no regular expressions or no documents.
            // Dividing by it would give NaN/Infinity, and casting that to int yields an
            // unspecified value, so report 0 instead.
            var progressPercentage = progressMax > 0
                ? (int)(threadProgressValue / (double)progressMax * 100D)
                : 0;

            // A false return from the logger is treated as a request to stop.
            if (!this.Logger.ReportProgress(progressPercentage, threadProgressValue))
            {
                state.Stop();
            }
        }
        catch (Exception ex)
        {
            // A failure on one record is logged and does not abort the remaining records.
            Logger.HandleException(ex);
        }
    });

    return results;
}
/// <summary>
/// Runs the column-expression statistics pass over the Documents table and serializes
/// the per-expression totals (ID, TotalMatches, TotalDocuments) to the matches output file.
/// </summary>
/// <param name="param">Database location/password, the OnlyPositiveScore filter flag and the output file name.</param>
public void CalcScores(ColRegExpStatisticsProcessingParams param)
{
    using (var docsConnection = DatabaseHelper.CreateConnection(param.DocumentsDatabaseFilePath, param.Password))
    {
        docsConnection.Open();

        // The same "Score > 0" restriction is applied to the row count and to the record query.
        string rowFilter = param.OnlyPositiveScore ? "Score > 0" : null;
        var docsCount = DatabaseHelper.GetRowsCount(docsConnection, "Documents", rowFilter);

        string query = "SELECT ED_ENC_NUM, NOTE_TEXT, Score FROM Documents";
        if (rowFilter != null)
        {
            query += " WHERE " + rowFilter;
        }

        var documentRecords = DatabaseHelper.GetDataRecords(docsConnection, query);

        // NOTE(review): the SELECT places ED_ENC_NUM at ordinal 0, NOTE_TEXT at 1 and Score at 2;
        // the ordinals 1 and 2 go to a Parallel_CalcScores overload that is not visible here -
        // confirm they match that overload's parameter meaning.
        Parallel_CalcScores(documentRecords, docsCount, 1, 2, param);

        // Totals are read from _listRegExps after the pass has run.
        var perExpression = _listRegExps.Select(x => new ColRegExpStatisticsProcessingResult
        {
            ID = x.ID,
            TotalMatches = x.TotalMatches,
            TotalDocuments = x.TotalDocuments
        });
        var matchResults = new RegExpProcessingResultsCollection<ColRegExpStatisticsProcessingResult>(perExpression);
        matchResults.Serialize(param.GetFullPath(param.MatchesOutputFileName));
    }
}
/// <summary>
/// Scores the given in-memory rows and returns the per-expression match totals.
/// </summary>
/// <param name="table">Table that defines the column layout; must contain ED_ENC_NUM and at least one column whose name starts with NOTE_TEXT.</param>
/// <param name="documentRows">Rows to score.</param>
/// <param name="synergies">Synergies forwarded to the scoring pass.</param>
/// <returns>One entry per regular expression with RegExpID, TotalMatches and TotalDocuments.</returns>
/// <exception cref="Exception">The ID column or every NOTE_TEXT column is missing.</exception>
public RegExpProcessingResultsCollection<RegExpMatchProcessingResult> CalcScores(DataTable table, IEnumerable<DataRow> documentRows, List<RegExpSynergy> synergies)
{
    var columnIndexID = table.Columns.IndexOf("ED_ENC_NUM");

    // Ordinals of NOTE_TEXT, NOTE_TEXT1, ... in table order.
    var columnIndexTextList = Enumerable.Range(0, table.Columns.Count)
        .Where(ordinal => table.Columns[ordinal].ColumnName.StartsWith("NOTE_TEXT"))
        .ToList();

    var idMissing = columnIndexID == -1;
    var textMissing = columnIndexTextList.Count < 1;
    if (idMissing || textMissing)
    {
        throw new Exception("Cannot find source columns");
    }

    var docsCount = table.Rows.Count;
    var records = DatabaseHelper.AsDataRecordEnumerable(documentRows);

    // The per-document scores returned by the pass are not used here; only the totals
    // read from _listRegExps below are returned.
    Parallel_CalcScores(records, docsCount, columnIndexID, columnIndexTextList, synergies);

    var perExpression = _listRegExps.Select(x => new RegExpMatchProcessingResult
    {
        RegExpID = x.ID,
        TotalMatches = x.TotalMatches,
        TotalDocuments = x.TotalDocuments
    });

    return new RegExpProcessingResultsCollection<RegExpMatchProcessingResult>(perExpression);
}
/// <summary>
/// Writes extracted values into the Documents rows of the dataset and then persists them:
/// ordinary dynamic columns through the documents adapter, NOTE_TEXT columns through one
/// update command per extracted value.
/// </summary>
/// <param name="views">Provides access to the main form, its dataset and the documents adapter.</param>
/// <param name="values">Extraction results; each is matched to a dynamic column by ColumnID and to a document by DocumentID.</param>
/// <param name="scriptExtract">Forwarded unchanged to the Extract_Write* helpers.</param>
/// <returns>
/// 0 when nothing matched. Otherwise the number of rows queued for an adapter update when there
/// are any, or else the number of values for which a helper reported a change.
/// </returns>
/// <exception cref="Exception">
/// Column metadata could not be resolved for every column, or a NOTE_TEXT update did not affect exactly one row.
/// </exception>
protected int ColRegExp_WriteExtractedValues(ViewsManager views, RegExpProcessingResultsCollection<ColRegExpExtractProcessingResult> values, bool scriptExtract)
{
    var progressMax = 0;

    // Pair every result with its dynamic column and document row; results with the highest
    // ExtractOptions.Order come first (results without options count as order 0).
    var join = (from extractResult in values.Items
                join rowDynamicColumn in views.MainForm.datasetMain.DynamicColumns on extractResult.ColumnID equals rowDynamicColumn.ID
                join rowDocument in views.MainForm.datasetMain.Documents on extractResult.DocumentID equals rowDocument.ED_ENC_NUM
                orderby extractResult.ExtractOptions != null ? extractResult.ExtractOptions.Order : 0 descending
                select new { extractResult, rowDocument, rowDynamicColumn })
               .ToList();

    if (!join.Any())
    {
        return 0;
    }

    var rowsToUpdate = new List<MainDataSet.DocumentsRow>();

    // Phase 1: write the values into the in-memory rows.
    int progressValue = 0;
    foreach (var j in join)
    {
        try
        {
            var columnName = j.rowDynamicColumn.Title;

            var needUpdate = false;
            switch ((DynamicColumnType)j.rowDynamicColumn.Type)
            {
                case DynamicColumnType.FreeText:
                    needUpdate = Extract_WriteFreeTextValue(j.extractResult, j.rowDocument, columnName, scriptExtract);
                    break;

                case DynamicColumnType.Numeric:
                    needUpdate = Extract_WriteNumericValue(j.extractResult, j.rowDocument, columnName, scriptExtract);
                    break;

                case DynamicColumnType.DateTime:
                    needUpdate = Extract_WriteDateTimeValue(j.extractResult, j.rowDocument, columnName, scriptExtract);
                    break;
            }

            // Only non-NOTE_TEXT columns go through the adapter in phase 2; NOTE_TEXT
            // columns are counted here and written in phase 3.
            if (needUpdate && !j.rowDynamicColumn.Title.StartsWith("NOTE_TEXT"))
            {
                rowsToUpdate.Add(j.rowDocument);
                progressMax++;
            }

            if (needUpdate && j.rowDynamicColumn.Title.StartsWith("NOTE_TEXT"))
            {
                progressMax++;
            }

            j.rowDocument.AcceptChanges();
        }
        catch (Exception ex)
        {
            // A failure on one value is reported and does not abort the remaining values.
            HandleException(ex);
        }

        progressValue++;
        if (progressValue % 50 == 0)
        {
            if (_worker.CancellationPending)
            {
                break;
            }

            var progressPercentage = (int)((double)progressValue / join.Count * 100D);
            ReportProgress(progressPercentage, "Writing values " + progressValue + " of " + join.Count);
        }
    }

    // Phase 2: persist the changed rows through the adapter.
    progressValue = 0;
    if (rowsToUpdate.Count > 0)
    {
        progressMax = rowsToUpdate.Count;
        progressValue = 0;

        ReportProgress(0);

        var columnsToUpdate = join.Where(x => !x.rowDynamicColumn.Title.StartsWith("NOTE_TEXT"))
                                  .Select(x => x.rowDynamicColumn.Title)
                                  .Distinct()
                                  .ToList();

        var columnsInfo = views.MainForm.adapterDocuments.GetColumnsInfo(columnsToUpdate);
        if (columnsToUpdate.Count != columnsInfo.Count)
        {
            throw new Exception("Failed to create update command");
        }

        if (columnsInfo.Count != 0)
        {
            var updateCommand = views.MainForm.adapterDocuments.CreateUpdateColumnsCommand(columnsInfo);

            foreach (var row in rowsToUpdate)
            {
                views.MainForm.adapterDocuments.Update(row, columnsInfo, updateCommand);

                progressValue++;
                if (progressValue % 50 == 0)
                {
                    if (_worker.CancellationPending)
                    {
                        break;
                    }

                    var progressPercentage = (int)((double)progressValue / progressMax * 100D);
                    ReportProgress(progressPercentage, "Writing values " + progressValue + " of " + join.Count);
                }
            }

            views.MainForm.adapterDocuments.Fill();
        }
    }

    // Phase 3: NOTE_TEXT columns, one update command per extracted value.
    var noteColumnsToUpdate = join.Where(x => x.rowDynamicColumn.Title.StartsWith("NOTE_TEXT")).Distinct().ToList();

    foreach (var row in noteColumnsToUpdate)
    {
        List<ColumnInfo> noteColumnsInfo = new List<ColumnInfo>();

        ColumnInfo colInfo = new ColumnInfo();
        colInfo.Name = row.rowDynamicColumn.Title;
        // The column is described as a string column by taking the runtime type of its name.
        colInfo.Type = colInfo.Name.GetType();
        noteColumnsInfo.Add(colInfo);

        views.MainForm.adapterDocuments.StartBatchQuery();

        var updateCommand = views.MainForm.adapterDocuments.CreateUpdateColumnsCommand(noteColumnsInfo);
        updateCommand.Parameters[0].Value = row.extractResult.Value;
        updateCommand.Parameters[1].Value = row.extractResult.DocumentID;

        var count = updateCommand.ExecuteNonQuery();
        if (count != 1)
        {
            throw new Exception(String.Format("Command updated {0} of expected 1 rows", count));
        }

        // NOTE(review): StartBatchQuery is called both before and after the command; its
        // semantics are not visible here - confirm the second call is intended.
        views.MainForm.adapterDocuments.StartBatchQuery();

        progressValue++;
        if (progressValue % 50 == 0)
        {
            if (_worker.CancellationPending)
            {
                break;
            }

            // progressMax can be 0 here (no value reported a change), and progressValue carries
            // on from phase 2, so it can exceed progressMax. Guard the division and cap the
            // result so the reported percentage stays within 0..100.
            var progressPercentage = progressMax > 0
                ? Math.Min(100, (int)((double)progressValue / progressMax * 100D))
                : 0;
            ReportProgress(progressPercentage, "Writing values " + progressValue + " of " + join.Count);
        }
    }

    return progressMax;
}
/// <summary>
/// Writes values produced by the Python extraction into the documents. NOTE_TEXT columns are
/// updated directly in the database; all other columns are written through
/// PythonExtract_WriteFreeTextValue.
/// </summary>
/// <param name="views">Provides access to the main form, its dataset and the documents adapter.</param>
/// <param name="values">Extraction results; each is matched to a dynamic column by ColumnID and to a document by DocumentID.</param>
/// <returns>Always 0: this method keeps no count of written values.</returns>
protected int ColRegExp_WritePythonExtractedValues(ViewsManager views, RegExpProcessingResultsCollection<RegExpPythonExtractSingleProcessingResult> values)
{
    var join = (from extractResult in values.Items
                join rowDynamicColumn in views.MainForm.datasetMain.DynamicColumns on extractResult.ColumnID equals rowDynamicColumn.ID
                join rowDocument in views.MainForm.datasetMain.Documents on extractResult.DocumentID equals rowDocument.ED_ENC_NUM
                select new { extractResult, rowDocument, rowDynamicColumn })
               .ToList();

    if (!join.Any())
    {
        return 0;
    }

    int progressValue = 0;
    foreach (var j in join)
    {
        try
        {
            var columnName = j.rowDynamicColumn.Title;

            if (columnName.StartsWith("NOTE_TEXT"))
            {
                // The stored value is the JSON form of (deserialized result, source note column name).
                var newValue = JsonConvert.SerializeObject(new Tuple<object, string>(JsonConvert.DeserializeObject(j.extractResult.Result), j.extractResult.NoteTextColumnName));

                views.MainForm.adapterDocuments.StartBatchQuery();

                var connection = views.MainForm.adapterDocuments.Connection;

                // Identifiers cannot be parameters, so table and column names are formatted in
                // (bracket-quoted). The document ID is passed as a parameter so it is neither
                // formatted with the current culture nor rounded when converted to text.
                var cmdText = String.Format("UPDATE [{0}] SET [{1}] = @NewValue WHERE [ED_ENC_NUM] = @DocumentID", "Documents", columnName);

                using (var cmd = new OleDbCommand(cmdText, connection))
                {
                    // OleDb binds parameters by position, so the order of these calls must
                    // follow the order of the placeholders in the command text.
                    cmd.Parameters.AddWithValue("@NewValue", newValue);
                    cmd.Parameters.AddWithValue("@DocumentID", j.extractResult.DocumentID);

                    cmd.ExecuteNonQuery();
                }
            }
            else
            {
                PythonExtract_WriteFreeTextValue(j.extractResult, j.rowDocument, columnName);
            }
        }
        catch (Exception ex)
        {
            // A failure on one value is reported and does not abort the remaining values.
            HandleException(ex);
        }

        // Accept before the cancellation check so the row handled in this iteration is not
        // left in a modified state when the loop is cancelled.
        j.rowDocument.AcceptChanges();

        progressValue++;
        if (progressValue % 20 == 0)
        {
            if (_worker.CancellationPending)
            {
                break;
            }

            // This step reports the range 50..100 percent.
            var progressPercentage = (int)((double)progressValue / join.Count * 50D);
            ReportProgress(progressPercentage + 50, "Writing values " + progressValue + " of " + join.Count);
        }
    }

    ReportProgress(100);

    return 0;
}
/// <summary>
/// Writes match totals into the RegExp table, then writes document scores (and categories)
/// into the Documents rows and persists them with update commands.
/// </summary>
/// <param name="views">Provides access to the main form, its dataset and the documents adapter.</param>
/// <param name="matches">Per-expression totals, paired with RegExp rows by ID.</param>
/// <param name="scores">Per-document scores, paired with Documents rows by ED_ENC_NUM.</param>
protected void RegExp_WriteScores(ViewsManager views, RegExpProcessingResultsCollection<RegExpMatchProcessingResult> matches, RegExpProcessingResultsCollection<RegExpScoreProcessingResult> scores)
{
    var regExpJoin = from row in views.MainForm.datasetMain.RegExp.Cast<MainDataSet.RegExpRow>()
                                      .Where(x => x.RowState != DataRowState.Deleted)
                     join match in matches.Items on row.ID equals match.RegExpID
                     select new { RegExpRow = row, Match = match };

    foreach (var item in regExpJoin)
    {
        item.RegExpRow["TotalRecords"] = item.Match.TotalRecords;

        // Sum of the counts over all entries of CategorizedRecords.
        int categorized = 0;
        foreach (var key in item.Match.CategorizedRecords.Keys)
        {
            categorized += item.Match.CategorizedRecords[key];
        }

        item.RegExpRow["TotalCategorized"] = categorized;
        item.RegExpRow["CategorizedRecords"] = item.Match.CategorizedRecords;
        item.RegExpRow["TotalDocuments"] = item.Match.TotalDocuments;
        item.RegExpRow["TotalMatches"] = item.Match.TotalMatches;
    }

    var docJoin = (from row in views.MainForm.datasetMain.Documents.Cast<MainDataSet.DocumentsRow>()
                   join score in scores.Items on row.ED_ENC_NUM equals score.DocumentID
                   select new { DocumentRow = row, Scores = score })
                  .ToList();

    if (docJoin.Count == 0)
    {
        return;
    }

    // The number of score values to write is limited by the score columns that exist:
    // "Score" holds index 0 and "Score<i>" holds index i. Stop at the first missing column.
    // The schema is checked directly rather than probing the row and catching the exception.
    var noteColumnCount = docJoin[0].Scores.Score.Count();
    var documentColumns = docJoin[0].DocumentRow.Table.Columns;
    for (int i = 1; i < docJoin[0].Scores.Score.Count; i++)
    {
        if (!documentColumns.Contains("Score" + i.ToString()))
        {
            noteColumnCount = i;
            break;
        }
    }

    double progressMax = docJoin.Count;
    var progressValue = 0;

    // Pass 1: write the values into the in-memory rows.
    foreach (var item in docJoin)
    {
        item.DocumentRow["Score"] = item.Scores.Score[0];
        for (int i = 1; i < noteColumnCount; i++)
        {
            item.DocumentRow["Score" + i.ToString()] = item.Scores.Score[i];
        }

        // A CategoryID of -1 means no category is written.
        if (item.Scores.CategoryID != -1)
        {
            item.DocumentRow["Category"] = item.Scores.CategoryID;
        }

        progressValue++;
        if (progressValue % 50 == 0)
        {
            if (_worker.CancellationPending)
            {
                break;
            }

            var progressPercentage = (int)((progressValue / progressMax) * 100d);
            ReportProgress(progressPercentage, "Writing value " + progressValue + " of " + progressMax);
        }
    }

    progressValue = 0;
    ReportProgress(0);

    // Pass 2: persist. Parameter layout used below: the score values first, then
    // (only for the category command) the category, then the document ID.
    var cmdWithCategory = CreateUpdateCommand(views, true, noteColumnCount);
    var cmdNoCategory = CreateUpdateCommand(views, false, noteColumnCount);

    foreach (var item in docJoin)
    {
        if (item.Scores.CategoryID != -1)
        {
            int i = 0;
            for (i = 0; i < noteColumnCount; i++)
            {
                cmdWithCategory.Parameters[i].Value = item.Scores.Score[i];
            }

            cmdWithCategory.Parameters[i].Value = item.Scores.CategoryID;
            cmdWithCategory.Parameters[i + 1].Value = item.Scores.DocumentID;

            cmdWithCategory.ExecuteNonQuery();
        }
        else
        {
            int i = 0;
            for (i = 0; i < noteColumnCount; i++)
            {
                cmdNoCategory.Parameters[i].Value = item.Scores.Score[i];
            }

            cmdNoCategory.Parameters[i].Value = item.Scores.DocumentID;

            cmdNoCategory.ExecuteNonQuery();
        }

        item.DocumentRow.AcceptChanges();

        progressValue++;
        if (progressValue % 50 == 0)
        {
            if (_worker.CancellationPending)
            {
                break;
            }

            var progressPercentage = (int)((progressValue / progressMax) * 100d);
            ReportProgress(progressPercentage, "Updating document row " + progressValue + " of " + progressMax);
        }
    }

    views.MainForm.adapterDocuments.Fill();
}
/// <summary>
/// Collects every filtered match of _regExp in the text columns of one record.
/// </summary>
/// <param name="columnIndexID">Ordinal of the document ID column (read with GetDouble).</param>
/// <param name="columnIndexList">Ordinals of the text columns, visited in list order.</param>
/// <param name="record">Record to inspect.</param>
/// <param name="results">Receives one entry per match: word, document ID, column ID, start and length.</param>
private void CalcDocumentStatisticsSingle(int columnIndexID, List<int> columnIndexList, IDataRecord record, RegExpProcessingResultsCollection<RegExpStatisticsSingleProcessingResult> results)
{
    // A record without an ID contributes nothing.
    if (record.IsDBNull(columnIndexID))
    {
        return;
    }

    foreach (var textOrdinal in columnIndexList)
    {
        // NOTE(review): the first null text column ends processing of the whole record,
        // so later columns are not inspected - confirm this is intended rather than a skip.
        if (record.IsDBNull(textOrdinal))
        {
            return;
        }

        var documentID = record.GetDouble(columnIndexID);
        var docText = record.GetString(textOrdinal);

        var matches = _regExp.GetFilteredMatches(docText);
        if (!matches.Any())
        {
            continue;
        }

        foreach (var match in matches)
        {
            results.Add(new RegExpStatisticsSingleProcessingResult
            {
                Word = match.Value,
                DocumentID = documentID,
                // The stored column ID is the record ordinal minus one.
                ColumnID = textOrdinal - 1,
                Start = match.Index,
                Length = match.Length
            });
        }
    }
}
/// <summary>
/// Collects match statistics for all supplied records in parallel, stopping early once the
/// collected matches exceed 20000 in total or 500 distinct words.
/// </summary>
/// <param name="enumerableDocs">Records to inspect.</param>
/// <param name="docsCount">Expected number of records; used only to scale progress reporting.</param>
/// <param name="columnIndexID">Ordinal of the document ID column.</param>
/// <param name="columnIndexList">Ordinals of the text columns.</param>
/// <returns>The matches collected before the loop finished or was stopped.</returns>
private RegExpProcessingResultsCollection<RegExpStatisticsSingleProcessingResult> Parallel_CalcStatisticsSingle(IEnumerable<IDataRecord> enumerableDocs, long docsCount, int columnIndexID, List<int> columnIndexList)
{
    const int maxTotalMatches = 20000;
    const int maxUniqueMatches = 500;

    // NOTE(review): results is read and filled from several threads at once; this relies on
    // RegExpProcessingResultsCollection supporting concurrent Add and enumeration - confirm.
    var results = new RegExpProcessingResultsCollection<RegExpStatisticsSingleProcessingResult>();

    long progressStep = 1;
    long progressMax = docsCount * progressStep;
    long progressValue = 0;

    Parallel.ForEach(enumerableDocs, (record, state) =>
    {
        try
        {
            // Count once per record instead of twice.
            var totalMatches = results.Items.Count();
            if (totalMatches > maxTotalMatches)
            {
                state.Stop();
                _maxTotalMatchesReached = true;

                return;
            }

            // There cannot be more distinct words than matches, so the distinct count
            // is only computed once the total itself is above the limit.
            if (totalMatches > maxUniqueMatches)
            {
                var uniqueWords = results.Items.Select(x => x.Word).Distinct().Count();
                if (uniqueWords > maxUniqueMatches)
                {
                    state.Stop();
                    _maxUniqueMatchesReached = true;

                    return;
                }
            }

            CalcDocumentStatisticsSingle(columnIndexID, columnIndexList, record, results);

            var threadProgressValue = Interlocked.Add(ref progressValue, progressStep);

            // With docsCount of 0 the division would give Infinity, and casting that to int
            // yields an unspecified value, so report 0 instead.
            var progressPercentage = progressMax > 0
                ? (int)(threadProgressValue / (double)progressMax * 100D)
                : 0;

            // A false return from the logger is treated as a request to stop.
            if (!_logger.ReportProgress(progressPercentage, threadProgressValue))
            {
                state.Stop();
            }
        }
        catch (Exception ex)
        {
            // A failure on one record is logged and does not abort the remaining records.
            _logger.HandleException(ex);
        }
    });

    return results;
}
/// <summary>
/// Extracts values for one document, either with the supplied scripts or, when
/// <paramref name="scripts"/> is null, with the regular expressions that have extraction enabled.
/// </summary>
/// <param name="documentID">ID of the document being processed.</param>
/// <param name="columnIndexText">Ordinal of the text column; a null value there skips the document entirely.</param>
/// <param name="record">Record holding the document text.</param>
/// <param name="scripts">Scripts keyed by int, or null to use regular expressions.</param>
/// <param name="columnID">Not used by this method.</param>
/// <param name="results">Receives every non-null extraction result.</param>
/// <param name="param">Supplies the database path and password for the regular-expression path.</param>
protected void ExtractDocumentValues(double documentID, int columnIndexText, IDataRecord record, Dictionary<int, CSScriptManager> scripts, int columnID, RegExpProcessingResultsCollection<ColRegExpExtractProcessingResult> results, ColRegExpExtractProcessingParams param)
{
    if (record.IsDBNull(columnIndexText))
    {
        return;
    }

    var docText = record.GetString(columnIndexText);

    if (scripts != null)
    {
        // Script path: every script runs against the text taken from the record.
        foreach (var entry in scripts)
        {
            var scripted = ScriptExtractRegExpValues(documentID, docText, entry.Value, entry.Key);
            if (scripted != null)
            {
                results.Add(scripted);
            }
        }

        return;
    }

    // Regular-expression path: the text is fetched from the database per expression, from
    // the note column named in that expression's extract options.
    using (var docsConnection = DatabaseHelper.CreateConnection(param.DocumentsDatabaseFilePath, param.Password))
    {
        docsConnection.Open();

        var extracting = _listRegExps.Where(x => x.ExtractOptions != null && x.ExtractOptions.Extract);
        foreach (var regExp in extracting)
        {
            var noteText = DatabaseHelper.GetNoteText(docsConnection, documentID, regExp.ExtractOptions.NoteTextColumn);

            var extracted = ExtractRegExpValues(regExp, documentID, noteText);
            if (extracted != null)
            {
                results.Add(extracted);
            }
        }
    }
}