예제 #1
0
        /// <summary>
        /// Runs the column regular-expression statistics pass over the supplied rows and
        /// returns the per-expression totals.
        /// </summary>
        /// <remarks>The original source marks this overload as a "not used function".</remarks>
        /// <param name="table">Table whose schema supplies the positions of the "ED_ENC_NUM" and "NOTE_TEXT" columns.</param>
        /// <param name="documentRows">Rows handed to the parallel worker.</param>
        /// <param name="param">Processing parameters forwarded to the parallel worker.</param>
        /// <returns>One entry per item of <c>_listRegExps</c> carrying its ID, TotalMatches and TotalDocuments.</returns>
        /// <exception cref="Exception">Either source column is missing from <paramref name="table"/>.</exception>
        public RegExpProcessingResultsCollection <ColRegExpStatisticsProcessingResult> CalcScores(DataTable table, IEnumerable <DataRow> documentRows, ColRegExpStatisticsProcessingParams param)
        {
            int idColumn   = table.Columns.IndexOf("ED_ENC_NUM");
            int textColumn = table.Columns.IndexOf("NOTE_TEXT");

            // IndexOf reports a missing column as -1.
            if (idColumn < 0 || textColumn < 0)
            {
                throw new Exception("Cannot find source columns");
            }

            var rowTotal = table.Rows.Count;
            var records  = DatabaseHelper.AsDataRecordEnumerable(documentRows);

            // The worker's return value is not used here; totals are read from _listRegExps below.
            Parallel_CalcScores(records, rowTotal, idColumn, textColumn, param);

            var perExpression = _listRegExps.Select(regExp => new ColRegExpStatisticsProcessingResult
            {
                ID             = regExp.ID,
                TotalMatches   = regExp.TotalMatches,
                TotalDocuments = regExp.TotalDocuments
            });

            return new RegExpProcessingResultsCollection <ColRegExpStatisticsProcessingResult>(perExpression);
        }
        /// <summary>
        /// Writes match statistics into the non-deleted ColRegExp rows of the main dataset,
        /// pairing rows and results by ID.
        /// </summary>
        /// <param name="views">Gives access to the main form's dataset.</param>
        /// <param name="matches">Statistics results to copy into the rows.</param>
        protected void ColRegExp_WriteScores(ViewsManager views, RegExpProcessingResultsCollection <ColRegExpStatisticsProcessingResult> matches)
        {
            // Number of documents with a non-null, positive score; used as the percentage denominator.
            double positiveTotal = views.MainForm.datasetMain.Documents.Count(doc => !doc.IsScoreNull() && doc.Score > 0);

            var liveRows = views.MainForm.datasetMain.ColRegExp.Cast <MainDataSet.ColRegExpRow>()
                           .Where(r => r.RowState != DataRowState.Deleted);

            var pairs = liveRows.Join(matches.Items,
                                      r => r.ID,
                                      m => m.ID,
                                      (r, m) => new { RegExpRow = r, Match = m });

            foreach (var pair in pairs)
            {
                var row   = pair.RegExpRow;
                var stats = pair.Match;

                row["TotalDocuments"] = stats.TotalDocuments;
                row["TotalMatches"]   = stats.TotalMatches;
                row["PosDocuments"]   = stats.PostitiveDocuments;

                // Share of positive documents matched by this expression; stays 0 when there are none.
                double positivePercentage = 0;
                if (positiveTotal > 0)
                {
                    positivePercentage = (stats.PostitiveDocuments / positiveTotal) * 100D;
                }

                row["PercentPosDocuments"] = positivePercentage;
            }

            // Persisting through the adapter is disabled in the original source:
            //views.MainForm.adapterColRegExp.Update(views.MainForm.datasetMain.ColRegExp);
            //views.MainForm.adapterColRegExp.Fill(views.MainForm.datasetMain.ColRegExp);
        }
예제 #3
0
        /// <summary>
        /// Scores every row of the "Documents" table and serializes both the per-document
        /// scores and the per-expression match totals to the configured output files.
        /// </summary>
        /// <param name="param">Supplies the working folder, synergies file name, database path/password and output file names.</param>
        public void CalcScores(RegExpScoreProcessingParams param)
        {
            var synergiesPath = Path.Combine(param.WorkingFolder, param.SynergiesFileName);
            var synergies     = LoadSynergies(synergiesPath);

            using (var connection = DatabaseHelper.CreateConnection(param.DocumentsDatabaseFilePath, param.Password))
            {
                connection.Open();

                var noteColumnCount = DatabaseHelper.GetNoteColumnCount(connection, "Documents", "NOTE_TEXT");
                var docsCount       = DatabaseHelper.GetRowsCount(connection, "Documents");

                // Positions in the SELECT list: 0 = ED_ENC_NUM, 1 = Category, 2 = NOTE_TEXT,
                // followed by NOTE_TEXT1, NOTE_TEXT2, ... at 3, 4, ...
                var textColumns = new List <int> { 2 };
                var select      = "SELECT ED_ENC_NUM, Category, NOTE_TEXT";

                for (var extra = 1; extra < noteColumnCount; extra++)
                {
                    select += ", NOTE_TEXT" + extra.ToString();
                    textColumns.Add(extra + 2);
                }

                select += " FROM Documents";

                var records = DatabaseHelper.GetDataRecords(connection, select);

                // Per-document scores.
                var scores = Parallel_CalcScores(records, docsCount, 0, textColumns, synergies);
                scores.Serialize(param.GetFullPath(param.ScoreOutputFileName));

                // Per-expression totals accumulated on _listRegExps during the scoring pass.
                var perExpression = _listRegExps.Select(regExp => new RegExpMatchProcessingResult
                {
                    RegExpID           = regExp.ID,
                    TotalMatches       = regExp.TotalMatches,
                    TotalDocuments     = regExp.TotalDocuments,
                    TotalRecords       = regExp.TotalRecords,
                    CategorizedRecords = regExp.CategorizedRecords
                });

                var matchResults = new RegExpProcessingResultsCollection <RegExpMatchProcessingResult>(perExpression);
                matchResults.Serialize(param.GetFullPath(param.MatchesOutputFileName));
            }
        }
예제 #4
0
        /// <summary>
        /// Extracts values from the supplied document records in parallel and collects them
        /// into a single results collection.
        /// </summary>
        /// <param name="enumerableDocs">Document records to process.</param>
        /// <param name="docsCount">Expected number of documents; only used to scale progress reporting.</param>
        /// <param name="columnIndexID">Ordinal of the document ID column (read with GetDouble).</param>
        /// <param name="columnIndexText">Ordinal of the text column, forwarded to <c>ExtractDocumentValues</c>.</param>
        /// <param name="param">Processing parameters; a non-empty <c>DocumentsList</c> restricts processing to the listed IDs.</param>
        /// <param name="scripts">Forwarded unchanged to <c>ExtractDocumentValues</c>.</param>
        /// <returns>The collection that <c>ExtractDocumentValues</c> added results to.</returns>
        protected RegExpProcessingResultsCollection <ColRegExpExtractProcessingResult> Parallel_ExtractValues(IEnumerable <IDataRecord> enumerableDocs, long docsCount, int columnIndexID, int columnIndexText, ColRegExpExtractProcessingParams param, Dictionary <int, CSScriptManager> scripts)
        {
            var results = new RegExpProcessingResultsCollection <ColRegExpExtractProcessingResult>();

            ///////////////////////////////////////////////////////////////////////////////

            // Progress is counted in "document x expression" units.
            long progressStep  = _listRegExps.Count;
            long progressMax   = docsCount * progressStep;
            long progressValue = 0;

            ///////////////////////////////////////////////////////////////////////////////

            Parallel.ForEach(enumerableDocs, (record, state) =>
            {
                try
                {
                    // Records without an ID are skipped (and do not advance progress).
                    if (record.IsDBNull(columnIndexID))
                    {
                        return;
                    }

                    ///////////////////////////////////////////////////////////////////////////////

                    var documentID = record.GetDouble(columnIndexID);
                    if (param.DocumentsList.Any() && !param.DocumentsList.Contains(documentID))
                    {
                        return;
                    }

                    ///////////////////////////////////////////////////////////////////////////////

                    ExtractDocumentValues(documentID, columnIndexText, record, scripts, param.ColumnID, results, param);

                    ///////////////////////////////////////////////////////////////////////////////

                    var threadProgressValue = Interlocked.Add(ref progressValue, progressStep);

                    // progressMax is 0 when there are no expressions (or docsCount is 0); dividing
                    // would yield NaN/Infinity, whose conversion to int is unspecified.
                    var progressPercentage = progressMax > 0
                                             ? (int)(threadProgressValue / (double)progressMax * 100D)
                                             : 0;

                    // A false return from the logger stops the loop.
                    if (!this.Logger.ReportProgress(progressPercentage, threadProgressValue))
                    {
                        state.Stop();
                    }
                }
                catch (Exception ex)
                {
                    // A failure on one record is handed to the logger instead of aborting the loop.
                    Logger.HandleException(ex);
                }
            });

            ///////////////////////////////////////////////////////////////////////////////

            return(results);
        }
예제 #5
0
        /// <summary>
        /// Scores the supplied document records in parallel and collects the non-null score results.
        /// </summary>
        /// <param name="enumerableDocs">Document records to score.</param>
        /// <param name="docsCount">Expected number of documents; only used to scale progress reporting.</param>
        /// <param name="columnIndexID">Ordinal of the document ID column, forwarded to <c>CalcDocumentScore</c>.</param>
        /// <param name="columnIndexTextList">Ordinals of the text columns, forwarded to <c>CalcDocumentScore</c>.</param>
        /// <param name="synergies">Synergies forwarded to <c>CalcDocumentScore</c>.</param>
        /// <returns>Every non-null result returned by <c>CalcDocumentScore</c>.</returns>
        protected RegExpProcessingResultsCollection <RegExpScoreProcessingResult> Parallel_CalcScores(IEnumerable <IDataRecord> enumerableDocs, long docsCount, int columnIndexID, List <int> columnIndexTextList, List <RegExpSynergy> synergies)
        {
            var results = new RegExpProcessingResultsCollection <RegExpScoreProcessingResult>();

            ///////////////////////////////////////////////////////////////////////////////

            // Progress is counted in "document x expression" units.
            long progressStep  = _listRegExps.Count;
            long progressMax   = docsCount * progressStep;
            long progressValue = 0;

            //////////////////////////////////////////////////////////////////////////

            Parallel.ForEach(enumerableDocs, (record, state) =>
            {
                try
                {
                    var scoreResult = CalcDocumentScore(columnIndexID, columnIndexTextList, record, synergies);

                    if (scoreResult != null)
                    {
                        results.Add(scoreResult);
                    }

                    ///////////////////////////////////////////////////////////////////////////////

                    var threadProgressValue = Interlocked.Add(ref progressValue, progressStep);

                    // progressMax is 0 when there are no expressions (or docsCount is 0); dividing
                    // would yield NaN/Infinity, whose conversion to int is unspecified.
                    var progressPercentage = progressMax > 0
                                             ? (int)(threadProgressValue / (double)progressMax * 100D)
                                             : 0;

                    // A false return from the logger stops the loop.
                    if (!this.Logger.ReportProgress(progressPercentage, threadProgressValue))
                    {
                        state.Stop();
                    }
                }
                catch (Exception ex)
                {
                    // A failure on one record is handed to the logger instead of aborting the loop.
                    Logger.HandleException(ex);
                }
            });

            ///////////////////////////////////////////////////////////////////////////////

            return(results);
        }
예제 #6
0
        /// <summary>
        /// Runs the column regular-expression statistics pass over the "Documents" table
        /// (optionally only rows with Score &gt; 0) and serializes the per-expression totals.
        /// </summary>
        /// <param name="param">Supplies the database path/password, the positive-score filter flag and the output file name.</param>
        public void CalcScores(ColRegExpStatisticsProcessingParams param)
        {
            using (var connection = DatabaseHelper.CreateConnection(param.DocumentsDatabaseFilePath, param.Password))
            {
                connection.Open();

                bool positiveOnly = param.OnlyPositiveScore;

                // The same filter is applied to the row count and to the record query.
                string countFilter = null;
                if (positiveOnly)
                {
                    countFilter = "Score > 0";
                }

                var docsCount = DatabaseHelper.GetRowsCount(connection, "Documents", countFilter);

                string select = "SELECT ED_ENC_NUM, NOTE_TEXT, Score FROM Documents";
                if (positiveOnly)
                {
                    select += " WHERE Score > 0";
                }

                var records = DatabaseHelper.GetDataRecords(connection, select);

                // NOTE(review): in the SELECT list above, ordinal 1 is NOTE_TEXT and ordinal 2 is Score
                // (ED_ENC_NUM is 0). Confirm these match the parameters the worker overload expects.
                Parallel_CalcScores(records, docsCount, 1, 2, param);

                // Totals are read from _listRegExps after the pass.
                var perExpression = _listRegExps.Select(regExp => new ColRegExpStatisticsProcessingResult
                {
                    ID             = regExp.ID,
                    TotalMatches   = regExp.TotalMatches,
                    TotalDocuments = regExp.TotalDocuments
                });

                var matchResults = new RegExpProcessingResultsCollection <ColRegExpStatisticsProcessingResult>(perExpression);
                matchResults.Serialize(param.GetFullPath(param.MatchesOutputFileName));
            }
        }
예제 #7
0
        /// <summary>
        /// Scores the supplied rows using every column whose name starts with "NOTE_TEXT"
        /// and returns the per-expression match totals.
        /// </summary>
        /// <param name="table">Table whose schema supplies the ID and text column positions.</param>
        /// <param name="documentRows">Rows handed to the parallel worker.</param>
        /// <param name="synergies">Synergies forwarded to the parallel worker.</param>
        /// <returns>One entry per item of <c>_listRegExps</c> carrying its ID, TotalMatches and TotalDocuments.</returns>
        /// <exception cref="Exception">The "ED_ENC_NUM" column or every "NOTE_TEXT*" column is missing.</exception>
        public RegExpProcessingResultsCollection <RegExpMatchProcessingResult> CalcScores(DataTable table, IEnumerable <DataRow> documentRows, List <RegExpSynergy> synergies)
        {
            var idColumn = table.Columns.IndexOf("ED_ENC_NUM");

            // Ordinals of all text columns, in schema order.
            List <int> textColumns = Enumerable.Range(0, table.Columns.Count)
                                     .Where(ordinal => table.Columns[ordinal].ColumnName.StartsWith("NOTE_TEXT"))
                                     .ToList();

            if (idColumn == -1 || textColumns.Count == 0)
            {
                throw new Exception("Cannot find source columns");
            }

            var rowTotal = table.Rows.Count;
            var records  = DatabaseHelper.AsDataRecordEnumerable(documentRows);

            // The per-document scores returned by the worker are not used by this overload.
            Parallel_CalcScores(records, rowTotal, idColumn, textColumns, synergies);

            var perExpression = _listRegExps.Select(regExp => new RegExpMatchProcessingResult
            {
                RegExpID       = regExp.ID,
                TotalMatches   = regExp.TotalMatches,
                TotalDocuments = regExp.TotalDocuments
            });

            return new RegExpProcessingResultsCollection <RegExpMatchProcessingResult>(perExpression);
        }
        /// <summary>
        /// Writes extracted values into the matching Documents rows and persists them:
        /// non-"NOTE_TEXT" columns through the documents adapter's row update, and
        /// "NOTE_TEXT*" columns through one update command per extracted value.
        /// </summary>
        /// <param name="views">Gives access to the main form's dataset and documents adapter.</param>
        /// <param name="values">Extraction results, joined to dynamic columns by ColumnID and to documents by DocumentID.</param>
        /// <param name="scriptExtract">Forwarded unchanged to the Extract_Write* helpers.</param>
        /// <returns>
        /// 0 when nothing joins; otherwise <c>progressMax</c>, which is the number of joined items that
        /// needed an update, or <c>rowsToUpdate.Count</c> when at least one non-NOTE_TEXT row was queued.
        /// </returns>
        protected int ColRegExp_WriteExtractedValues(ViewsManager views, RegExpProcessingResultsCollection <ColRegExpExtractProcessingResult> values, bool scriptExtract)
        {
            var progressMax = 0;

            ///////////////////////////////////////////////////////////////////////////////

            // Pair every extraction result with its dynamic column and its document row.
            // Results are ordered by ExtractOptions.Order descending (0 when ExtractOptions is null)
            // and materialized once, since the list is enumerated several times below.
            var join = (from extractResult in values.Items

                        join rowDynamicColumn in views.MainForm.datasetMain.DynamicColumns
                        on extractResult.ColumnID equals rowDynamicColumn.ID

                        join rowDocument in views.MainForm.datasetMain.Documents
                        on extractResult.DocumentID equals rowDocument.ED_ENC_NUM

                        orderby extractResult.ExtractOptions != null ? extractResult.ExtractOptions.Order : 0 descending

                        select new { extractResult, rowDocument, rowDynamicColumn })
                       .ToList();

            if (!join.Any())
            {
                return(0);
            }

            ///////////////////////////////////////////////////////////////////////////////

            // Document rows (non-NOTE_TEXT columns only) to persist in the second phase.
            // A row is added once per joined item, so the same row may appear more than once.
            var rowsToUpdate = new List <MainDataSet.DocumentsRow>();

            ///////////////////////////////////////////////////////////////////////////////

            int progressValue = 0;

            // Phase 1: write each value into the in-memory row via the type-specific helper.
            foreach (var j in join)
            {
                try
                {
                    var columnName = j.rowDynamicColumn.Title;

                    ///////////////////////////////////////////////////////////////////////////////

                    var needUpdate = false;

                    // Column types without a case below leave needUpdate false.
                    switch ((DynamicColumnType)j.rowDynamicColumn.Type)
                    {
                    case DynamicColumnType.FreeText:
                        needUpdate = Extract_WriteFreeTextValue(j.extractResult, j.rowDocument, columnName, scriptExtract);
                        break;

                    case DynamicColumnType.Numeric:
                        needUpdate = Extract_WriteNumericValue(j.extractResult, j.rowDocument, columnName, scriptExtract);
                        break;

                    case DynamicColumnType.DateTime:
                        needUpdate = Extract_WriteDateTimeValue(j.extractResult, j.rowDocument, columnName, scriptExtract);
                        break;
                    }

                    // Both branches count the item; only non-NOTE_TEXT columns queue the row.
                    if (needUpdate && !j.rowDynamicColumn.Title.StartsWith("NOTE_TEXT"))
                    {
                        rowsToUpdate.Add(j.rowDocument);

                        progressMax++;
                    }
                    if (needUpdate && j.rowDynamicColumn.Title.StartsWith("NOTE_TEXT"))
                    {
                        progressMax++;
                    }

                    ///////////////////////////////////////////////////////////////////////////////

                    j.rowDocument.AcceptChanges();
                }
                catch (Exception ex)
                {
                    // A failure on one item is reported and the loop continues with the next item.
                    HandleException(ex);
                }

                ///////////////////////////////////////////////////////////////////////////////

                // Cancellation is checked and progress reported only every 50 items.
                progressValue++;
                if (progressValue % 50 == 0)
                {
                    if (_worker.CancellationPending)
                    {
                        break;
                    }

                    var progressPercentage = (int)((double)progressValue / join.Count * 100D);
                    ReportProgress(progressPercentage, "Writing values " + progressValue + " of " + join.Count);
                }
            }

            ///////////////////////////////////////////////////////////////////////////////
            progressValue = 0;

            // Phase 2: persist the queued rows for non-NOTE_TEXT columns.
            if (rowsToUpdate.Count > 0)
            {
                // Overwrites the per-item count gathered in phase 1.
                progressMax = rowsToUpdate.Count;

                progressValue = 0;
                ReportProgress(0);

                var columnsToUpdate = join.Where(x => !x.rowDynamicColumn.Title.StartsWith("NOTE_TEXT")).Select(x => x.rowDynamicColumn.Title).Distinct().ToList();
                var columnsInfo     = views.MainForm.adapterDocuments.GetColumnsInfo(columnsToUpdate);
                // Every requested column must have been resolved by the adapter.
                if (columnsToUpdate.Count != columnsInfo.Count)
                {
                    throw new Exception("Failed to create update command");
                }

                if (columnsInfo.Count != 0)
                {
                    var updateCommand = views.MainForm.adapterDocuments.CreateUpdateColumnsCommand(columnsInfo);

                    foreach (var row in rowsToUpdate)
                    {
                        views.MainForm.adapterDocuments.Update(row, columnsInfo, updateCommand);

                        ///////////////////////////////////////////////////////////////////////////////

                        progressValue++;
                        if (progressValue % 50 == 0)
                        {
                            if (_worker.CancellationPending)
                            {
                                break;
                            }

                            // NOTE(review): the percentage uses progressMax but the message prints join.Count.
                            var progressPercentage = (int)((double)progressValue / progressMax * 100D);
                            ReportProgress(progressPercentage, "Writing values " + progressValue + " of " + join.Count);
                        }
                    }

                    views.MainForm.adapterDocuments.Fill();
                }
            }

            ///////////////////////////////////////////////////////////////////////////////
            /* For Note_Text Columns */
            // Phase 3: persist NOTE_TEXT* values with one update command per joined item.
            // progressValue is not reset here, so it continues from the phase 2 count.
            var noteColumnsToUpdate = join.Where(x => x.rowDynamicColumn.Title.StartsWith("NOTE_TEXT")).Distinct().ToList();

            // NOTE(review): noteColumnsValue is never read or written after this line.
            List <string> noteColumnsValue = new List <string>();

            foreach (var row in noteColumnsToUpdate)
            {
                List <ColumnInfo> noteColumnsInfo = new List <ColumnInfo>();
                ColumnInfo        colInfo         = new ColumnInfo();
                colInfo.Name = row.rowDynamicColumn.Title;
                // Name is a string, so Name.GetType() yields the string type for the column.
                colInfo.Type = colInfo.Name.GetType();

                noteColumnsInfo.Add(colInfo);

                views.MainForm.adapterDocuments.StartBatchQuery();
                var updateCommand = views.MainForm.adapterDocuments.CreateUpdateColumnsCommand(noteColumnsInfo);

                // Parameter 0 is the new column value, parameter 1 the document ID.
                updateCommand.Parameters[0].Value = row.extractResult.Value;
                updateCommand.Parameters[1].Value = row.extractResult.DocumentID;

                var count = updateCommand.ExecuteNonQuery();
                if (count != 1)
                {
                    throw new Exception(String.Format("Command updated {0} of expected 1 rows", count));
                }

                // NOTE(review): StartBatchQuery is called a second time here; confirm whether an
                // end-of-batch call was intended.
                views.MainForm.adapterDocuments.StartBatchQuery();

                progressValue++;
                if (progressValue % 50 == 0)
                {
                    if (_worker.CancellationPending)
                    {
                        break;
                    }

                    // NOTE(review): progressMax can still be 0 here (no item needed an update),
                    // which makes this a floating-point division by zero.
                    var progressPercentage = (int)((double)progressValue / progressMax * 100D);
                    ReportProgress(progressPercentage, "Writing values " + progressValue + " of " + join.Count);
                }
            }

            ///////////////////////////////////////////////////////////////////////////////

            return(progressMax);
        }
        /// <summary>
        /// Writes Python extraction results into the documents: "NOTE_TEXT*" columns are updated
        /// directly in the database with a JSON-serialized (result, note text column name) tuple,
        /// all other columns go through <c>PythonExtract_WriteFreeTextValue</c>.
        /// </summary>
        /// <param name="views">Gives access to the main form's dataset and documents adapter.</param>
        /// <param name="values">Extraction results, joined to dynamic columns by ColumnID and to documents by DocumentID.</param>
        /// <returns>
        /// The value of <c>progressMax</c>, which is initialized to 0 and never incremented in this
        /// method, so the result is currently always 0.
        /// </returns>
        protected int ColRegExp_WritePythonExtractedValues(ViewsManager views, RegExpProcessingResultsCollection <RegExpPythonExtractSingleProcessingResult> values)
        {
            var progressMax = 0;

            ///////////////////////////////////////////////////////////////////////////////

            // Pair every extraction result with its dynamic column and its document row.
            var join = (from extractResult in values.Items

                        join rowDynamicColumn in views.MainForm.datasetMain.DynamicColumns
                        on extractResult.ColumnID equals rowDynamicColumn.ID

                        join rowDocument in views.MainForm.datasetMain.Documents
                        on extractResult.DocumentID equals rowDocument.ED_ENC_NUM

                        select new { extractResult, rowDocument, rowDynamicColumn })
                       .ToList();

            if (!join.Any())
            {
                return(0);
            }

            ///////////////////////////////////////////////////////////////////////////////

            int progressValue = 0;

            foreach (var j in join)
            {
                try
                {
                    var columnName = j.rowDynamicColumn.Title;

                    ///////////////////////////////////////////////////////////////////////////////
                    if (columnName.StartsWith("NOTE_TEXT"))
                    {
                        var newValue = JsonConvert.SerializeObject(new Tuple <object, string>(JsonConvert.DeserializeObject(j.extractResult.Result), j.extractResult.NoteTextColumnName));

                        views.MainForm.adapterDocuments.StartBatchQuery();

                        var connection = views.MainForm.adapterDocuments.Connection;

                        // The column name is an identifier and cannot be a parameter; the document ID
                        // is passed as a parameter so it is never formatted with the current culture.
                        var cmdText = String.Format("UPDATE [{0}] SET [{1}] = @NewValue WHERE [ED_ENC_NUM] = @DocumentID", "Documents", columnName);

                        // OleDb binds parameters by position, so they are added in the order used in the SQL text.
                        using (var cmd = new OleDbCommand(cmdText, connection))
                        {
                            cmd.Parameters.AddWithValue("@NewValue", newValue);
                            cmd.Parameters.AddWithValue("@DocumentID", j.extractResult.DocumentID);

                            cmd.ExecuteNonQuery();
                        }
                    }
                    else
                    {
                        PythonExtract_WriteFreeTextValue(j.extractResult, j.rowDocument, columnName);
                    }
                }
                catch (Exception ex)
                {
                    // A failure on one item is reported and the loop continues with the next item.
                    HandleException(ex);
                }

                ///////////////////////////////////////////////////////////////////////////////

                // Cancellation is checked and progress reported only every 20 items.
                progressValue++;
                if (progressValue % 20 == 0)
                {
                    if (_worker.CancellationPending)
                    {
                        break;
                    }

                    // This phase is mapped onto the 50..100 range of the progress scale.
                    var progressPercentage = (int)((double)progressValue / join.Count * 50D);
                    ReportProgress(progressPercentage + 50, "Writing values " + progressValue + " of " + join.Count);
                }
                j.rowDocument.AcceptChanges();
            }
            ReportProgress(100);
            return(progressMax);
        }
        /// <summary>
        /// Copies regular-expression match totals and per-document scores into the in-memory
        /// dataset, persists the document scores through prepared update commands and finally
        /// refills the documents adapter.
        /// </summary>
        /// <param name="views">Gives access to the main form's dataset and documents adapter.</param>
        /// <param name="matches">Per-expression totals, joined to non-deleted RegExp rows by ID.</param>
        /// <param name="scores">Per-document scores, joined to Documents rows by ED_ENC_NUM.</param>
        protected void RegExp_WriteScores(ViewsManager views, RegExpProcessingResultsCollection <RegExpMatchProcessingResult> matches, RegExpProcessingResultsCollection <RegExpScoreProcessingResult> scores)
        {
            var regExpJoin = from row in views.MainForm.datasetMain.RegExp.Cast <MainDataSet.RegExpRow>()
                             .Where(x => x.RowState != DataRowState.Deleted)
                             join match in matches.Items
                             on row.ID equals match.RegExpID
                             select new { RegExpRow = row, Match = match };

            foreach (var item in regExpJoin)
            {
                item.RegExpRow["TotalRecords"] = item.Match.TotalRecords;

                // Total across all categories.
                int categorized = 0;

                foreach (var key in item.Match.CategorizedRecords.Keys)
                {
                    categorized += item.Match.CategorizedRecords[key];
                }
                item.RegExpRow["TotalCategorized"] = categorized;

                item.RegExpRow["CategorizedRecords"] = item.Match.CategorizedRecords;

                item.RegExpRow["TotalDocuments"] = item.Match.TotalDocuments;
                item.RegExpRow["TotalMatches"]   = item.Match.TotalMatches;
            }

            ///////////////////////////////////////////////////////////////////////////////

            var docJoin = (from row in views.MainForm.datasetMain.Documents.Cast <MainDataSet.DocumentsRow>()
                           join score in scores.Items
                           on row.ED_ENC_NUM equals score.DocumentID
                           select new { DocumentRow = row, Scores = score })
                          .ToList();

            if (docJoin.Count == 0)
            {
                return;
            }

            ///////////////////////////////////////////////////////////////////////////////

            // Number of score columns to write: starts at the number of scores of the first
            // document and is cut at the first "Score<i>" column missing from the table.
            // The schema is queried directly instead of probing the row and catching the exception.
            var noteColumnCount = docJoin[0].Scores.Score.Count();
            var documentColumns = docJoin[0].DocumentRow.Table.Columns;

            for (int i = 1; i < noteColumnCount; i++)
            {
                if (!documentColumns.Contains("Score" + i.ToString()))
                {
                    noteColumnCount = i;
                    break;
                }
            }

            double progressMax   = docJoin.Count;
            var    progressValue = 0;

            // Pass 1: write scores (and the category, when one was assigned) into the in-memory rows.
            foreach (var item in docJoin)
            {
                item.DocumentRow["Score"] = item.Scores.Score[0];
                for (int i = 1; i < noteColumnCount; i++)
                {
                    item.DocumentRow["Score" + i.ToString()] = item.Scores.Score[i];
                }

                // -1 leaves the existing category untouched.
                if (item.Scores.CategoryID != -1)
                {
                    item.DocumentRow["Category"] = item.Scores.CategoryID;
                }

                ///////////////////////////////////////////////////////////////////////////////

                // Cancellation is checked and progress reported only every 50 rows.
                progressValue++;
                if (progressValue % 50 == 0)
                {
                    if (_worker.CancellationPending)
                    {
                        break;
                    }

                    var progressPercentage = (int)((progressValue / progressMax) * 100d);

                    ReportProgress(progressPercentage, "Writing value " + progressValue + " of " + progressMax);
                }
            }

            ///////////////////////////////////////////////////////////////////////////////

            progressValue = 0;
            ReportProgress(0);

            ///////////////////////////////////////////////////////////////////////////////

            // Pass 2: persist through prepared commands. Parameters are filled by position:
            // scores first, then the category (when present), then the document ID.
            var cmdWithCategory = CreateUpdateCommand(views, true, noteColumnCount);
            var cmdNoCategory   = CreateUpdateCommand(views, false, noteColumnCount);

            foreach (var item in docJoin)
            {
                if (item.Scores.CategoryID != -1)
                {
                    int i = 0;
                    for (i = 0; i < noteColumnCount; i++)
                    {
                        cmdWithCategory.Parameters[i].Value = item.Scores.Score[i];
                    }

                    cmdWithCategory.Parameters[i].Value     = item.Scores.CategoryID;
                    cmdWithCategory.Parameters[i + 1].Value = item.Scores.DocumentID;

                    cmdWithCategory.ExecuteNonQuery();
                }
                else
                {
                    int i = 0;
                    for (i = 0; i < noteColumnCount; i++)
                    {
                        cmdNoCategory.Parameters[i].Value = item.Scores.Score[i];
                    }

                    cmdNoCategory.Parameters[i].Value = item.Scores.DocumentID;

                    cmdNoCategory.ExecuteNonQuery();
                }

                item.DocumentRow.AcceptChanges();

                ///////////////////////////////////////////////////////////////////////////////

                progressValue++;
                if (progressValue % 50 == 0)
                {
                    if (_worker.CancellationPending)
                    {
                        break;
                    }

                    var progressPercentage = (int)((progressValue / progressMax) * 100d);

                    ReportProgress(progressPercentage, "Updating document row " + progressValue + " of " + progressMax);
                }
            }

            ///////////////////////////////////////////////////////////////////////////////

            views.MainForm.adapterDocuments.Fill();
        }
예제 #11
0
        /// <summary>
        /// Runs <c>_regExp</c> over each listed text column of one record and adds one result
        /// per filtered match (matched text, document ID, column ID, start and length).
        /// </summary>
        /// <param name="columnIndexID">Ordinal of the document ID column (read with GetDouble).</param>
        /// <param name="columnIndexList">Ordinals of the text columns to scan, in order.</param>
        /// <param name="record">Record to scan; nothing is added when its ID column is DBNull.</param>
        /// <param name="results">Collection receiving the match results.</param>
        private void CalcDocumentStatisticsSingle(int columnIndexID, List <int> columnIndexList, IDataRecord record, RegExpProcessingResultsCollection <RegExpStatisticsSingleProcessingResult> results)
        {
            if (record.IsDBNull(columnIndexID))
            {
                return;
            }

            foreach (var textOrdinal in columnIndexList)
            {
                // NOTE(review): a DBNull text column ends processing of the whole record, so the
                // remaining columns are not scanned — confirm this is intended rather than a skip.
                if (record.IsDBNull(textOrdinal))
                {
                    return;
                }

                var documentID = record.GetDouble(columnIndexID);
                var text       = record.GetString(textOrdinal);

                foreach (var match in _regExp.GetFilteredMatches(text))
                {
                    results.Add(new RegExpStatisticsSingleProcessingResult
                    {
                        Word       = match.Value,
                        DocumentID = documentID,
                        // The stored column ID is the text column's ordinal minus one.
                        ColumnID   = textOrdinal - 1,
                        Start      = match.Index,
                        Length     = match.Length
                    });
                }
            }
        }
예제 #12
0
        /// <summary>
        /// Collects single-expression match statistics over the supplied records in parallel,
        /// stopping early once the total or the unique match limit has been exceeded.
        /// </summary>
        /// <param name="enumerableDocs">Document records to scan.</param>
        /// <param name="docsCount">Expected number of documents; only used to scale progress reporting.</param>
        /// <param name="columnIndexID">Ordinal of the document ID column, forwarded to <c>CalcDocumentStatisticsSingle</c>.</param>
        /// <param name="columnIndexList">Ordinals of the text columns, forwarded to <c>CalcDocumentStatisticsSingle</c>.</param>
        /// <returns>The matches collected before the loop finished or was stopped.</returns>
        private RegExpProcessingResultsCollection <RegExpStatisticsSingleProcessingResult> Parallel_CalcStatisticsSingle(IEnumerable <IDataRecord> enumerableDocs, long docsCount, int columnIndexID, List <int> columnIndexList)
        {
            // Limits after which the scan is stopped and the corresponding flag is raised.
            const int maxTotalMatches  = 20000;
            const int maxUniqueMatches = 500;

            var results = new RegExpProcessingResultsCollection <RegExpStatisticsSingleProcessingResult>();

            ///////////////////////////////////////////////////////////////////////////////

            long progressStep  = 1;
            long progressMax   = docsCount * progressStep;
            long progressValue = 0;

            //////////////////////////////////////////////////////////////////////////

            Parallel.ForEach(enumerableDocs, (record, state) =>
            {
                try
                {
                    // Counted once per record and reused for both limit checks.
                    var totalMatches = results.Items.Count();

                    if (totalMatches > maxTotalMatches)
                    {
                        state.Stop();

                        _maxTotalMatchesReached = true;

                        return;
                    }

                    ///////////////////////////////////////////////////////////////////////////////

                    // There can never be more distinct words than matches, so the distinct
                    // count is only computed once the total itself exceeds the unique limit.
                    if (totalMatches > maxUniqueMatches)
                    {
                        var uniqueMatches = results.Items.Select(x => x.Word).Distinct().Count();

                        if (uniqueMatches > maxUniqueMatches)
                        {
                            state.Stop();

                            _maxUniqueMatchesReached = true;

                            return;
                        }
                    }

                    ///////////////////////////////////////////////////////////////////////////////

                    CalcDocumentStatisticsSingle(columnIndexID, columnIndexList, record, results);

                    ///////////////////////////////////////////////////////////////////////////////

                    var threadProgressValue = Interlocked.Add(ref progressValue, progressStep);

                    // progressMax is 0 when docsCount is 0; dividing would yield Infinity,
                    // whose conversion to int is unspecified.
                    var progressPercentage = progressMax > 0
                                             ? (int)(threadProgressValue / (double)progressMax * 100D)
                                             : 0;

                    // A false return from the logger stops the loop.
                    if (!_logger.ReportProgress(progressPercentage, threadProgressValue))
                    {
                        state.Stop();
                    }
                }
                catch (Exception ex)
                {
                    // A failure on one record is handed to the logger instead of aborting the loop.
                    _logger.HandleException(ex);
                }
            });

            ///////////////////////////////////////////////////////////////////////////////

            return(results);
        }
예제 #13
0
        /// <summary>
        /// Extracts values for one document and adds every non-null result to <paramref name="results"/>.
        /// With scripts, each script is run over the record's text column; without scripts, each
        /// extract-enabled expression is run over the note text loaded from the documents database.
        /// </summary>
        /// <param name="documentID">ID of the document being processed.</param>
        /// <param name="columnIndexText">Ordinal of the text column; nothing happens when it is DBNull.</param>
        /// <param name="record">Record supplying the document text.</param>
        /// <param name="scripts">Scripts keyed by an integer passed to <c>ScriptExtractRegExpValues</c>; null selects the expression path.</param>
        /// <param name="columnID">Not used by this method.</param>
        /// <param name="results">Collection receiving the extraction results.</param>
        /// <param name="param">Supplies the database path and password for the expression path.</param>
        protected void ExtractDocumentValues(double documentID, int columnIndexText, IDataRecord record, Dictionary <int, CSScriptManager> scripts, int columnID, RegExpProcessingResultsCollection <ColRegExpExtractProcessingResult> results, ColRegExpExtractProcessingParams param)
        {
            if (record.IsDBNull(columnIndexText))
            {
                return;
            }

            // Read up front in both paths, although only the script path consumes it.
            var documentText = record.GetString(columnIndexText);

            if (scripts != null)
            {
                foreach (var script in scripts)
                {
                    var scripted = ScriptExtractRegExpValues(documentID, documentText, script.Value, script.Key);
                    if (scripted != null)
                    {
                        results.Add(scripted);
                    }
                }

                return;
            }

            // Expression path: a connection is opened for this document to load the note text
            // from the column named by each expression's extract options.
            using (var connection = DatabaseHelper.CreateConnection(param.DocumentsDatabaseFilePath, param.Password))
            {
                connection.Open();

                var extractors = _listRegExps.Where(x => x.ExtractOptions != null && x.ExtractOptions.Extract);

                foreach (var regExp in extractors)
                {
                    var noteText  = DatabaseHelper.GetNoteText(connection, documentID, regExp.ExtractOptions.NoteTextColumn);
                    var extracted = ExtractRegExpValues(regExp, documentID, noteText);

                    if (extracted != null)
                    {
                        results.Add(extracted);
                    }
                }
            }
        }