Example #1
0
        /// <summary>
        /// Sets the dataset property named <paramref name="prefix"/> + <paramref name="fieldToUpdate"/>
        /// to <paramref name="value"/> via reflection and asks the shared repository to persist that field.
        /// </summary>
        /// <param name="prefix">First part of the property name (callers in this file pass "Total" or "Processed").</param>
        /// <param name="fieldToUpdate">Second part of the property name.</param>
        /// <param name="state">Task state holding the dataset and the shared repository.</param>
        /// <param name="value">New value for the property.</param>
        /// <exception cref="InvalidOperationException">The dataset type has no property with the composed name.</exception>
        private void UpdateField(string prefix, string fieldToUpdate, EdgarTaskState state, int value)
        {
            string field = prefix + fieldToUpdate;

            // Resolve the property once; the previous value is not needed to overwrite it.
            var property = state.Dataset.GetType().GetProperty(field);
            if (property == null)
            {
                throw new InvalidOperationException("Property '" + field + "' was not found on " + state.Dataset.GetType().Name);
            }

            property.SetValue(state.Dataset, value);
            state.DatasetSharedRepo.UpdateEdgarDataset(state.Dataset, field);
        }
Example #2
0
        /// <summary>
        /// Processes one file of the dataset: unless the file is already completely stored,
        /// reads it from the cache folder, records the number of data lines, delegates the
        /// actual work to <c>ProcessFile</c> and finally refreshes the processed counter.
        /// The outcome is reported through <c>state.ResultOk</c> and <c>state.Exception</c>;
        /// exceptions are logged and not rethrown.
        /// </summary>
        /// <param name="state">Task state carrying the dataset and the shared repository.</param>
        /// <param name="processInParallel">Forwarded unchanged to <c>ProcessFile</c>.</param>
        /// <param name="fileToProcess">Name of the file inside the dataset folder.</param>
        /// <param name="fieldToUpdate">Suffix of the "Total"/"Processed" counters to maintain.</param>
        public virtual void Process(EdgarTaskState state, bool processInParallel, string fileToProcess, string fieldToUpdate)
        {
            try
            {
                Stopwatch timer     = Stopwatch.StartNew();
                string    logPrefix = "Datasetid " + state.Dataset.Id.ToString() + " -- " + fileToProcess;
                Log.Info(logPrefix + " -- BEGIN PROCESS");

                int  savedInDb;
                bool alreadyProcessed = IsAlreadyProcessed(state, fieldToUpdate, out savedInDb);
                if (alreadyProcessed)
                {
                    state.FileNameToReprocess = null;
                    Log.Info(logPrefix + " -- The complete file is already processed");
                }
                else
                {
                    string cacheFolder    = ConfigurationManager.AppSettings["cache_folder"];
                    string relativeFolder = state.Dataset.RelativePath.Replace("/", "\\").Replace(".zip", "");
                    string tsvFileName    = relativeFolder + "\\" + fileToProcess;

                    string[] allLines      = File.ReadAllLines(cacheFolder + tsvFileName);
                    string   header        = allLines[0];
                    int      dataLineCount = allLines.Length - 1; // the first line is the header

                    UpdateTotalField(state, fieldToUpdate, dataLineCount);

                    // Nothing stored yet: process everything (null). Otherwise only the missing lines.
                    ConcurrentBag<int> missing = null;
                    if (savedInDb != 0)
                    {
                        missing = GetMissingLines(state.Dataset.Id, dataLineCount);
                    }

                    ProcessFile(missing, fileToProcess, fieldToUpdate, state, allLines, header, cacheFolder, tsvFileName, processInParallel);

                    Log.Info(logPrefix + " -- Process finished, updating dataset status");
                    savedInDb = state.DatasetSharedRepo.GetCount<T>(state.Dataset.Id);
                    UpdateProcessedField(state, fieldToUpdate, savedInDb);
                }

                timer.Stop();
                TimeSpan elapsed     = timer.Elapsed;
                string   elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", elapsed.Hours, elapsed.Minutes, elapsed.Seconds, elapsed.Milliseconds / 10);
                Log.Info(logPrefix + " -- END PROCESS -- time: " + elapsedTime);
                state.ResultOk = true;
            }
            catch (Exception ex)
            {
                // Failures are captured in the state instead of being propagated to the caller.
                state.ResultOk  = false;
                state.Exception = new EdgarDatasetException(fileToProcess, ex);
                Log.Fatal("Datasetid " + state.Dataset.Id.ToString() + " -- " + fileToProcess + " -- Error: " + ex.Message, ex);
            }
        }
Example #3
0
 /// <summary>
 /// Tells whether every line of the file is already stored for the dataset, and
 /// synchronises the dataset's "Processed" counter with the stored row count on the way.
 /// </summary>
 /// <param name="state">Task state holding the dataset whose counters are inspected.</param>
 /// <param name="fieldToUpdate">Suffix of the "Processed"/"Total" counter properties.</param>
 /// <param name="savedInDb">Receives the row count reported by the repository for this dataset.</param>
 /// <returns>
 /// <c>true</c> when the stored row count equals a non-zero "Total" counter; otherwise <c>false</c>.
 /// </returns>
 private bool IsAlreadyProcessed(EdgarTaskState state, string fieldToUpdate, out int savedInDb)
 {
     using (IAnalystEdgarDatasetsRepository repo = new AnalystEdgarRepository())
     {
         savedInDb = repo.GetCount<T>(state.Dataset.Id);

         Type datasetType = state.Dataset.GetType();
         int  processed   = (int)datasetType.GetProperty("Processed" + fieldToUpdate).GetValue(state.Dataset);
         if (savedInDb != processed)
         {
             // The counter drifted from what is really stored: bring it back in sync.
             UpdateProcessedField(state, fieldToUpdate, savedInDb);
         }

         int total = (int)datasetType.GetProperty("Total" + fieldToUpdate).GetValue(state.Dataset);

         // Once synced, the processed counter equals savedInDb, so compare that value
         // with the total. Comparing the stale pre-sync value would report a fully
         // stored file as unprocessed whenever the counter had drifted.
         return total != 0 && savedInDb == total;
     }
 }
        /// <summary>
        /// Bulk implementation: parses the requested lines into a <see cref="DataTable"/>, sends the
        /// table to the database in a single bulk copy and records the lines that failed to parse.
        /// </summary>
        /// <param name="missing">Line numbers to process; <c>null</c> means every data line is processed.</param>
        /// <param name="fileToProcess">File name, used in log messages and exceptions.</param>
        /// <param name="fieldToUpdate">Not used by this implementation.</param>
        /// <param name="state">Task state; receives the value returned by <c>WriteFailedLines</c>.</param>
        /// <param name="allLines">All lines of the file; index 0 is the header.</param>
        /// <param name="header">Tab-separated header line.</param>
        /// <param name="cacheFolder">Forwarded to <c>WriteFailedLines</c>.</param>
        /// <param name="tsvFileName">Forwarded to <c>WriteFailedLines</c>.</param>
        /// <param name="processInParallel">Not used by this implementation.</param>
        /// <exception cref="EdgarDatasetException">More than <c>MaxErrorsAllowed</c> lines failed to parse.</exception>
        public override void ProcessFile(ConcurrentBag<int> missing, string fileToProcess, string fieldToUpdate, EdgarTaskState state, string[] allLines, string header, string cacheFolder, string tsvFileName, bool processInParallel)
        {
            //https://msdn.microsoft.com/en-us/library/ex21zs8x(v=vs.110).aspx
            //https://docs.microsoft.com/en-us/dotnet/framework/data/adonet/sql/transaction-and-bulk-copy-operations

            string prefixMsg = "Datasetid " + state.Dataset.Id.ToString() + " -- " + fileToProcess;

            Log.Info(prefixMsg + " -- BEGIN BULK PROCESS");
            using (IEdgarDatasetsBulkRepository repo = new EdgarDatasetsBulkRepository())
            {
                Log.Info(prefixMsg + " -- Initializing variables");
                DataTable    dt         = repo.GetEmptyDataTable(RelatedTable);
                List<string> fieldNames = header.Split('\t').ToList();
                ConcurrentDictionary<int, string> failedLines = new ConcurrentDictionary<int, string>();
                List<Exception> exceptions = new List<Exception>();

                // Copy the bag into a set once: a membership test on the bag itself is a
                // linear scan, which made the loop below quadratic on reprocessing runs.
                HashSet<int> missingLines = missing == null ? null : new HashSet<int>(missing);

                Log.Info(prefixMsg + " -- Creating DataTable");
                //first line is the header
                for (int i = 1; i < allLines.Length; i++)
                {
                    //i+1: indexes start at 0 but the header is line 1 and the first row is line 2
                    int lineNumber = i + 1;

                    //A line is processed if:
                    //it's the first time (missing == null)
                    //or it's a reprocess and the line wasn't stored the first time
                    if (missingLines != null && !missingLines.Contains(lineNumber))
                    {
                        continue;
                    }

                    string line = allLines[i];
                    if (string.IsNullOrEmpty(line))
                    {
                        continue;
                    }

                    try
                    {
                        List<string> fields = line.Split('\t').ToList();
                        DataRow      dr     = dt.NewRow();
                        Parse(fieldNames, fields, lineNumber, dr, state.Dataset.Id);
                        dt.Rows.Add(dr);
                    }
                    catch (Exception ex)
                    {
                        EdgarLineException elex = new EdgarLineException(fileToProcess, lineNumber, ex);
                        exceptions.Add(elex);
                        failedLines.TryAdd(lineNumber, line);
                        Log.Error(prefixMsg + " -- line[" + lineNumber.ToString() + "]: " + line);
                        Log.Error(prefixMsg + " -- line[" + lineNumber.ToString() + "]: " + ex.Message, elex);
                        if (exceptions.Count > MaxErrorsAllowed)
                        {
                            // Report the same line number as the error messages above.
                            Log.Fatal(prefixMsg + " -- line[" + lineNumber.ToString() + "]: max errors allowed reached", ex);
                            throw new EdgarDatasetException(fileToProcess, exceptions);
                        }
                    }
                }
                Log.Info(prefixMsg + " -- Starting bulk copy");
                repo.BulkCopyTable(RelatedTable, dt);
                Log.Info(prefixMsg + " --End bulk copy, now saving failed lines.");
                state.FileNameToReprocess = WriteFailedLines(cacheFolder, tsvFileName, header, failedLines, allLines.Length);
                Log.Info(prefixMsg + " -- END BULK PROCESS");
            }
        }
        /// <summary>
        /// Parses and adds the lines whose indexes fall in <paramref name="range"/>
        /// (start inclusive, end exclusive), using one repository instance for the whole range.
        /// Lines that fail are logged and collected in <paramref name="failedLines"/>.
        /// </summary>
        /// <param name="fileToProcess">File name, used in log messages and exceptions.</param>
        /// <param name="state">Task state providing the dataset.</param>
        /// <param name="range">Index range into <paramref name="allLines"/>.</param>
        /// <param name="allLines">All lines of the file; index 0 is the header.</param>
        /// <param name="header">Tab-separated header line.</param>
        /// <param name="missing">Line numbers to process; <c>null</c> means every line in the range is processed.</param>
        /// <param name="failedLines">Receives line number and text of every line that threw.</param>
        /// <exception cref="EdgarDatasetException">More than <c>MaxErrorsAllowed</c> lines of this range failed.</exception>
        protected void ProcessRange(string fileToProcess, EdgarTaskState state, Tuple<int, int> range, string[] allLines, string header, ConcurrentBag<int> missing, ConcurrentDictionary<int, string> failedLines)
        {
            Stopwatch watch    = Stopwatch.StartNew();
            string    rangeMsg = "Datasetid " + state.Dataset.Id.ToString() + " -- " + fileToProcess + " -- range: " + range.Item1 + " to " + range.Item2;

            Log.Info(rangeMsg + " -- BEGIN");

            /*
             * EF isn't thread safe and it doesn't allow parallel
             * https://stackoverflow.com/questions/12827599/parallel-doesnt-work-with-entity-framework
             * https://stackoverflow.com/questions/9099359/entity-framework-and-multi-threading
             * https://social.msdn.microsoft.com/Forums/en-US/e5cb847c-1d77-4cd0-abb7-b61890d99fae/multithreading-and-the-entity-framework?forum=adodotnetentityframework
             * solution: only 1 context for the entire partition --> works
             */
            using (IAnalystEdgarDatasetsRepository repo = new AnalystEdgarRepository())
            {
                //It improves performance
                //https://msdn.microsoft.com/en-us/library/jj556205(v=vs.113).aspx
                repo.ContextConfigurationAutoDetectChangesEnabled = false;
                try
                {
                    List<string>    fieldNames = header.Split('\t').ToList();
                    List<Exception> exceptions = new List<Exception>();

                    // Copy the bag into a set once: a membership test on the bag itself is a
                    // linear scan that would be repeated for every line of the range.
                    HashSet<int> missingLines = missing == null ? null : new HashSet<int>(missing);

                    string line       = null;
                    int    lineNumber = 0;
                    for (int i = range.Item1; i < range.Item2; i++)
                    {
                        try
                        {
                            lineNumber = i + 1;//i+1: indexes start at 0 but the header is line 1 and the first row is line 2
                            //A line is processed if:
                            //it's the first time (missing == null)
                            //or it's a reprocess and the line wasn't stored the first time
                            if (missingLines == null || missingLines.Contains(lineNumber))
                            {
                                // Read the line before logging it; logging first would print the
                                // previously handled line (or nothing at all on the first one).
                                line = allLines[i];
                                Log.Debug(rangeMsg + " -- parsing[" + lineNumber.ToString() + "]: " + line);
                                if (!string.IsNullOrEmpty(line))//files with error lines have an empty line for processed lines
                                {
                                    List<string> fields = line.Split('\t').ToList();
                                    T            file   = Parse(repo, fieldNames, fields, lineNumber);
                                    Add(repo, state.Dataset, file);
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            EdgarLineException elex = new EdgarLineException(fileToProcess, lineNumber, ex);
                            exceptions.Add(elex);
                            failedLines.TryAdd(lineNumber, line);
                            Log.Error(rangeMsg + " -- line[" + lineNumber.ToString() + "]: " + line);
                            Log.Error(rangeMsg + " -- line[" + lineNumber.ToString() + "]: " + ex.Message, elex);
                            if (exceptions.Count > MaxErrorsAllowed)
                            {
                                // Report the same line number as the error messages above.
                                Log.Fatal(rangeMsg + " -- line[" + lineNumber.ToString() + "]: max errors allowed reached", ex);
                                throw new EdgarDatasetException(fileToProcess, exceptions);
                            }
                        }
                    }
                }
                finally
                {
                    // Restore the setting even when the range is aborted by an exception.
                    repo.ContextConfigurationAutoDetectChangesEnabled = true;
                }
            }
            watch.Stop();
            TimeSpan ts          = watch.Elapsed;
            string   elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", ts.Hours, ts.Minutes, ts.Seconds, ts.Milliseconds / 10);

            Log.Info(rangeMsg + " -- END");
            Log.Info(rangeMsg + " -- time: " + elapsedTime);
        }
        /// <summary>
        /// Range-based implementation: hands the data lines to <c>ProcessRange</c>, either as
        /// one range covering the whole file or as partitions processed in parallel, then
        /// stores the result of <c>WriteFailedLines</c> in <c>state.FileNameToReprocess</c>.
        /// </summary>
        public override void ProcessFile(ConcurrentBag<int> missing, string fileToProcess, string fieldToUpdate, EdgarTaskState state, string[] allLines, string header, string cacheFolder, string tsvFileName, bool processInParallel)
        {
            var failedLines = new ConcurrentDictionary<int, string>();
            int lineCount   = allLines.Length;

            // A file holding only the header (or nothing) is never partitioned.
            bool runSequentially = !processInParallel || lineCount <= 1;

            if (runSequentially)
            {
                Tuple<int, int> wholeFile = new Tuple<int, int>(1, lineCount);
                ProcessRange(fileToProcess, state, wholeFile, allLines, header, missing, failedLines);
            }
            else
            {
                //https://docs.microsoft.com/en-us/dotnet/standard/parallel-programming/custom-partitioners-for-plinq-and-tpl?view=netframework-4.5.2

                // Split the index range [1, lineCount) into partitions; index 0 is the header.
                OrderablePartitioner<Tuple<int, int>> partitions = Partitioner.Create(1, lineCount);

                // Each partition is handled by its own ProcessRange call.
                Parallel.ForEach(partitions, (range, loopState) =>
                    ProcessRange(fileToProcess, state, range, allLines, header, missing, failedLines));
            }

            state.FileNameToReprocess = WriteFailedLines(cacheFolder, tsvFileName, header, failedLines, lineCount);
        }
Example #7
0
 /// <summary>
 /// Processes the lines of an already loaded file. Called by <c>Process</c> once the file
 /// has been read and the total line counter has been updated.
 /// </summary>
 /// <param name="missing">
 /// <c>null</c> when nothing has been saved yet for the dataset; otherwise the value returned by
 /// <c>GetMissingLines</c>. The implementations in this file treat it as the set of line numbers to process.
 /// </param>
 /// <param name="fileToProcess">Name of the file being processed.</param>
 /// <param name="fieldToUpdate">Suffix of the dataset counter properties associated with the file.</param>
 /// <param name="state">Task state carrying the dataset.</param>
 /// <param name="allLines">Every line of the file, header included at index 0.</param>
 /// <param name="header">The first line of <paramref name="allLines"/>.</param>
 /// <param name="cacheFolder">Value of the "cache_folder" application setting.</param>
 /// <param name="tsvFileName">Path of the file relative to <paramref name="cacheFolder"/>.</param>
 /// <param name="processInParallel">Flag passed through from <c>Process</c>; how it is honoured is up to the implementation.</param>
 public abstract void ProcessFile(ConcurrentBag <int> missing, string fileToProcess, string fieldToUpdate, EdgarTaskState state, string[] allLines, string header, string cacheFolder, string tsvFileName, bool processInParallel);
Example #8
0
 /// <summary>
 /// Forwards to <c>UpdateField</c> with the "Total" prefix, so that the dataset property
 /// named "Total" + <paramref name="fieldToUpdate"/> receives <paramref name="value"/>.
 /// </summary>
 private void UpdateTotalField(EdgarTaskState state, string fieldToUpdate, int value)
 {
     const string counterPrefix = "Total";

     UpdateField(counterPrefix, fieldToUpdate, state, value);
 }
Example #9
0
 /// <summary>
 /// Forwards to <c>UpdateField</c> with the "Processed" prefix, so that the dataset property
 /// named "Processed" + <paramref name="fieldToUpdate"/> receives <paramref name="value"/>.
 /// </summary>
 private void UpdateProcessedField(EdgarTaskState state, string fieldToUpdate, int value)
 {
     const string counterPrefix = "Processed";

     UpdateField(counterPrefix, fieldToUpdate, state, value);
 }