예제 #1
0
        /// <summary>
        /// Asks the repository which line numbers are still missing for a dataset
        /// (for the table identified by <c>RelatedTable</c>) and returns them in a
        /// <see cref="ConcurrentBag{T}"/>.
        /// </summary>
        /// <param name="datasetId">Dataset id forwarded to the repository query.</param>
        /// <param name="totalLines">Total line count forwarded to the repository query.</param>
        /// <returns>A bag containing every value returned by the repository query.</returns>
        private ConcurrentBag <int> GetMissingLines(int datasetId, int totalLines)
        {
            List <int> pendingLines;

            // The repository is only needed for the query itself and is disposed right after it.
            using (IEdgarDatasetsRepository repository = new EdgarRepository())
            {
                pendingLines = repository.GetMissingLines(datasetId, RelatedTable, totalLines);
            }

            return new ConcurrentBag <int>(pendingLines);
        }
예제 #2
0
        /// <summary>
        /// Builds a key-to-id lookup from the tuples that <c>GetKeys</c> returns for a dataset.
        /// </summary>
        /// <param name="datasetId">Dataset id forwarded to <c>GetKeys</c>.</param>
        /// <returns>
        /// A <see cref="ConcurrentDictionary{TKey, TValue}"/> mapping each tuple's <c>Key</c> to its <c>Id</c>.
        /// When a key occurs more than once, the first id wins.
        /// </returns>
        public ConcurrentDictionary <string, int> GetAsConcurrent(int datasetId)
        {
            var idsByKey = new ConcurrentDictionary <string, int>();

            using (IEdgarDatasetsRepository repo = new EdgarRepository())
            {
                IList <EdgarTuple> tuples = GetKeys(repo, datasetId);
                foreach (EdgarTuple tuple in tuples)
                {
                    // TryAdd silently ignores keys that are already present.
                    idsByKey.TryAdd(tuple.Key, tuple.Id);
                }
            }

            return idsByKey;
        }
예제 #3
0
 /// <summary>
 /// Compares the number of rows stored for the dataset with the dataset's
 /// "Processed{field}" and "Total{field}" counters (read through reflection) and
 /// reports whether all three agree on a non-zero value.
 /// </summary>
 /// <param name="state">Task state whose <c>Dataset</c> carries the counters.</param>
 /// <param name="fieldToUpdate">Suffix appended to "Processed" and "Total" to build the property names.</param>
 /// <param name="savedInDb">Receives the count returned by the repository for the dataset.</param>
 /// <returns>
 /// <c>true</c> when the stored count equals the processed counter, the processed counter
 /// equals the total counter, and the total is not zero; otherwise <c>false</c>.
 /// </returns>
 private bool IsAlreadyProcessed(EdgarTaskState state, string fieldToUpdate, out int savedInDb)
 {
     using (IEdgarDatasetsRepository repository = new EdgarRepository())
     {
         savedInDb = repository.GetCount <T>(state.Dataset.Id);

         string processedProperty = "Processed" + fieldToUpdate;
         int    processedCount    = (int)state.Dataset.GetType().GetProperty(processedProperty).GetValue(state.Dataset);

         // When the counter disagrees with the database, the counter is refreshed from the stored count.
         bool counterMatchesDb = savedInDb == processedCount;
         if (!counterMatchesDb)
         {
             UpdateProcessedField(state, fieldToUpdate, savedInDb);
         }

         // The total is read after the possible update, in the same order as before.
         string totalProperty = "Total" + fieldToUpdate;
         int    totalCount    = (int)state.Dataset.GetType().GetProperty(totalProperty).GetValue(state.Dataset);

         if (!counterMatchesDb)
         {
             return false;
         }
         if (processedCount != totalCount)
         {
             return false;
         }
         return totalCount != 0;
     }
 }
예제 #4
0
        /// <summary>
        /// Parses the text of a master index file into <see cref="IndexEntry"/> items.
        /// Everything up to and including the dashed separator line is treated as header;
        /// the remaining lines are split into ranges and parsed in parallel, with one
        /// repository instance per range.
        /// </summary>
        /// <param name="content">Full text of the index file, with lines separated by '\n'.</param>
        /// <returns>
        /// The parsed entries. They are collected in a <see cref="ConcurrentBag{T}"/> from a
        /// parallel loop, so their order is not guaranteed to match the file order.
        /// An empty list is returned when no data lines follow the separator.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="content"/> does not contain the dashed separator line.
        /// </exception>
        public List <IndexEntry> ParseMasterIndexFile(string content)
        {
            logger.Info("ParseMasterIndex - Init");
            List <string> lines = content.Split('\n').ToList();

            logger.Info("ParseMasterIndex - Finding when start the data");
            // Bounded search: the previous open-ended scan ran past the end of the list
            // when the separator was missing.
            int separatorIndex = lines.IndexOf("--------------------------------------------------------------------------------");
            if (separatorIndex < 0)
            {
                throw new ArgumentException("Master index content does not contain the dashed separator line that precedes the data.", nameof(content));
            }

            int i = separatorIndex + 1; // index of the first line after the separator
            logger.Info($"ParseMasterIndex - Data start at line {i+1}, getting lines to process");

            // The final element of the split is left out, exactly as before.
            // NOTE(review): presumably that is the empty string produced by a trailing newline - confirm.
            int dataLineCount = lines.Count - (i + 1);
            if (dataLineCount <= 0)
            {
                // GetRange rejects a negative count and Partitioner.Create rejects an empty range,
                // so the "no data" case is answered here.
                logger.Info("ParseMasterIndex - No data lines to process");
                logger.Info("ParseMasterIndex - End");
                return(new List <IndexEntry>());
            }
            var linesToProces = lines.GetRange(i, dataLineCount);

            logger.Info($"ParseMasterIndex - Start partitioning");
            OrderablePartitioner <Tuple <int, int> > rangePartitioner = Partitioner.Create(0, linesToProces.Count);
            ConcurrentBag <IndexEntry> indexEntries = new ConcurrentBag <IndexEntry>();

            // Loop over the partitions in parallel.
            Parallel.ForEach(rangePartitioner, (range, loopState) =>
            {
                // One repository per range, so no repository instance is shared between ranges.
                using (IEdgarFilesRepository repo = new EdgarRepository())
                {
                    logger.Info($"ParseMasterIndex - Processing range: from {range.Item1} to {range.Item2}");
                    for (int j = range.Item1; j < range.Item2; j++)
                    {
                        string line      = linesToProces[j];
                        IndexEntry entry = ParseMasterIndexLine(line, j, repo);
                        indexEntries.Add(entry);
                    }
                }
            });
            logger.Info($"ParseMasterIndex - Partitioning end");
            logger.Info("ParseMasterIndex - End");
            return(indexEntries.ToList());
        }
예제 #5
0
        /// <summary>
        /// Prepares the database behind <paramref name="context"/>: ensures it exists, switches the
        /// tag columns to a case-sensitive collation, creates the unique tag index, runs the bundled
        /// SQL scripts and finally loads the initial data through <c>EdgarInitialLoader</c>.
        /// </summary>
        /// <param name="context">EF context whose database is initialized.</param>
        /// <remarks>
        /// Any failure is logged as fatal and rethrown with its original stack trace.
        /// NOTE(review): the DDL statements run on every call, not only when EnsureCreated actually
        /// created the database - confirm callers invoke this only for a fresh database.
        /// </remarks>
        public static void Initialize(EdgarContext context)
        {
            try
            {
                log.Info("Starting initialization");
                context.Database.EnsureCreated();
                log.Info("Database is created");

                /*
                 * Tags are case sensitive, example:
                 * tag / version / custom / abstract / datatype / iord / crdr / tlabel / doc
                 * EchostarXviMember  0001558370-16-009751  1  1  member  Echostar Xvi [Member]     Represents the information pertaining to satellites assets owned by the entity, EchoStar XVI.
                 * EchoStarXVIMember  0001558370-16-009751  1  1  member  Echo Star X V I [Member]  Represents information pertaining to satellites assets leased from EchoStar, Echo Star XVI, which are accounted for as operating leases.
                 *
                 * And EF isn't case sensitive by default
                 * https://stackoverflow.com/questions/3843060/linq-to-entities-case-sensitive-comparison
                 * https://milinaudara.wordpress.com/2015/02/04/case-sensitive-search-using-entity-framework-with-custom-annotation/
                 *
                 * This works for SQL Server and it has to be run before the index is created:
                 * ALTER TABLE EdgarDatasetsTag
                 * ALTER COLUMN Tag VARCHAR(10)
                 * COLLATE SQL_Latin1_General_CP1_CS_AS
                 */

                //Solution
                //http://www.entityframeworktutorial.net/code-first/database-initialization-strategy-in-code-first.aspx

                // Case-sensitive collation must be applied before the unique index is created.
                context.Database.ExecuteSqlRaw("ALTER TABLE EdgarDatasetTags ALTER COLUMN Tag NVARCHAR(256) COLLATE SQL_Latin1_General_CP1_CS_AS NOT NULL");
                log.Info(" created");
                context.Database.ExecuteSqlRaw("ALTER TABLE EdgarDatasetTags ALTER COLUMN Version NVARCHAR(20) COLLATE SQL_Latin1_General_CP1_CS_AS NOT NULL");
                log.Info(" created");
                context.Database.ExecuteSqlRaw("CREATE UNIQUE INDEX IX_TagVersion ON EdgarDatasetTags (Tag, Version,DatasetId)");
                log.Info(" created");
                context.Database.ExecuteSqlRaw(GetTextScript("alter column ADSH.sql"));
                log.Info(" created");

                // Scripts named "create <name>.sql"; they run in this order.
                string[] routineNames =
                {
                    "GET_MISSING_LINE_NUMBERS",
                    "SP_DISABLE_PRESENTATION_INDEXES",
                    "SP_EDGARDATASETCALC_INSERT",
                    "SP_EDGARDATASETDIMENSIONS_INSERT",
                    "SP_EDGARDATASETNUMBER_INSERT",
                    "SP_EDGARDATASETPRESENTATIONS_INSERT",
                    "SP_EDGARDATASETRENDERS_INSERT",
                    "SP_EDGARDATASETSUBMISSIONS_INSERT",
                    "SP_EDGARDATASETTAGS_INSERT",
                    "SP_EDGARDATASETTEXT_INSERT",
                    "SP_GET_CALCULATIONS_KEYS",
                    "SP_GET_DIMENSIONS_KEYS",
                    "SP_GET_NUMBER_KEYS",
                    "SP_GET_PRESENTATION_KEYS",
                    "SP_GET_RENDER_KEYS",
                    "SP_GET_SUBMISSIONS_KEYS",
                    "SP_GET_TAGS_KEYS",
                    "SP_GET_TEXT_KEYS"
                };
                foreach (string routineName in routineNames)
                {
                    context.Database.ExecuteSqlRaw(GetTextScript("create " + routineName + ".sql"));
                    log.Info(routineName + " created");
                }

                // Scripts named "create table <name>.sql".
                string[] tableNames = { "LOG", "numbers" };
                foreach (string tableName in tableNames)
                {
                    context.Database.ExecuteSqlRaw(GetTextScript("create table " + tableName + ".sql"));
                    log.Info(tableName + " created");
                }

                IEdgarRepository repo = new EdgarRepository(context);
                log.Info("Loading initial data");
                EdgarInitialLoader.LoadInitialData(repo);
                log.Info("SecForms and SICs loaded");
                EdgarInitialLoader.LoadInitialDatasets((IEdgarDatasetsRepository)repo);
                log.Info("Datasets loaded");
                EdgarInitialLoader.LoadInitialFullIndexes((IEdgarFilesRepository)repo);
                log.Info("Indexes loaded");

                log.Info("Seed end");
            }
            catch (Exception ex)
            {
                log.Fatal("Error seeding Datasets database: " + ex.Message, ex);
                // "throw;" keeps the original stack trace; "throw ex;" would reset it to this frame.
                throw;
            }
        }
예제 #6
0
        /// <summary>
        /// Parses and stores the lines of <paramref name="allLines"/> whose index lies in
        /// [<c>range.Item1</c>, <c>range.Item2</c>), using a single repository for the whole range.
        /// </summary>
        /// <param name="fileToProcess">File name, used in log messages and in the exceptions created here.</param>
        /// <param name="state">Task state; its <c>Dataset</c> receives the parsed items.</param>
        /// <param name="range">Start (inclusive) and end (exclusive) indexes into <paramref name="allLines"/>.</param>
        /// <param name="allLines">Lines of the file; empty or null entries are skipped.</param>
        /// <param name="header">Tab-separated field names.</param>
        /// <param name="missing">
        /// Line numbers (index + 1) that still need processing, or <c>null</c> to process every line in the range.
        /// </param>
        /// <param name="failedLines">Receives line number and text of every line whose processing threw.</param>
        /// <exception cref="EdgarDatasetException">
        /// More than <c>MaxErrorsAllowed</c> lines of this range failed.
        /// </exception>
        protected void ProcessRange(string fileToProcess, EdgarTaskState state, Tuple <int, int> range, string[] allLines, string header, ConcurrentBag <int> missing, ConcurrentDictionary <int, string> failedLines)
        {
            Stopwatch watch    = System.Diagnostics.Stopwatch.StartNew();
            string    rangeMsg = "Datasetid " + state.Dataset.Id.ToString() + " -- " + fileToProcess + " -- range: " + range.Item1 + " to " + range.Item2;

            Log.Info(rangeMsg + " -- BEGIN");

            // Copy the bag into a set once: a membership test on the bag itself enumerates
            // the whole bag for every line.
            // NOTE(review): assumes the bag is not modified while ranges are being processed
            // (this method only reads it) - confirm against the caller.
            HashSet <int> missingLines = missing == null ? null : new HashSet <int>(missing);

            /*
             * EF isn't thread safe and it doesn't allow parallel
             * https://stackoverflow.com/questions/12827599/parallel-doesnt-work-with-entity-framework
             * https://stackoverflow.com/questions/9099359/entity-framework-and-multi-threading
             * https://social.msdn.microsoft.com/Forums/en-US/e5cb847c-1d77-4cd0-abb7-b61890d99fae/multithreading-and-the-entity-framework?forum=adodotnetentityframework
             * solution: only 1 context for the entire partition --> works
             */
            using (IEdgarDatasetsRepository repo = new EdgarRepository())
            {
                //It improves performance
                //https://msdn.microsoft.com/en-us/library/jj556205(v=vs.113).aspx
                repo.ContextConfigurationAutoDetectChangesEnabled = false;
                try
                {
                    List <string>    fieldNames = header.Split('\t').ToList();
                    List <Exception> exceptions = new List <Exception>();
                    for (int i = range.Item1; i < range.Item2; i++)
                    {
                        //i+1: indexes starts with 0 but header is line 1 and the first row is line 2
                        int lineNumber = i + 1;
                        // Reset per iteration so a failure never reports the text of an earlier line.
                        string line = null;
                        try
                        {
                            //It will be processed if:
                            //it's the first time (missing == null)
                            //or it's processed again and the line wasn't processed the first time
                            if (missingLines == null || missingLines.Contains(lineNumber))
                            {
                                // Read the line before logging it; the log used to show the previous line.
                                line = allLines[i];
                                Log.Debug(rangeMsg + " -- parsing[" + i.ToString() + "]: " + line);
                                if (!string.IsNullOrEmpty(line))//files with error lines has an empty line for processed lines
                                {
                                    List <string> fields = line.Split('\t').ToList();
                                    T             file   = Parse(repo, fieldNames, fields, lineNumber);
                                    Add(repo, state.Dataset, file);
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            EdgarLineException elex = new EdgarLineException(fileToProcess, lineNumber, ex);
                            exceptions.Add(elex);
                            failedLines.TryAdd(lineNumber, line);
                            Log.Error(rangeMsg + " -- line[" + lineNumber.ToString() + "]: " + line);
                            Log.Error(rangeMsg + " -- line[" + lineNumber.ToString() + "]: " + ex.Message, elex);
                            if (exceptions.Count > MaxErrorsAllowed)
                            {
                                // Uses lineNumber, like the two error messages above.
                                Log.Fatal(rangeMsg + " -- line[" + lineNumber.ToString() + "]: max errors allowed reached", ex);
                                throw new EdgarDatasetException(fileToProcess, exceptions);
                            }
                        }
                    }
                }
                finally
                {
                    // Restore change detection even when the range aborts.
                    repo.ContextConfigurationAutoDetectChangesEnabled = true;
                }
            }
            watch.Stop();
            TimeSpan ts          = watch.Elapsed;
            string   elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", ts.Hours, ts.Minutes, ts.Seconds, ts.Milliseconds / 10);

            Log.Info(rangeMsg + " -- END");
            Log.Info(rangeMsg + " -- time: " + elapsedTime);
        }