/// <summary>
/// Asks the repository for the missing line numbers of a dataset (for this
/// instance's <c>RelatedTable</c>) and returns them as a concurrent bag.
/// </summary>
/// <param name="datasetId">Id of the dataset to check.</param>
/// <param name="totalLines">Total line count forwarded to the repository query.</param>
/// <returns>A new <see cref="ConcurrentBag{T}"/> holding the values returned by the repository.</returns>
private ConcurrentBag<int> GetMissingLines(int datasetId, int totalLines)
{
    // A short-lived repository is opened just for this query and disposed on exit.
    using (IEdgarDatasetsRepository repository = new EdgarRepository())
    {
        List<int> pendingLineNumbers = repository.GetMissingLines(datasetId, RelatedTable, totalLines);
        return new ConcurrentBag<int>(pendingLineNumbers);
    }
}
/// <summary>
/// Loads the key/id pairs of a dataset via <c>GetKeys</c> and exposes them as a
/// concurrent dictionary mapping key to id.
/// </summary>
/// <param name="datasetId">Id of the dataset whose keys are loaded.</param>
/// <returns>
/// A dictionary of key to id. When the same key appears more than once, the first
/// occurrence is kept (later ones are rejected by <c>TryAdd</c>).
/// </returns>
public ConcurrentDictionary<string, int> GetAsConcurrent(int datasetId)
{
    ConcurrentDictionary<string, int> idsByKey = new ConcurrentDictionary<string, int>();

    using (IEdgarDatasetsRepository repo = new EdgarRepository())
    {
        IList<EdgarTuple> pairs = GetKeys(repo, datasetId);
        foreach (EdgarTuple pair in pairs)
        {
            idsByKey.TryAdd(pair.Key, pair.Id);
        }
    }

    return idsByKey;
}
/// <summary>
/// Compares the number of rows stored for the dataset with the dataset's
/// "Processed{fieldToUpdate}" and "Total{fieldToUpdate}" properties (read by
/// reflection) to decide whether the file has already been fully processed.
/// </summary>
/// <param name="state">Task state carrying the dataset being processed.</param>
/// <param name="fieldToUpdate">Suffix appended to "Processed" and "Total" to pick the dataset properties.</param>
/// <param name="savedInDb">Receives the row count returned by the repository for the dataset.</param>
/// <returns>
/// <c>true</c> only when the stored count equals the processed counter, the processed
/// counter equals the total, and the total is not zero.
/// </returns>
private bool IsAlreadyProcessed(EdgarTaskState state, string fieldToUpdate, out int savedInDb)
{
    using (IEdgarDatasetsRepository repository = new EdgarRepository())
    {
        savedInDb = repository.GetCount<T>(state.Dataset.Id);

        string processedProperty = "Processed" + fieldToUpdate;
        int processed = (int)state.Dataset.GetType().GetProperty(processedProperty).GetValue(state.Dataset);

        // When the counter disagrees with the stored row count, the counter is synced.
        // The local "processed" value intentionally keeps the value read before the sync.
        bool countsMatch = savedInDb == processed;
        if (!countsMatch)
        {
            UpdateProcessedField(state, fieldToUpdate, savedInDb);
        }

        string totalProperty = "Total" + fieldToUpdate;
        int total = (int)state.Dataset.GetType().GetProperty(totalProperty).GetValue(state.Dataset);

        return countsMatch && processed == total && total != 0;
    }
}
/// <summary>
/// Parses the text of a master index file: skips everything up to and including the
/// dashed separator line, then parses the remaining lines in parallel with
/// <c>ParseMasterIndexLine</c>.
/// </summary>
/// <param name="content">Full text of the index file; lines are split on '\n'.</param>
/// <returns>
/// The parsed entries. They are collected in a <see cref="ConcurrentBag{T}"/> from
/// parallel partitions, so their order is not guaranteed. Empty when there are no
/// data lines after the separator.
/// </returns>
/// <exception cref="FormatException">The dashed separator line is not present in <paramref name="content"/>.</exception>
public List<IndexEntry> ParseMasterIndexFile(string content)
{
    const string HeaderSeparator = "--------------------------------------------------------------------------------";

    logger.Info("ParseMasterIndex - Init");
    List<string> lines = content.Split('\n').ToList();

    logger.Info("ParseMasterIndex - Finding when start the data");
    // Bounded search; the trailing '\r' left by CRLF content is ignored for the comparison only.
    int separatorIndex = lines.FindIndex(l => l.TrimEnd('\r') == HeaderSeparator);
    if (separatorIndex < 0)
    {
        throw new FormatException("ParseMasterIndex - separator line not found, the content is not a master index file");
    }

    int firstDataIndex = separatorIndex + 1;
    logger.Info($"ParseMasterIndex - Data start at line {firstDataIndex + 1}, getting lines to process");

    // The last element of the split is dropped, as before (it is the empty remainder
    // when the content ends with '\n').
    // NOTE(review): content without a trailing newline loses its last data line - confirm intended.
    int dataLineCount = lines.Count - (firstDataIndex + 1);
    if (dataLineCount <= 0)
    {
        // Partitioner.Create rejects an empty range, so there is nothing to parse here.
        logger.Info("ParseMasterIndex - End");
        return new List<IndexEntry>();
    }

    List<string> linesToProcess = lines.GetRange(firstDataIndex, dataLineCount);

    logger.Info($"ParseMasterIndex - Start partitioning");
    OrderablePartitioner<Tuple<int, int>> rangePartitioner = Partitioner.Create(0, linesToProcess.Count);
    ConcurrentBag<IndexEntry> indexEntries = new ConcurrentBag<IndexEntry>();

    // Loop over the partitions in parallel; each partition uses its own repository instance.
    Parallel.ForEach(rangePartitioner, (range, loopState) =>
    {
        using (IEdgarFilesRepository repo = new EdgarRepository())
        {
            logger.Info($"ParseMasterIndex - Processing range: from {range.Item1} to {range.Item2}");
            for (int j = range.Item1; j < range.Item2; j++)
            {
                // j is the zero-based position of the line within the data section.
                IndexEntry entry = ParseMasterIndexLine(linesToProcess[j], j, repo);
                indexEntries.Add(entry);
            }
        }
    });

    logger.Info($"ParseMasterIndex - Partitioning end");
    logger.Info("ParseMasterIndex - End");
    return indexEntries.ToList();
}
/// <summary>
/// Creates the database if needed, applies the case-sensitive collation and index
/// changes, runs the SQL scripts that create the stored procedures and helper
/// tables, and finally loads the initial data.
/// </summary>
/// <param name="context">EF context of the database to initialize.</param>
/// <remarks>
/// Any failure is logged as fatal and rethrown with its original stack trace.
/// </remarks>
public static void Initialize(EdgarContext context)
{
    try
    {
        log.Info("Starting initialization");
        context.Database.EnsureCreated();
        log.Info("Database is created");

        /*
         * Tags are case sensitive, example:
         * tag version custom abstract datatype iord crdr tlabel doc
         * EchostarXviMember 0001558370-16-009751 1 1 member Echostar Xvi [Member] Represents the information pertaining to satellites assets owned by the entity, EchoStar XVI.
         * EchoStarXVIMember 0001558370-16-009751 1 1 member Echo Star X V I [Member] Represents information pertaining to satellites assets leased from EchoStar, Echo Star XVI, which are accounted for as operating leases.
         *
         * And EF isn't case sensitive by default
         * https://stackoverflow.com/questions/3843060/linq-to-entities-case-sensitive-comparison
         * https://milinaudara.wordpress.com/2015/02/04/case-sensitive-search-using-entity-framework-with-custom-annotation/
         *
         * //this works for sql server and it has to be run before the index is created
         * ALTER TABLE EdgarDatasetsTag
         * ALTER COLUMN Tag VARCHAR(10)
         * COLLATE SQL_Latin1_General_CP1_CS_AS
         */
        //Solution
        //http://www.entityframeworktutorial.net/code-first/database-initialization-strategy-in-code-first.aspx
        context.Database.ExecuteSqlRaw("ALTER TABLE EdgarDatasetTags ALTER COLUMN Tag NVARCHAR(256) COLLATE SQL_Latin1_General_CP1_CS_AS NOT NULL");
        log.Info(" created");
        context.Database.ExecuteSqlRaw("ALTER TABLE EdgarDatasetTags ALTER COLUMN Version NVARCHAR(20) COLLATE SQL_Latin1_General_CP1_CS_AS NOT NULL");
        log.Info(" created");
        context.Database.ExecuteSqlRaw("CREATE UNIQUE INDEX IX_TagVersion ON EdgarDatasetTags (Tag, Version,DatasetId)");
        log.Info(" created");
        context.Database.ExecuteSqlRaw(GetTextScript("alter column ADSH.sql"));
        log.Info(" created");

        // Each name maps to the script "create <name>.sql"; the order of execution is preserved.
        string[] createScripts =
        {
            "GET_MISSING_LINE_NUMBERS",
            "SP_DISABLE_PRESENTATION_INDEXES",
            "SP_EDGARDATASETCALC_INSERT",
            "SP_EDGARDATASETDIMENSIONS_INSERT",
            "SP_EDGARDATASETNUMBER_INSERT",
            "SP_EDGARDATASETPRESENTATIONS_INSERT",
            "SP_EDGARDATASETRENDERS_INSERT",
            "SP_EDGARDATASETSUBMISSIONS_INSERT",
            "SP_EDGARDATASETTAGS_INSERT",
            "SP_EDGARDATASETTEXT_INSERT",
            "SP_GET_CALCULATIONS_KEYS",
            "SP_GET_DIMENSIONS_KEYS",
            "SP_GET_NUMBER_KEYS",
            "SP_GET_PRESENTATION_KEYS",
            "SP_GET_RENDER_KEYS",
            "SP_GET_SUBMISSIONS_KEYS",
            "SP_GET_TAGS_KEYS",
            "SP_GET_TEXT_KEYS",
        };
        foreach (string scriptName in createScripts)
        {
            context.Database.ExecuteSqlRaw(GetTextScript("create " + scriptName + ".sql"));
            log.Info(scriptName + " created");
        }

        context.Database.ExecuteSqlRaw(GetTextScript("create table LOG.sql"));
        log.Info("LOG created");
        context.Database.ExecuteSqlRaw(GetTextScript("create table numbers.sql"));
        log.Info("numbers created");

        // The same repository instance is used through its three interfaces.
        IEdgarRepository repo = new EdgarRepository(context);
        log.Info("Loading initial data");
        EdgarInitialLoader.LoadInitialData(repo);
        log.Info("SecForms and SICs loaded");
        EdgarInitialLoader.LoadInitialDatasets((IEdgarDatasetsRepository)repo);
        log.Info("Datasets loaded");
        EdgarInitialLoader.LoadInitialFullIndexes((IEdgarFilesRepository)repo);
        log.Info("Indexes loaded");
        log.Info("Seed end");
    }
    catch (Exception ex)
    {
        log.Fatal("Error seeding Datasets database: " + ex.Message, ex);
        // "throw;" keeps the original stack trace ("throw ex;" would reset it).
        throw;
    }
}
/// <summary>
/// Parses and stores the lines of <paramref name="allLines"/> whose indexes fall in
/// <paramref name="range"/>, using a single repository for the whole range.
/// </summary>
/// <param name="fileToProcess">Name of the file being processed; used in log messages and exceptions.</param>
/// <param name="state">Task state carrying the dataset the rows belong to.</param>
/// <param name="range">Index range to process: Item1 inclusive, Item2 exclusive.</param>
/// <param name="allLines">All lines of the file, indexed by the values in <paramref name="range"/>.</param>
/// <param name="header">Tab-separated header line with the field names.</param>
/// <param name="missing">
/// Line numbers (index + 1) still to be processed; when <c>null</c>, every line in the range is processed.
/// </param>
/// <param name="failedLines">Receives the line number and text of each line that failed.</param>
/// <exception cref="EdgarDatasetException">More than <c>MaxErrorsAllowed</c> lines failed in this range.</exception>
protected void ProcessRange(string fileToProcess, EdgarTaskState state, Tuple<int, int> range, string[] allLines, string header, ConcurrentBag<int> missing, ConcurrentDictionary<int, string> failedLines)
{
    Stopwatch watch = Stopwatch.StartNew();
    string rangeMsg = "Datasetid " + state.Dataset.Id.ToString() + " -- " + fileToProcess + " -- range: " + range.Item1 + " to " + range.Item2;
    Log.Info(rangeMsg + " -- BEGIN");

    /*
     * EF isn't thread safe and it doesn't allow parallel
     * https://stackoverflow.com/questions/12827599/parallel-doesnt-work-with-entity-framework
     * https://stackoverflow.com/questions/9099359/entity-framework-and-multi-threading
     * https://social.msdn.microsoft.com/Forums/en-US/e5cb847c-1d77-4cd0-abb7-b61890d99fae/multithreading-and-the-entity-framework?forum=adodotnetentityframework
     * solution: only 1 context for the entire partition --> works
     */
    using (IEdgarDatasetsRepository repo = new EdgarRepository())
    {
        //It improves performance
        //https://msdn.microsoft.com/en-us/library/jj556205(v=vs.113).aspx
        repo.ContextConfigurationAutoDetectChangesEnabled = false;
        try
        {
            List<string> fieldNames = header.Split('\t').ToList();
            List<Exception> exceptions = new List<Exception>();

            for (int i = range.Item1; i < range.Item2; i++)
            {
                //i+1: indexes starts with 0 but header is line 1 and the first row is line 2
                int lineNumber = i + 1;
                // Reset per iteration so a failure is never reported with the previous line's text.
                string line = null;
                try
                {
                    //It will be processed if:
                    //it's the first time (missing == null)
                    //or it's processed again and line wasn't processed the first time (missing.Contains(i+1))
                    if (missing == null || missing.Contains(lineNumber))
                    {
                        // Read the line before logging it, otherwise the log shows the previous line.
                        line = allLines[i];
                        Log.Debug(rangeMsg + " -- parsing[" + i.ToString() + "]: " + line);

                        //files with error lines has an empty line for processed lines
                        if (!string.IsNullOrEmpty(line))
                        {
                            List<string> fields = line.Split('\t').ToList();
                            T file = Parse(repo, fieldNames, fields, lineNumber);
                            Add(repo, state.Dataset, file);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // A failing line is recorded and skipped; the range only aborts once
                    // the number of failures exceeds MaxErrorsAllowed.
                    EdgarLineException elex = new EdgarLineException(fileToProcess, lineNumber, ex);
                    exceptions.Add(elex);
                    failedLines.TryAdd(lineNumber, line);
                    Log.Error(rangeMsg + " -- line[" + lineNumber.ToString() + "]: " + line);
                    Log.Error(rangeMsg + " -- line[" + lineNumber.ToString() + "]: " + ex.Message, elex);
                    if (exceptions.Count > MaxErrorsAllowed)
                    {
                        Log.Fatal(rangeMsg + " -- line[" + i.ToString() + "]: max errors allowed reached", ex);
                        throw new EdgarDatasetException(fileToProcess, exceptions);
                    }
                }
            }
        }
        finally
        {
            repo.ContextConfigurationAutoDetectChangesEnabled = true;
        }
    }

    watch.Stop();
    TimeSpan ts = watch.Elapsed;
    // hh:mm:ss.cc, where cc is hundredths of a second
    string elapsedTime = String.Format("{0:00}:{1:00}:{2:00}.{3:00}", ts.Hours, ts.Minutes, ts.Seconds, ts.Milliseconds / 10);
    Log.Info(rangeMsg + " -- END");
    Log.Info(rangeMsg + " -- time: " + elapsedTime);
}