/// <summary>
/// Iterates over each equity directory and aggregates the data into the coarse file
/// </summary>
/// <param name="dataDirectory">The Lean /Data directory</param>
/// <param name="ignoreMaplessSymbols">Ignore symbols without a QuantQuote map file.</param>
/// <param name="startDate">Optional start date; when null, processing resumes from the last date already present in each coarse folder</param>
/// <returns>Lazily yields the path of each coarse file produced</returns>
public static IEnumerable<string> ProcessEquityDirectories(string dataDirectory, bool ignoreMaplessSymbols, DateTime? startDate)
{
    var exclusions = ReadExclusionsFile(ExclusionsFile);
    var equity = Path.Combine(dataDirectory, "equity");

    // The factor file provider does not depend on the market directory being
    // processed, so create it once instead of once per iteration.
    var factorFileProvider = new LocalDiskFactorFileProvider();

    foreach (var directory in Directory.EnumerateDirectories(equity))
    {
        var dailyFolder = Path.Combine(directory, "daily");
        var mapFileFolder = Path.Combine(directory, "map_files");
        var coarseFolder = Path.Combine(directory, "fundamental", "coarse");
        if (!Directory.Exists(coarseFolder))
        {
            Directory.CreateDirectory(coarseFolder);
        }

        // Resume from the caller-provided date, or from what is already on disk
        var lastProcessedDate = startDate ?? GetLastProcessedDate(coarseFolder);
        var files = ProcessDailyFolder(dailyFolder, coarseFolder, MapFileResolver.Create(mapFileFolder), factorFileProvider, exclusions, ignoreMaplessSymbols, lastProcessedDate);
        foreach (var file in files)
        {
            yield return file;
        }
    }
}
/// <summary>
/// Creates a new earnings downloader that writes its output to the
/// "earnings" sub-folder of the given destination folder
/// </summary>
/// <param name="destinationFolder">Root folder under which the "earnings" output directory is created</param>
public TradingEconomicsEarningsDownloader(string destinationFolder)
{
    // Download window: earnings history is requested from the start of 1998 until now.
    // NOTE(review): DateTime.Now is local time — confirm UTC is not required here
    _fromDate = new DateTime(1998, 1, 1);
    _toDate = DateTime.Now;

    // Throttle outbound requests to at most one per second
    _requestGate = new RateGate(1, TimeSpan.FromSeconds(1));

    // Resolve US equity map files from the configured data folder
    _mapFileResolver = MapFileResolver.Create(Globals.DataFolder, Market.USA);

    _destinationFolder = Path.Combine(destinationFolder, "earnings");
    Directory.CreateDirectory(_destinationFolder);
}
/// <summary>
/// Iterates over each equity directory and aggregates the data into the coarse file
/// </summary>
/// <param name="dataDirectory">The Lean /Data directory</param>
/// <param name="ignoreMaplessSymbols">Ignore symbols without a QuantQuote map file.</param>
public static void ProcessEquityDirectories(string dataDirectory, bool ignoreMaplessSymbols)
{
    var exclusions = ReadExclusionsFile(ExclusionsFile);
    var equityRoot = Path.Combine(dataDirectory, "equity");

    foreach (var marketDirectory in Directory.EnumerateDirectories(equityRoot))
    {
        var coarseFolder = Path.Combine(marketDirectory, "fundamental", "coarse");

        // Make sure the coarse output folder exists before aggregating
        if (Directory.Exists(coarseFolder) == false)
        {
            Directory.CreateDirectory(coarseFolder);
        }

        var resolver = MapFileResolver.Create(Path.Combine(marketDirectory, "map_files"));
        ProcessDailyFolder(Path.Combine(marketDirectory, "daily"), coarseFolder, resolver, exclusions, ignoreMaplessSymbols);
    }
}
/// <summary>
/// Verifies the option chain universe enumerator emits only valid data over the
/// requested range: every emitted item is an OptionChainUniverseDataCollection
/// with a non-null underlying and a non-empty option chain, and the total count
/// is exactly one bar per market minute.
/// </summary>
public void DoesNotEmitInvalidData()
{
    // Friday 2014-06-06 through Monday 2014-06-09 => two full trading days in range
    var startTime = new DateTime(2014, 06, 06, 0, 0, 0);
    var endTime = new DateTime(2014, 06, 09, 20, 0, 0);

    var canonicalSymbol = Symbol.Create("AAPL", SecurityType.Option, Market.USA, "?AAPL");

    var quoteCurrency = new Cash(Currencies.USD, 0, 1);
    var exchangeHours = MarketHoursDatabase.FromDataFolder().GetExchangeHours(Market.USA, canonicalSymbol, SecurityType.Option);
    var config = new SubscriptionDataConfig(
        typeof(ZipEntryName),
        canonicalSymbol,
        Resolution.Minute,
        TimeZones.Utc,
        TimeZones.NewYork,
        true,
        false,
        false,
        false,
        TickType.Quote,
        false,
        DataNormalizationMode.Raw
    );
    var option = new Option(
        canonicalSymbol,
        exchangeHours,
        quoteCurrency,
        new OptionSymbolProperties(SymbolProperties.GetDefault(Currencies.USD)),
        ErrorCurrencyConverter.Instance,
        RegisteredSecurityDataTypesProvider.Null,
        new SecurityCache()
    );

    var fillForwardResolution = Ref.CreateReadOnly(() => Resolution.Minute.ToTimeSpan());
    // Aggregates the underlying stream into collections per symbol, then
    // fill-forwards so every market minute in [startTime, endTime] has a bar
    Func<SubscriptionRequest, IEnumerator<BaseData>, IEnumerator<BaseData>> underlyingEnumeratorFunc =
        (req, input) =>
        {
            input = new BaseDataCollectionAggregatorEnumerator(input, req.Configuration.Symbol);
            return (new FillForwardEnumerator(
                input,
                option.Exchange,
                fillForwardResolution,
                false,
                endTime,
                Resolution.Minute.ToTimeSpan(),
                TimeZones.Utc,
                startTime));
        };
    var factory = new OptionChainUniverseSubscriptionEnumeratorFactory(underlyingEnumeratorFunc, MapFileResolver.Create(Globals.DataFolder, Market.USA), new LocalDiskFactorFileProvider(new LocalDiskMapFileProvider()));

    var request = new SubscriptionRequest(true, null, option, config, startTime, endTime);
    var enumerator = factory.CreateEnumerator(request, new DefaultDataProvider());

    // Every emitted item must be a chain collection with an underlying and data
    var emittedCount = 0;
    foreach (var data in enumerator.AsEnumerable())
    {
        emittedCount++;
        var optionData = data as OptionChainUniverseDataCollection;
        Assert.IsNotNull(optionData);
        Assert.IsNotNull(optionData.Underlying);
        Assert.AreNotEqual(0, optionData.Data.Count);
    }

    // 9:30 to 15:59 -> 6.5 hours * 60 => 390 minutes * 2 days = 780
    Assert.AreEqual(780, emittedCount);
}
/// <summary>
/// Converts the data from raw format (*.nc.tar.gz) to json files consumable by LEAN
/// </summary>
/// <param name="processingDate">Date to process SEC filings for</param>
public void Process(DateTime processingDate)
{
    // Process data into dictionary of CIK -> List{T} of tickers
    foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings.txt")))
    {
        var tickerCik = line.Split('\t');
        var ticker = tickerCik[0];
        // tickerCik[0] = symbol, tickerCik[1] = CIK
        // Note that SEC tickers come in lowercase, so we don't have to alter the ticker
        var cikFormatted = tickerCik[1].PadLeft(10, '0');

        List<string> symbol;
        if (!CikTicker.TryGetValue(cikFormatted, out symbol))
        {
            symbol = new List<string>();
            CikTicker[cikFormatted] = symbol;
        }

        // SEC data list contains a null value in the ticker.txt file
        if (!string.IsNullOrWhiteSpace(ticker))
        {
            symbol.Add(ticker);
        }
    }

    // Merge both data sources to a single CIK -> List{T} of tickers
    foreach (var line in File.ReadLines(Path.Combine(RawSource, "cik-ticker-mappings-rankandfile.txt")))
    {
        var tickerInfo = line.Split('|');
        var companyCik = tickerInfo[0].PadLeft(10, '0');
        var companyTicker = tickerInfo[1].ToLower();

        List<string> symbol;
        if (!CikTicker.TryGetValue(companyCik, out symbol))
        {
            symbol = new List<string>();
            CikTicker[companyCik] = symbol;
        }

        // Skip duplicates and guard against malformed (blank) tickers
        if (!symbol.Contains(companyTicker) && !string.IsNullOrWhiteSpace(companyTicker))
        {
            symbol.Add(companyTicker);
        }
    }

    var formattedDate = processingDate.ToString(DateFormat.EightCharacter);
    var remoteRawData = new FileInfo(Path.Combine(RawSource, $"{formattedDate}.nc.tar.gz"));
    if (!remoteRawData.Exists)
    {
        // A missing archive is expected on market holidays; anything else is fatal
        if (Holidays.Contains(processingDate) || USHoliday.Dates.Contains(processingDate))
        {
            Log.Trace("SECDataConverter.Process(): File is missing, but we expected it to be missing. Nothing to do.");
            return;
        }
        throw new Exception($"SECDataConverter.Process(): Raw data {remoteRawData} not found. No processing can be done.");
    }

    // Copy the raw data to a temp path on disk
    Log.Trace($"SECDataConverter.Process(): Copying raw data locally...");
    var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
    var localRawData = remoteRawData.CopyTo(tempPath);
    Log.Trace($"SECDataConverter.Process(): Copied raw data from {remoteRawData.FullName} - to: {tempPath}");

    Log.Trace($"SECDataConverter.Process(): Start processing...");

    var mapFileResolver = MapFileResolver.Create(Globals.DataFolder, Market.USA);

    var ncFilesRead = 0;
    var startingTime = DateTime.Now;
    // NOTE(review): loopStartingTime is read and reset from multiple parallel
    // workers without synchronization, so the files/min figure is approximate
    var loopStartingTime = startingTime;
    // For the meantime, let's only process .nc files, and deal with correction files later.
    Parallel.ForEach(
        Compression.UnTar(localRawData.OpenRead(), isTarGz: true).Where(kvp => kvp.Key.EndsWith(".nc")),
        new ParallelOptions { MaxDegreeOfParallelism = Environment.ProcessorCount / 2 },
        rawReportFilePath =>
        {
            var factory = new SECReportFactory();
            var xmlText = new StringBuilder();

            // We need to escape any nested XML to ensure our deserialization happens smoothly
            var parsingText = false;

            // SEC data is line separated by UNIX style line endings. No need to worry about a carriage line here.
            foreach (var line in Encoding.UTF8.GetString(rawReportFilePath.Value).Split('\n'))
            {
                var newTextLine = line;
                var currentTagName = GetTagNameFromLine(newTextLine);

                // This tag is present rarely in SEC reports, but is unclosed without value when encountered.
                // Verified by searching with ripgrep for "CONFIRMING-COPY"
                //
                // Sometimes, ASSIGNED-SIC contains no value and causes errors. Check to make sure that when
                // we encounter that tag we check if it has a value.
                //
                // "Appearance of the <FLAWED> tag in
                // an EX-27 document header signals unreliable tagging within the
                // following document text stream; however, in the absence of a
                // <FLAWED> tag, tagging is still not guaranteed to be complete
                // because of allowance in the financial data specifications for
                // omitted tags when the submission also includes a financial data
                // schedule of article type CT."
                if (currentTagName == "CONFIRMING-COPY" || (currentTagName == "ASSIGNED-SIC" && !HasValue(line)) || currentTagName == "FLAWED")
                {
                    continue;
                }

                // Indicates that the form is a paper submission and that the current file has no contents
                if (currentTagName == "PAPER")
                {
                    continue;
                }

                // Don't encode the closing tag
                if (currentTagName == "/TEXT")
                {
                    parsingText = false;
                }

                // To ensure that we can serialize/deserialize data with hours, minutes, seconds
                if (currentTagName == "FILING-DATE" || currentTagName == "PERIOD" || currentTagName == "DATE-OF-FILING-CHANGE" || currentTagName == "DATE-CHANGED")
                {
                    newTextLine = $"{newTextLine.TrimEnd()} 00:00:00";
                }

                // Encode all contents inside tags to prevent errors in XML parsing.
                // The json deserializer will convert these values back to their original form
                if (!parsingText && HasValue(newTextLine))
                {
                    newTextLine = $"<{currentTagName}>{SecurityElement.Escape(GetTagValueFromLine(newTextLine))}</{currentTagName}>";
                }
                // Escape all contents inside TEXT tags
                else if (parsingText)
                {
                    newTextLine = SecurityElement.Escape(newTextLine);
                }

                // Don't encode the opening tag
                if (currentTagName == "TEXT")
                {
                    parsingText = true;
                }

                xmlText.AppendLine(newTextLine);
            }

            // Periodically report the processing rate
            var counter = Interlocked.Increment(ref ncFilesRead);
            if (counter % 100 == 0)
            {
                var interval = DateTime.Now - loopStartingTime;
                Log.Trace($"SECDataConverter.Process(): {counter} nc files read at {100 / interval.TotalMinutes:N2} files/min.");
                loopStartingTime = DateTime.Now;
            }

            ISECReport report;
            try
            {
                report = factory.CreateSECReport(xmlText.ToString());
            }
            // Ignore unsupported form types for now
            catch (DataException)
            {
                return;
            }
            catch (XmlException e)
            {
                Log.Error(e, $"SECDataConverter.Process(): Failed to parse XML from file: {rawReportFilePath.Key}");
                return;
            }
            catch (Exception e)
            {
                Log.Error(e, "SECDataConverter.Process(): Unknown error encountered");
                return;
            }

            // First filer listed in SEC report is usually the company listed on stock exchanges
            var companyCik = report.Report.Filers.First().CompanyData.Cik;

            // Some companies can operate under two tickers, but have the same CIK.
            // Don't bother continuing if we don't find any tickers for the given CIK
            List<string> tickers;
            if (!CikTicker.TryGetValue(companyCik, out tickers))
            {
                return;
            }

            if (!File.Exists(Path.Combine(RawSource, "indexes", $"{companyCik}.json")))
            {
                Log.Error($"SECDataConverter.Process(): {report.Report.FilingDate:yyyy-MM-dd}:{rawReportFilePath.Key} - Failed to find index file for ticker {tickers.FirstOrDefault()} with CIK: {companyCik}");
                return;
            }

            try
            {
                // The index file can potentially be corrupted
                GetPublicationDate(report, companyCik);
            }
            catch (Exception e)
            {
                Log.Error(e, $"SECDataConverter.Process(): {report.Report.FilingDate:yyyy-MM-dd}:{rawReportFilePath.Key} - Index file loading failed for ticker: {tickers.FirstOrDefault()} with CIK: {companyCik} even though it exists");
            }

            // If the equity does not resolve to a map file, or it is not found
            // in the map files, we skip writing it.
            foreach (var ticker in tickers)
            {
                var tickerMapFile = mapFileResolver.ResolveMapFile(ticker, processingDate);
                if (!tickerMapFile.Any())
                {
                    Log.Trace($"SECDataConverter.Process(): {processingDate} - Failed to find map file for ticker: {ticker}");
                    continue;
                }

                // Map the current ticker to the ticker it was in the past using the map file system
                var mappedTicker = tickerMapFile.GetMappedSymbol(processingDate);

                // If no suitable date is found for the symbol in the map file, we skip writing the data
                if (string.IsNullOrEmpty(mappedTicker))
                {
                    Log.Trace($"SECDataConverter.Process(): {processingDate} - Failed to find mapped symbol for ticker: {ticker}");
                    continue;
                }

                // Reports is keyed by mapped ticker, then by filing date
                var tickerReports = Reports.GetOrAdd(
                    mappedTicker,
                    _ => new ConcurrentDictionary<DateTime, List<ISECReport>>()
                );
                var reports = tickerReports.GetOrAdd(
                    report.Report.FilingDate.Date,
                    _ => new List<ISECReport>()
                );

                reports.Add(report);
            }
        }
    );

    Log.Trace($"SECDataConverter.Process(): {ncFilesRead} nc files read finished in {DateTime.Now - startingTime:g}.");

    // Write out only the reports filed on the processing date, per ticker
    Parallel.ForEach(
        Reports.Keys,
        ticker =>
        {
            List<ISECReport> reports;
            if (!Reports[ticker].TryRemove(processingDate, out reports))
            {
                return;
            }

            WriteReport(reports, ticker);
        }
    );

    // Delete the raw data we copied to the temp folder
    File.Delete(tempPath);
}